release: prepare for 3.0.1 by hagitsegev

mutlishard_mutation_query(): use correct reader concurrency semaphore
The multishard mutation query used the semaphore obtained from `database::user_read_concurrency_sem()` to pause-resume shard readers. This presented a problem when `multishard_mutation_query()` was reading from system tables. In this case the readers themselves would obtain their permits from the system read concurrency semaphore. Since the pausing of shard readers used the user read semaphore, pausing failed to fulfill its objective of alleviating pressure on the semaphore the reads obtained their permits from. In some cases this lead to a deadlock during system reads. To ensure the correct semaphore is used for pausing-resuming readers, obtain the semaphore from the `table` object. To avoid looking up the table on every pause or resume call, cache the semaphores when readers are created. Fixes: #4096 Signed-off-by: Botond Dénes <bdenes@scylladb.com> Message-Id: <c784a3cd525ce29642d7216fbe92638fa7884e88.1547729119.git.bdenes@scylladb.com> (cherry picked from commit 4537ec7426)
2019-01-20 12:42:00 +02:00 · 2019-01-17 18:08:01 +02:00 · 2019-01-17 18:07:41 +02:00 · 2019-01-15 21:16:13 +02:00 · 2019-01-14 16:59:40 +02:00 · 2019-01-12 22:24:08 +02:00
338 changed files with 11090 additions and 4892 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
@@ -9,3 +9,6 @@
 [submodule "xxHash"]
 	path = xxHash
 	url = ../xxHash
+[submodule "libdeflate"]
+	path = libdeflate
+	url = ../libdeflate
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -138,4 +138,5 @@ target_include_directories(scylla PUBLIC
        ${SEASTAR_INCLUDE_DIRS}
        ${Boost_INCLUDE_DIRS}
        xxhash
+        libdeflate
        build/release/gen)
--- a/README.md
+++ b/README.md
@@ -50,12 +50,12 @@ Then, to build an RPM, run:
 ./dist/redhat/build_rpm.sh
 ```

-The built RPM is stored in ``/var/lib/mock/<configuration>/result`` directory.
+The built RPM is stored in the ``build/mock/<configuration>/result`` directory.
 For example, on Fedora 21 mock reports the following:

 ```
 INFO: Done(scylla-server-0.00-1.fc21.src.rpm) Config(default) 20 minutes 7 seconds
-INFO: Results and/or logs in: /var/lib/mock/fedora-21-x86_64/result
+INFO: Results and/or logs in: build/mock/fedora-21-x86_64/result
 ```

 ## Building Fedora-based Docker image
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 #!/bin/sh

-VERSION=666.development
+VERSION=3.0.1

 if test -f version
 then
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -2228,11 +2228,11 @@
               "description":"The column family"
            },
            "total":{
-               "type":"int",
+               "type":"long",
               "description":"The total snapshot size"
            },
            "live":{
-               "type":"int",
+               "type":"long",
               "description":"The live snapshot size"
            }
         }
--- a/auth/common.cc
+++ b/auth/common.cc
@@ -87,11 +87,17 @@ future<> create_metadata_table_if_missing(
    return mm.announce_new_column_family(b.build(), false);
 }

-future<> wait_for_schema_agreement(::service::migration_manager& mm, const database& db) {
+future<> wait_for_schema_agreement(::service::migration_manager& mm, const database& db, seastar::abort_source& as) {
    static const auto pause = [] { return sleep(std::chrono::milliseconds(500)); };

-    return do_until([&db] { return db.get_version() != database::empty_version; }, pause).then([&mm] {
-        return do_until([&mm] { return mm.have_schema_agreement(); }, pause);
+    return do_until([&db, &as] {
+        as.check();
+        return db.get_version() != database::empty_version;
+    }, pause).then([&mm, &as] {
+        return do_until([&mm, &as] {
+            as.check();
+            return mm.have_schema_agreement();
+        }, pause);
    });
 }

--- a/auth/common.hh
+++ b/auth/common.hh
@@ -81,7 +81,7 @@ future<> create_metadata_table_if_missing(
        stdx::string_view cql,
        ::service::migration_manager&);

-future<> wait_for_schema_agreement(::service::migration_manager&, const database&);
+future<> wait_for_schema_agreement(::service::migration_manager&, const database&, seastar::abort_source&);

 ///
 /// Time-outs for internal, non-local CQL queries.
--- a/auth/default_authorizer.cc
+++ b/auth/default_authorizer.cc
@@ -160,7 +160,7 @@ future<> default_authorizer::start() {
                _migration_manager).then([this] {
            _finished = do_after_system_ready(_as, [this] {
                return async([this] {
-                    wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
+                    wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();

                    if (legacy_metadata_exists()) {
                        if (!any_granted().get0()) {
@@ -178,7 +178,7 @@ future<> default_authorizer::start() {

 future<> default_authorizer::stop() {
    _as.request_abort();
-    return _finished.handle_exception_type([](const sleep_aborted&) {});
+    return _finished.handle_exception_type([](const sleep_aborted&) {}).handle_exception_type([](const abort_requested_exception&) {});
 }

 future<permission_set>
--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -157,7 +157,7 @@ future<> password_authenticator::start() {

         _stopped = do_after_system_ready(_as, [this] {
             return async([this] {
-                 wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
+                 wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();

                 if (any_nondefault_role_row_satisfies(_qp, &has_salted_hash).get0()) {
                     if (legacy_metadata_exists()) {
@@ -182,7 +182,7 @@ future<> password_authenticator::start() {

 future<> password_authenticator::stop() {
    _as.request_abort();
-    return _stopped.handle_exception_type([] (const sleep_aborted&) { });
+    return _stopped.handle_exception_type([] (const sleep_aborted&) { }).handle_exception_type([](const abort_requested_exception&) {});
 }

 db::consistency_level password_authenticator::consistency_for_user(stdx::string_view role_name) {
--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -227,7 +227,7 @@ future<> standard_role_manager::start() {
        return this->create_metadata_tables_if_missing().then([this] {
            _stopped = auth::do_after_system_ready(_as, [this] {
                return seastar::async([this] {
-                    wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
+                    wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();

                    if (any_nondefault_role_row_satisfies(_qp, &has_can_login).get0()) {
                        if (this->legacy_metadata_exists()) {
@@ -251,7 +251,7 @@ future<> standard_role_manager::start() {

 future<> standard_role_manager::stop() {
    _as.request_abort();
-    return _stopped.handle_exception_type([] (const sleep_aborted&) { });
+    return _stopped.handle_exception_type([] (const sleep_aborted&) { }).handle_exception_type([](const abort_requested_exception&) {});;
 }

 future<> standard_role_manager::create_or_replace(stdx::string_view role_name, const role_config& c) const {
--- a/backlog_controller.hh
+++ b/backlog_controller.hh
@@ -77,7 +77,7 @@ protected:
        , _io_priority(iop)
        , _interval(interval)
        , _update_timer([this] { adjust(); })
-        , _control_points({{0,0}})
+        , _control_points()
        , _current_backlog(std::move(backlog))
        , _inflight_update(make_ready_future<>())
    {
@@ -125,7 +125,7 @@ public:
    flush_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, float static_shares) : backlog_controller(sg, iop, static_shares) {}
    flush_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, std::chrono::milliseconds interval, float soft_limit, std::function<float()> current_dirty)
        : backlog_controller(sg, iop, std::move(interval),
-          std::vector<backlog_controller::control_point>({{soft_limit, 10}, {soft_limit + (hard_dirty_limit - soft_limit) / 2, 200} , {hard_dirty_limit, 1000}}),
+          std::vector<backlog_controller::control_point>({{0.0, 0.0}, {soft_limit, 10}, {soft_limit + (hard_dirty_limit - soft_limit) / 2, 200} , {hard_dirty_limit, 1000}}),
          std::move(current_dirty)
        )
    {}
@@ -139,7 +139,7 @@ public:
    compaction_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, float static_shares) : backlog_controller(sg, iop, static_shares) {}
    compaction_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, std::chrono::milliseconds interval, std::function<float()> current_backlog)
        : backlog_controller(sg, iop, std::move(interval),
-          std::vector<backlog_controller::control_point>({{0.5, 10}, {1.5, 100} , {normalization_factor, 1000}}),
+          std::vector<backlog_controller::control_point>({{0.0, 50}, {1.5, 100} , {normalization_factor, 1000}}),
          std::move(current_backlog)
        )
    {}
--- a/bytes_ostream.hh
+++ b/bytes_ostream.hh
@@ -57,12 +57,12 @@ private:
        value_type data[0];
        void operator delete(void* ptr) { free(ptr); }
    };
-    // FIXME: consider increasing chunk size as the buffer grows
-    static constexpr size_type chunk_size{512};
+    static constexpr size_type default_chunk_size{512};
 private:
    std::unique_ptr<chunk> _begin;
    chunk* _current;
    size_type _size;
+    size_type _initial_chunk_size = default_chunk_size;
 public:
    class fragment_iterator : public std::iterator<std::input_iterator_tag, bytes_view> {
        chunk* _current = nullptr;
@@ -102,13 +102,13 @@ private:
    }
    // Figure out next chunk size.
    //   - must be enough for data_size
-    //   - must be at least chunk_size
+    //   - must be at least _initial_chunk_size
    //   - try to double each time to prevent too many allocations
    //   - do not exceed max_chunk_size
    size_type next_alloc_size(size_t data_size) const {
        auto next_size = _current
                ? _current->size * 2
-                : chunk_size;
+                : _initial_chunk_size;
        next_size = std::min(next_size, max_chunk_size());
        // FIXME: check for overflow?
        return std::max<size_type>(next_size, data_size + sizeof(chunk));
@@ -116,13 +116,19 @@ private:
    // Makes room for a contiguous region of given size.
    // The region is accounted for as already written.
    // size must not be zero.
+    [[gnu::always_inline]]
    value_type* alloc(size_type size) {
-        if (size <= current_space_left()) {
+        if (__builtin_expect(size <= current_space_left(), true)) {
            auto ret = _current->data + _current->offset;
            _current->offset += size;
            _size += size;
            return ret;
        } else {
+            return alloc_new(size);
+        }
+    }
+    [[gnu::noinline]]
+    value_type* alloc_new(size_type size) {
            auto alloc_size = next_alloc_size(size);
            auto space = malloc(alloc_size);
            if (!space) {
@@ -140,19 +146,22 @@ private:
            }
            _size += size;
            return _current->data;
-        };
    }
 public:
-    bytes_ostream() noexcept
+    explicit bytes_ostream(size_t initial_chunk_size) noexcept
        : _begin()
        , _current(nullptr)
        , _size(0)
+        , _initial_chunk_size(initial_chunk_size)
    { }

+    bytes_ostream() noexcept : bytes_ostream(default_chunk_size) {}
+
    bytes_ostream(bytes_ostream&& o) noexcept
        : _begin(std::move(o._begin))
        , _current(o._current)
        , _size(o._size)
+        , _initial_chunk_size(o._initial_chunk_size)
    {
        o._current = nullptr;
        o._size = 0;
@@ -162,6 +171,7 @@ public:
        : _begin()
        , _current(nullptr)
        , _size(0)
+        , _initial_chunk_size(o._initial_chunk_size)
    {
        append(o);
    }
@@ -199,18 +209,20 @@ public:
        return place_holder<T>{alloc(sizeof(T))};
    }

+    [[gnu::always_inline]]
    value_type* write_place_holder(size_type size) {
        return alloc(size);
    }

    // Writes given sequence of bytes
+    [[gnu::always_inline]]
    inline void write(bytes_view v) {
        if (v.empty()) {
            return;
        }

        auto this_size = std::min(v.size(), size_t(current_space_left()));
-        if (this_size) {
+        if (__builtin_expect(this_size, true)) {
            memcpy(_current->data + _current->offset, v.begin(), this_size);
            _current->offset += this_size;
            _size += this_size;
@@ -219,11 +231,12 @@ public:

        while (!v.empty()) {
            auto this_size = std::min(v.size(), size_t(max_chunk_size()));
-            std::copy_n(v.begin(), this_size, alloc(this_size));
+            std::copy_n(v.begin(), this_size, alloc_new(this_size));
            v.remove_prefix(this_size);
        }
    }

+    [[gnu::always_inline]]
    void write(const char* ptr, size_t size) {
        write(bytes_view(reinterpret_cast<const signed char*>(ptr), size));
    }
@@ -393,6 +406,21 @@ public:
    bool operator!=(const bytes_ostream& other) const {
        return !(*this == other);
    }
+
+    // Makes this instance empty.
+    //
+    // The first buffer is not deallocated, so callers may rely on the
+    // fact that if they write less than the initial chunk size between
+    // the clear() calls then writes will not involve any memory allocations,
+    // except for the first write made on this instance.
+    void clear() {
+        if (_begin) {
+            _begin->offset = 0;
+            _size = 0;
+            _current = _begin.get();
+            _begin->next.reset();
+        }
+    }
 };

 template<>
--- a/clustering_ranges_walker.hh
+++ b/clustering_ranges_walker.hh
@@ -200,8 +200,9 @@ public:
        return _current_start;
    }

-    position_in_partition_view upper_bound() const {
-        return _current_end;
+    // Returns the upper bound of the last range in provided ranges set
+    position_in_partition_view uppermost_bound() const {
+        return position_in_partition_view::for_range_end(_ranges.back());
    }

    // When lower_bound() changes, this also does
--- a/compress.cc
+++ b/compress.cc
@@ -112,7 +112,7 @@ const sstring compression_parameters::CHUNK_LENGTH_KB = "chunk_length_kb";
 const sstring compression_parameters::CRC_CHECK_CHANCE = "crc_check_chance";

 compression_parameters::compression_parameters()
-    : compression_parameters(nullptr)
+    : compression_parameters(compressor::lz4)
 {}

 compression_parameters::~compression_parameters()
--- a/compress.hh
+++ b/compress.hh
@@ -118,6 +118,10 @@ public:
    std::map<sstring, sstring> get_options() const;
    bool operator==(const compression_parameters& other) const;
    bool operator!=(const compression_parameters& other) const;
+
+    static compression_parameters no_compression() {
+        return compression_parameters(nullptr);
+    }
 private:
    void validate_options(const std::map<sstring, sstring>&);
 };
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -242,6 +242,9 @@ batch_size_fail_threshold_in_kb: 50

 # The directory where hints files are stored if hinted handoff is enabled.
 # hints_directory: /var/lib/scylla/hints
+ 
+# The directory where hints files are stored for materialized-view updates
+# view_hints_directory: /var/lib/scylla/view_hints

 # See http://wiki.apache.org/cassandra/HintedHandoff
 # May either be "true" or "false" to enable globally, or contain a list
--- a/configure.py
+++ b/configure.py
@@ -197,7 +197,9 @@ class Thrift(object):

 def default_target_arch():
    if platform.machine() in ['i386', 'i686', 'x86_64']:
-        return 'nehalem'
+        return 'westmere'   # support PCLMUL
+    elif platform.machine() == 'aarch64':
+        return 'armv8-a+crc+crypto'
    else:
        return ''

@@ -271,6 +273,7 @@ scylla_tests = [
    'tests/perf/perf_sstable',
    'tests/cql_query_test',
    'tests/secondary_index_test',
+    'tests/filtering_test',
    'tests/storage_proxy_test',
    'tests/schema_change_test',
    'tests/mutation_reader_test',
@@ -306,6 +309,7 @@ scylla_tests = [
    'tests/log_heap_test',
    'tests/managed_vector_test',
    'tests/crc_test',
+    'tests/checksum_utils_test',
    'tests/flush_queue_test',
    'tests/dynamic_bitset_test',
    'tests/auth_test',
@@ -356,6 +360,7 @@ scylla_tests = [

 perf_tests = [
    'tests/perf/perf_mutation_readers',
+    'tests/perf/perf_checksum',
    'tests/perf/perf_mutation_fragment',
    'tests/perf/perf_idl',
 ]
@@ -431,6 +436,7 @@ extra_cxxflags = {}
 cassandra_interface = Thrift(source='interface/cassandra.thrift', service='Cassandra')

 scylla_core = (['database.cc',
+                'table.cc',
                'atomic_cell.cc',
                'schema.cc',
                'frozen_schema.cc',
@@ -461,6 +467,7 @@ scylla_core = (['database.cc',
                'compress.cc',
                'sstables/mp_row_consumer.cc',
                'sstables/sstables.cc',
+                'sstables/mc/writer.cc',
                'sstables/sstable_version.cc',
                'sstables/compress.cc',
                'sstables/row.cc',
@@ -470,7 +477,6 @@ scylla_core = (['database.cc',
                'sstables/compaction_manager.cc',
                'sstables/integrity_checked_file_impl.cc',
                'sstables/prepended_input_stream.cc',
-                'sstables/m_format_write_helpers.cc',
                'sstables/m_format_read_helpers.cc',
                'transport/event.cc',
                'transport/event_notifier.cc',
@@ -579,6 +585,7 @@ scylla_core = (['database.cc',
                'db/marshal/type_parser.cc',
                'db/batchlog_manager.cc',
                'db/view/view.cc',
+                'db/view/view_update_from_staging_generator.cc',
                'db/view/row_locking.cc',
                'index/secondary_index_manager.cc',
                'index/secondary_index.cc',
@@ -592,6 +599,7 @@ scylla_core = (['database.cc',
                'utils/managed_bytes.cc',
                'utils/exceptions.cc',
                'utils/config_file.cc',
+                'utils/gz/crc_combine.cc',
                'gms/version_generator.cc',
                'gms/versioned_value.cc',
                'gms/gossiper.cc',
@@ -682,6 +690,7 @@ scylla_core = (['database.cc',
                'data/cell.cc',
                'multishard_writer.cc',
                'multishard_mutation_query.cc',
+                'reader_concurrency_semaphore.cc',
                ] + [Antlr3Grammar('cql3/Cql.g')] + [Thrift('interface/cassandra.thrift', 'Cassandra')]
               )

@@ -744,6 +753,7 @@ idls = ['idl/gossip_digest.idl.hh',
        'idl/tracing.idl.hh',
        'idl/consistency_level.idl.hh',
        'idl/cache_temperature.idl.hh',
+        'idl/view.idl.hh',
        ]

 scylla_tests_dependencies = scylla_core + idls + [
@@ -773,6 +783,7 @@ pure_boost_tests = set([
    'tests/test-serialization',
    'tests/range_test',
    'tests/crc_test',
+    'tests/checksum_utils_test',
    'tests/managed_vector_test',
    'tests/dynamic_bitset_test',
    'tests/idl_test',
@@ -1001,6 +1012,8 @@ seastar_ldflags = args.user_ldflags
 seastar_flags += ['--compiler', args.cxx, '--c-compiler', args.cc, '--cflags=%s' % (seastar_cflags), '--ldflags=%s' % (seastar_ldflags),
                  '--c++-dialect=gnu++1z', '--optflags=%s' % (modes['release']['opt']), ]

+libdeflate_cflags = seastar_cflags
+
 status = subprocess.call([args.python, './configure.py'] + seastar_flags, cwd='seastar')

 if status != 0:
@@ -1100,6 +1113,9 @@ with open(buildfile, 'w') as f:
            command = {ninja} -C $subdir $target
            restat = 1
            description = NINJA $out
+        rule run
+            command = $in > $out
+            description = GEN $out
        rule copy
            command = cp $in $out
            description = COPY $out
@@ -1172,6 +1188,10 @@ with open(buildfile, 'w') as f:
            if binary.endswith('.a'):
                f.write('build $builddir/{}/{}: ar.{} {}\n'.format(mode, binary, mode, str.join(' ', objs)))
            else:
+                objs.extend(['$builddir/' + mode + '/' + artifact for artifact in [
+                    'libdeflate/libdeflate.a'
+                ]])
+                objs.append('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o')
                if binary.startswith('tests/'):
                    local_libs = '$libs'
                    if binary not in tests_not_using_seastar_test_framework or binary in pure_boost_tests:
@@ -1213,6 +1233,12 @@ with open(buildfile, 'w') as f:
                    antlr3_grammars.add(src)
                else:
                    raise Exception('No rule for ' + src)
+        compiles['$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o'] = '$builddir/' + mode + '/gen/utils/gz/crc_combine_table.cc'
+        compiles['$builddir/' + mode + '/utils/gz/gen_crc_combine_table.o'] = 'utils/gz/gen_crc_combine_table.cc'
+        f.write('build {}: run {}\n'.format('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.cc',
+                                            '$builddir/' + mode + '/utils/gz/gen_crc_combine_table'))
+        f.write('build {}: link.{} {}\n'.format('$builddir/' + mode + '/utils/gz/gen_crc_combine_table', mode,
+                                                '$builddir/' + mode + '/utils/gz/gen_crc_combine_table.o'))
        for obj in compiles:
            src = compiles[obj]
            gen_headers = list(ragels.keys())
@@ -1262,6 +1288,10 @@ with open(buildfile, 'w') as f:
            ''').format(**locals()))
        f.write('build build/$mode/scylla-package.tar: package build/{mode}/scylla build/{mode}/iotune\n'.format(**locals()))
        f.write('    mode = {mode}\n'.format(**locals()))
+        f.write('rule libdeflate.{mode}\n'.format(**locals()))
+        f.write('    command = make -C libdeflate BUILD_DIR=../build/{mode}/libdeflate/ CFLAGS="{libdeflate_cflags}" CC={args.cc}\n'.format(**locals()))
+        f.write('build build/{mode}/libdeflate/libdeflate.a: libdeflate.{mode}\n'.format(**locals()))
+
    f.write('build {}: phony\n'.format(seastar_deps))
    f.write(textwrap.dedent('''\
        rule configure
--- a/converting_mutation_partition_applier.hh
+++ b/converting_mutation_partition_applier.hh
@@ -38,44 +38,44 @@ private:
    static bool is_compatible(const column_definition& new_def, const data_type& old_type, column_kind kind) {
        return ::is_compatible(new_def.kind, kind) && new_def.type->is_value_compatible_with(*old_type);
    }
+    static atomic_cell upgrade_cell(const abstract_type& new_type, const abstract_type& old_type, atomic_cell_view cell,
+                                    atomic_cell::collection_member cm = atomic_cell::collection_member::no) {
+        if (cell.is_live() && !old_type.is_counter()) {
+            if (cell.is_live_and_has_ttl()) {
+                return atomic_cell::make_live(new_type, cell.timestamp(), cell.value().linearize(), cell.expiry(), cell.ttl(), cm);
+            }
+            return atomic_cell::make_live(new_type, cell.timestamp(), cell.value().linearize(), cm);
+        } else {
+            return atomic_cell(new_type, cell);
+        }
+    }
    static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, atomic_cell_view cell) {
        if (!is_compatible(new_def, old_type, kind) || cell.timestamp() <= new_def.dropped_at()) {
            return;
        }
-        auto new_cell = [&] {
-            if (cell.is_live() && !old_type->is_counter()) {
-                if (cell.is_live_and_has_ttl()) {
-                    return atomic_cell_or_collection(
-                        atomic_cell::make_live(*new_def.type, cell.timestamp(), cell.value().linearize(), cell.expiry(), cell.ttl())
-                    );
-                }
-                return atomic_cell_or_collection(
-                    atomic_cell::make_live(*new_def.type, cell.timestamp(), cell.value().linearize())
-                );
-            } else {
-                return atomic_cell_or_collection(*new_def.type, cell);
-            }
-        }();
-        dst.apply(new_def, std::move(new_cell));
+        dst.apply(new_def, upgrade_cell(*new_def.type, *old_type, cell));
    }
    static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, collection_mutation_view cell) {
        if (!is_compatible(new_def, old_type, kind)) {
            return;
        }
      cell.data.with_linearized([&] (bytes_view cell_bv) {
-        auto&& ctype = static_pointer_cast<const collection_type_impl>(old_type);
-        auto old_view = ctype->deserialize_mutation_form(cell_bv);
+        auto new_ctype = static_pointer_cast<const collection_type_impl>(new_def.type);
+        auto old_ctype = static_pointer_cast<const collection_type_impl>(old_type);
+        auto old_view = old_ctype->deserialize_mutation_form(cell_bv);

-        collection_type_impl::mutation_view new_view;
+        collection_type_impl::mutation new_view;
        if (old_view.tomb.timestamp > new_def.dropped_at()) {
            new_view.tomb = old_view.tomb;
        }
        for (auto& c : old_view.cells) {
            if (c.second.timestamp() > new_def.dropped_at()) {
-                new_view.cells.emplace_back(std::move(c));
+                new_view.cells.emplace_back(c.first, upgrade_cell(*new_ctype->value_comparator(), *old_ctype->value_comparator(), c.second, atomic_cell::collection_member::yes));
            }
        }
-        dst.apply(new_def, ctype->serialize_mutation_form(std::move(new_view)));
+        if (new_view.tomb || !new_view.cells.empty()) {
+            dst.apply(new_def, new_ctype->serialize_mutation_form(std::move(new_view)));
+        }
      });
    }
 public:
--- a/cql3/error_collector.hh
+++ b/cql3/error_collector.hh
@@ -67,6 +67,12 @@ class error_collector : public error_listener<RecognizerType, ExceptionBaseType>
     */
    const sstring_view _query;

+    /**
+     * An empty bitset to be used as a workaround for AntLR null dereference
+     * bug.
+     */
+    static typename ExceptionBaseType::BitsetListType _empty_bit_list;
+
 public:

    /**
@@ -144,6 +150,14 @@ private:
            break;
        }
        default:
+            // AntLR Exception class has a bug of dereferencing a null
+            // pointer in the displayRecognitionError. The following
+            // if statement makes sure it will not be null before the
+            // call to that function (displayRecognitionError).
+            // bug reference: https://github.com/antlr/antlr3/issues/191
+            if (!ex->get_expectingSet()) {
+                ex->set_expectingSet(&_empty_bit_list);
+            }
            ex->displayRecognitionError(token_names, msg);
        }
        return msg.str();
@@ -345,4 +359,8 @@ private:
 #endif
 };

+template<typename RecognizerType, typename TokenType, typename ExceptionBaseType>
+typename ExceptionBaseType::BitsetListType
+error_collector<RecognizerType,TokenType,ExceptionBaseType>::_empty_bit_list = typename ExceptionBaseType::BitsetListType();
+
 }
--- a/cql3/restrictions/primary_key_restrictions.hh
+++ b/cql3/restrictions/primary_key_restrictions.hh
@@ -106,6 +106,11 @@ public:
    virtual size_t prefix_size() const {
        return 0;
    }
+
+    size_t prefix_size(const schema_ptr schema) const {
+        return 0;
+    }
+
 };

 template<>
@@ -129,5 +134,23 @@ inline bool primary_key_restrictions<clustering_key>::needs_filtering(const sche
    return false;
 }

+template<>
+inline size_t primary_key_restrictions<clustering_key>::prefix_size(const schema_ptr schema) const {
+    size_t count = 0;
+    if (schema->clustering_key_columns().empty()) {
+        return count;
+    }
+    auto column_defs = get_column_defs();
+    column_id expected_column_id = schema->clustering_key_columns().begin()->id;
+    for (auto&& cdef : column_defs) {
+        if (schema->position(*cdef) != expected_column_id) {
+            return count;
+        }
+        expected_column_id++;
+        count++;
+    }
+    return count;
+}
+
 }
 }
--- a/cql3/restrictions/single_column_primary_key_restrictions.hh
+++ b/cql3/restrictions/single_column_primary_key_restrictions.hh
@@ -166,19 +166,7 @@ public:
    }

    virtual size_t prefix_size() const override {
-        size_t count = 0;
-        if (_schema->clustering_key_columns().empty()) {
-            return count;
-        }
-        column_id expected_column_id = _schema->clustering_key_columns().begin()->id;
-        for (const auto& restriction_entry : _restrictions->restrictions()) {
-            if (_schema->position(*restriction_entry.first) != expected_column_id) {
-                return count;
-            }
-            expected_column_id++;
-            count++;
-        }
-        return count;
+        return primary_key_restrictions<ValueType>::prefix_size(_schema);
    }

    ::shared_ptr<single_column_primary_key_restrictions<clustering_key>> get_longest_prefix_restrictions() {
--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -337,6 +337,52 @@ const std::vector<::shared_ptr<restrictions>>& statement_restrictions::index_res
    return _index_restrictions;
 }

+std::optional<secondary_index::index> statement_restrictions::find_idx(secondary_index::secondary_index_manager& sim) const {
+    for (::shared_ptr<cql3::restrictions::restrictions> restriction : index_restrictions()) {
+        for (const auto& cdef : restriction->get_column_defs()) {
+            for (auto index : sim.list_indexes()) {
+                if (index.depends_on(*cdef)) {
+                    return std::make_optional<secondary_index::index>(std::move(index));
+                }
+            }
+        }
+    }
+    return std::nullopt;
+}
+
+std::vector<const column_definition*> statement_restrictions::get_column_defs_for_filtering(database& db) const {
+    std::vector<const column_definition*> column_defs_for_filtering;
+    if (need_filtering()) {
+        auto& sim = db.find_column_family(_schema).get_index_manager();
+        std::optional<secondary_index::index> opt_idx = find_idx(sim);
+        auto column_uses_indexing = [&opt_idx] (const column_definition* cdef) {
+            return opt_idx && opt_idx->depends_on(*cdef);
+        };
+        if (_partition_key_restrictions->needs_filtering(*_schema)) {
+            for (auto&& cdef : _partition_key_restrictions->get_column_defs()) {
+                if (!column_uses_indexing(cdef)) {
+                    column_defs_for_filtering.emplace_back(cdef);
+                }
+            }
+        }
+        if (_clustering_columns_restrictions->needs_filtering(*_schema)) {
+            column_id first_non_prefix_id = _schema->clustering_key_columns().begin()->id +
+                    _clustering_columns_restrictions->prefix_size(_schema);
+            for (auto&& cdef : _clustering_columns_restrictions->get_column_defs()) {
+                if ((cdef->id >= first_non_prefix_id) && (!column_uses_indexing(cdef))) {
+                    column_defs_for_filtering.emplace_back(cdef);
+                }
+            }
+        }
+        for (auto&& cdef : _nonprimary_key_restrictions->get_column_defs()) {
+            if (!column_uses_indexing(cdef)) {
+                column_defs_for_filtering.emplace_back(cdef);
+            }
+        }
+    }
+    return column_defs_for_filtering;
+}
+
 void statement_restrictions::process_partition_key_restrictions(bool has_queriable_index, bool for_view, bool allow_filtering) {
    // If there is a queriable index, no special condition are required on the other restrictions.
    // But we still need to know 2 things:
--- a/cql3/restrictions/statement_restrictions.hh
+++ b/cql3/restrictions/statement_restrictions.hh
@@ -163,6 +163,20 @@ public:
        return _clustering_columns_restrictions;
    }

+    /**
+     * Builds a possibly empty collection of column definitions that will be used for filtering
+     * @param db - the database context
+     * @return A list with the column definitions needed for filtering.
+     */
+    std::vector<const column_definition*> get_column_defs_for_filtering(database& db) const;
+
+    /**
+     * Determines the index to be used with the restriction.
+     * @param db - the database context (for extracting index manager)
+     * @return If an index can be used, an optional containing this index, otherwise an empty optional.
+     */
+    std::optional<secondary_index::index> find_idx(secondary_index::secondary_index_manager& sim) const;
+
    /**
     * Checks if the partition key has some unrestricted components.
     * @return <code>true</code> if the partition key has some unrestricted components, <code>false</code> otherwise.
--- a/cql3/selection/selection.cc
+++ b/cql3/selection/selection.cc
@@ -156,9 +156,9 @@ public:
        return _factories->uses_function(ks_name, function_name);
    }

-    virtual uint32_t add_column_for_ordering(const column_definition& c) override {
-        uint32_t index = selection::add_column_for_ordering(c);
-        _factories->add_selector_for_ordering(c, index);
+    virtual uint32_t add_column_for_post_processing(const column_definition& c) override {
+        uint32_t index = selection::add_column_for_post_processing(c);
+        _factories->add_selector_for_post_processing(c, index);
        return index;
    }

@@ -227,7 +227,7 @@ protected:
    return simple_selection::make(schema, std::move(columns), false);
 }

-uint32_t selection::add_column_for_ordering(const column_definition& c) {
+uint32_t selection::add_column_for_post_processing(const column_definition& c) {
    _columns.push_back(&c);
    _metadata->add_non_serialized_column(c.column_specification);
    return _columns.size() - 1;
@@ -339,14 +339,14 @@ std::unique_ptr<result_set> result_set_builder::build() {
    return std::move(_result_set);
 }

-bool result_set_builder::restrictions_filter::operator()(const selection& selection,
+bool result_set_builder::restrictions_filter::do_filter(const selection& selection,
                                                         const std::vector<bytes>& partition_key,
                                                         const std::vector<bytes>& clustering_key,
                                                         const query::result_row_view& static_row,
                                                         const query::result_row_view& row) const {
    static logging::logger rlogger("restrictions_filter");

-    if (_current_partition_key_does_not_match || _current_static_row_does_not_match) {
+    if (_current_partition_key_does_not_match || _current_static_row_does_not_match || _remaining == 0) {
        return false;
    }

@@ -427,6 +427,20 @@ bool result_set_builder::restrictions_filter::operator()(const selection& select
    return true;
 }

+bool result_set_builder::restrictions_filter::operator()(const selection& selection,
+                                                         const std::vector<bytes>& partition_key,
+                                                         const std::vector<bytes>& clustering_key,
+                                                         const query::result_row_view& static_row,
+                                                         const query::result_row_view& row) const {
+    const bool accepted = do_filter(selection, partition_key, clustering_key, static_row, row);
+    if (!accepted) {
+        ++_rows_dropped;
+    } else if (_remaining > 0) {
+        --_remaining;
+    }
+    return accepted;
+}
+
 api::timestamp_type result_set_builder::timestamp_of(size_t idx) {
    return _timestamps[idx];
 }
--- a/cql3/selection/selection.hh
+++ b/cql3/selection/selection.hh
@@ -176,7 +176,7 @@ public:
    static ::shared_ptr<selection> wildcard(schema_ptr schema);
    static ::shared_ptr<selection> for_columns(schema_ptr schema, std::vector<const column_definition*> columns);

-    virtual uint32_t add_column_for_ordering(const column_definition& c);
+    virtual uint32_t add_column_for_post_processing(const column_definition& c);

    virtual bool uses_function(const sstring &ks_name, const sstring& function_name) const {
        return false;
@@ -259,20 +259,31 @@ public:
        }
        void reset() {
        }
+        uint32_t get_rows_dropped() const {
+            return 0;
+        }
    };
    class restrictions_filter {
        ::shared_ptr<restrictions::statement_restrictions> _restrictions;
        const query_options& _options;
        mutable bool _current_partition_key_does_not_match = false;
        mutable bool _current_static_row_does_not_match = false;
+        mutable uint32_t _rows_dropped = 0;
+        mutable uint32_t _remaining = 0;
    public:
        restrictions_filter() = default;
-        explicit restrictions_filter(::shared_ptr<restrictions::statement_restrictions> restrictions, const query_options& options) : _restrictions(restrictions), _options(options) {}
+        explicit restrictions_filter(::shared_ptr<restrictions::statement_restrictions> restrictions, const query_options& options, uint32_t remaining) : _restrictions(restrictions), _options(options), _remaining(remaining) {}
        bool operator()(const selection& selection, const std::vector<bytes>& pk, const std::vector<bytes>& ck, const query::result_row_view& static_row, const query::result_row_view& row) const;
        void reset() {
            _current_partition_key_does_not_match = false;
            _current_static_row_does_not_match = false;
+            _rows_dropped = 0;
        }
+        uint32_t get_rows_dropped() const {
+            return _rows_dropped;
+        }
+    private:
+        bool do_filter(const selection& selection, const std::vector<bytes>& pk, const std::vector<bytes>& ck, const query::result_row_view& static_row, const query::result_row_view& row) const;
    };

    result_set_builder(const selection& s, gc_clock::time_point now, cql_serialization_format sf);
@@ -372,7 +383,7 @@ public:
            }
        }

-        void accept_partition_end(const query::result_row_view& static_row) {
+        uint32_t accept_partition_end(const query::result_row_view& static_row) {
            if (_row_count == 0) {
                _builder.new_row();
                auto static_row_iterator = static_row.iterator();
@@ -386,6 +397,7 @@ public:
                    }
                }
            }
+            return _filter.get_rows_dropped();
        }
    };

--- a/cql3/selection/selector_factories.cc
+++ b/cql3/selection/selector_factories.cc
@@ -53,6 +53,7 @@ selector_factories::selector_factories(std::vector<::shared_ptr<selectable>> sel
    : _contains_write_time_factory(false)
    , _contains_ttl_factory(false)
    , _number_of_aggregate_factories(0)
+    , _number_of_factories_for_post_processing(0)
 {
    _factories.reserve(selectables.size());

@@ -76,8 +77,9 @@ bool selector_factories::uses_function(const sstring& ks_name, const sstring& fu
    return false;
 }

-void selector_factories::add_selector_for_ordering(const column_definition& def, uint32_t index) {
+void selector_factories::add_selector_for_post_processing(const column_definition& def, uint32_t index) {
    _factories.emplace_back(simple_selector::new_factory(def.name_as_text(), index, def.type));
+    ++_number_of_factories_for_post_processing;
 }

 std::vector<::shared_ptr<selector>> selector_factories::new_instances() const {
--- a/cql3/selection/selector_factories.hh
+++ b/cql3/selection/selector_factories.hh
@@ -74,6 +74,11 @@ private:
     */
    uint32_t _number_of_aggregate_factories;

+    /**
+     * The number of factories that are only for post processing.
+     */
+    uint32_t _number_of_factories_for_post_processing;
+
 public:
    /**
     * Creates a new <code>SelectorFactories</code> instance and collect the column definitions.
@@ -97,11 +102,12 @@ public:
    bool uses_function(const sstring& ks_name, const sstring& function_name) const;

    /**
-     * Adds a new <code>Selector.Factory</code> for a column that is needed only for ORDER BY purposes.
+     * Adds a new <code>Selector.Factory</code> for a column that is needed only for ORDER BY or post
+     * processing purposes.
     * @param def the column that is needed for ordering
     * @param index the index of the column definition in the Selection's list of columns
     */
-    void add_selector_for_ordering(const column_definition& def, uint32_t index);
+    void add_selector_for_post_processing(const column_definition& def, uint32_t index);

    /**
     * Checks if this <code>SelectorFactories</code> contains only factories for aggregates.
@@ -111,7 +117,7 @@ public:
     */
    bool contains_only_aggregate_functions() const {
        auto size = _factories.size();
-        return size != 0 && _number_of_aggregate_factories == size;
+        return size != 0 && _number_of_aggregate_factories  == (size - _number_of_factories_for_post_processing);
    }

    /**
--- a/cql3/statements/alter_table_statement.cc
+++ b/cql3/statements/alter_table_statement.cc
@@ -276,7 +276,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a

        auto type = validate_alter(schema, *def, *validator);
        // In any case, we update the column definition
-        cfm.with_altered_column_type(column_name->name(), type);
+        cfm.alter_column_type(column_name->name(), type);

        // We also have to validate the view types here. If we have a view which includes a column as part of
        // the clustering key, we need to make sure that it is indeed compatible.
@@ -285,7 +285,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
            if (view_def) {
                schema_builder builder(view);
                auto view_type = validate_alter(view, *view_def, *validator);
-                builder.with_altered_column_type(column_name->name(), std::move(view_type));
+                builder.alter_column_type(column_name->name(), std::move(view_type));
                view_updates.push_back(view_ptr(builder.build()));
            }
        }
@@ -306,7 +306,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
        } else {
            for (auto&& column_def : boost::range::join(schema->static_columns(), schema->regular_columns())) { // find
                if (column_def.name() == column_name->name()) {
-                    cfm.without_column(column_name->name());
+                    cfm.remove_column(column_name->name());
                    break;
                }
            }
@@ -349,7 +349,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
            auto to = entry.second->prepare_column_identifier(schema);

            validate_column_rename(db, *schema, *from, *to);
-            cfm.with_column_rename(from->name(), to->name());
+            cfm.rename_column(from->name(), to->name());

            // If the view includes a renamed column, it must be renamed in
            // the view table and the definition.
@@ -360,7 +360,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
                    auto view_from = entry.first->prepare_column_identifier(view);
                    auto view_to = entry.second->prepare_column_identifier(view);
                    validate_column_rename(db, *view, *view_from, *view_to);
-                    builder.with_column_rename(view_from->name(), view_to->name());
+                    builder.rename_column(view_from->name(), view_to->name());

                    auto new_where = util::rename_column_in_where_clause(
                            view->view_info()->where_clause(),
--- a/cql3/statements/alter_type_statement.cc
+++ b/cql3/statements/alter_type_statement.cc
@@ -110,7 +110,7 @@ void alter_type_statement::do_announce_migration(database& db, ::keyspace& ks, b
            if (t_opt) {
                modified = true;
                // We need to update this column
-                cfm.with_altered_column_type(column.name(), *t_opt);
+                cfm.alter_column_type(column.name(), *t_opt);
            }
        }
        if (modified) {
--- a/cql3/statements/create_index_statement.cc
+++ b/cql3/statements/create_index_statement.cc
@@ -88,6 +88,11 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
        throw exceptions::invalid_request_exception("Secondary indexes are not supported on materialized views");
    }

+    if (schema->is_dense()) {
+        throw exceptions::invalid_request_exception(
+                "Secondary indexes are not supported on COMPACT STORAGE tables that have clustering columns");
+    }
+
    std::vector<::shared_ptr<index_target>> targets;
    for (auto& raw_target : _raw_targets) {
        targets.emplace_back(raw_target->prepare(schema));
@@ -109,6 +114,11 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
                    sprint("No column definition found for column %s", *target->column));
        }

+        //NOTICE(sarna): Should be lifted after resolving issue #2963
+        if (cd->is_static()) {
+            throw exceptions::invalid_request_exception("Indexing static columns is not implemented yet.");
+        }
+
        if (cd->type->references_duration()) {
            using request_validations::check_false;
            const auto& ty = *cd->type;
@@ -122,8 +132,7 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
        }

        // Origin TODO: we could lift that limitation
-        if ((schema->is_dense() || !schema->thrift().has_compound_comparator()) &&
-            cd->kind != column_kind::regular_column) {
+        if ((schema->is_dense() || !schema->thrift().has_compound_comparator()) && cd->is_primary_key()) {
            throw exceptions::invalid_request_exception(
                    "Secondary indexes are not supported on PRIMARY KEY columns in COMPACT STORAGE tables");
        }
@@ -137,10 +146,15 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c

        bool is_map = dynamic_cast<const collection_type_impl *>(cd->type.get()) != nullptr
                      && dynamic_cast<const collection_type_impl *>(cd->type.get())->is_map();
-        bool is_frozen_collection = cd->type->is_collection() && !cd->type->is_multi_cell();
+        bool is_collection = cd->type->is_collection();
+        bool is_frozen_collection = is_collection && !cd->type->is_multi_cell();

        if (is_frozen_collection) {
            validate_for_frozen_collection(target);
+        } else if (is_collection) {
+            // NOTICE(sarna): should be lifted after #2962 (indexes on non-frozen collections) is implemented
+            throw exceptions::invalid_request_exception(
+                    sprint("Cannot create secondary index on non-frozen collection column %s", cd->name_as_text()));
        } else {
            validate_not_full_index(target);
            validate_is_values_index_if_target_column_not_collection(cd, target);
--- a/cql3/statements/create_view_statement.cc
+++ b/cql3/statements/create_view_statement.cc
@@ -84,7 +84,6 @@ create_view_statement::create_view_statement(
    , _clustering_keys{clustering_keys}
    , _if_not_exists{if_not_exists}
 {
-    service::get_local_storage_proxy().get_db().local().get_config().check_experimental("Creating materialized views");
    if (!service::get_local_storage_service().cluster_supports_materialized_views()) {
        throw exceptions::invalid_request_exception("Can't create materialized views until the whole cluster has been upgraded");
    }
@@ -315,6 +314,27 @@ future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::a
        throw exceptions::invalid_request_exception(sprint("No columns are defined for Materialized View other than primary key"));
    }

+    // The unique feature of a filter by a non-key column is that the
+    // value of such column can be updated - and also be expired with TTL
+    // and cause the view row to appear and disappear. We don't currently
+    // support support this case - see issue #3430, and neither does
+    // Cassandra - see see CASSANDRA-13798 and CASSANDRA-13832.
+    // Actually, as CASSANDRA-13798 explains, the problem is "the liveness of
+    // view row is now depending on multiple base columns (multiple filtered
+    // non-pk base column + base column used in view pk)". When the filtered
+    // column *is* the base column added to the view pk, we don't have this
+    // problem. And this case actually works correctly.
+    auto non_pk_restrictions = restrictions->get_non_pk_restriction();
+    if (non_pk_restrictions.size() == 1 && has_non_pk_column &&
+            std::find(target_primary_keys.begin(), target_primary_keys.end(), non_pk_restrictions.cbegin()->first) != target_primary_keys.end()) {
+        // This case (filter by new PK column of the view) works, as explained above
+    } else if (!non_pk_restrictions.empty()) {
+        auto column_names = ::join(", ", non_pk_restrictions | boost::adaptors::map_keys | boost::adaptors::transformed(std::mem_fn(&column_definition::name_as_text)));
+        throw exceptions::invalid_request_exception(sprint(
+                "Non-primary key columns cannot be restricted in the SELECT statement used for materialized view %s creation (got restrictions on: %s)",
+                column_family(), column_names));
+    }
+
    schema_builder builder{keyspace(), column_family()};
    auto add_columns = [this, &builder] (std::vector<const column_definition*>& defs, column_kind kind) mutable {
        for (auto* def : defs) {
--- a/cql3/statements/index_prop_defs.cc
+++ b/cql3/statements/index_prop_defs.cc
@@ -49,7 +49,7 @@ void cql3::statements::index_prop_defs::validate() {
    property_definitions::validate(keywords);

    if (is_custom && !custom_class) {
-        throw exceptions::invalid_request_exception("CUSTOM index requires specifiying the index class");
+        throw exceptions::invalid_request_exception("CUSTOM index requires specifying the index class");
    }

    if (!is_custom && custom_class) {
@@ -64,6 +64,16 @@ void cql3::statements::index_prop_defs::validate() {
                sprint("Cannot specify %s as a CUSTOM option",
                        db::index::secondary_index::custom_index_option_name));
    }
+
+    // Currently, Scylla does not support *any* class of custom index
+    // implementation. If in the future we do (e.g., SASI, or something
+    // new), we'll need to check for valid values here.
+    if (is_custom && custom_class) {
+        throw exceptions::invalid_request_exception(
+                format("Unsupported CUSTOM INDEX class {}. Note that currently, Scylla does not support SASI or any other CUSTOM INDEX class.",
+                        *custom_class));
+
+    }
 }

 index_options_map
--- a/cql3/statements/raw/select_statement.hh
+++ b/cql3/statements/raw/select_statement.hh
@@ -141,6 +141,10 @@ private:
    /** If ALLOW FILTERING was not specified, this verifies that it is not needed */
    void check_needs_filtering(::shared_ptr<restrictions::statement_restrictions> restrictions);

+    void ensure_filtering_columns_retrieval(database& db,
+                                            ::shared_ptr<selection::selection> selection,
+                                            ::shared_ptr<restrictions::statement_restrictions> restrictions);
+
    bool contains_alias(::shared_ptr<column_identifier> name);

    ::shared_ptr<column_specification> limit_receiver();
--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -383,8 +383,9 @@ select_statement::do_execute(service::storage_proxy& proxy,
    int32_t limit = get_limit(options);
    auto now = gc_clock::now();

+    const bool restrictions_need_filtering = _restrictions->need_filtering();
    ++_stats.reads;
-    _stats.filtered_reads += _restrictions->need_filtering();
+    _stats.filtered_reads += restrictions_need_filtering;

    auto command = ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(),
        make_partition_slice(options), limit, now, tracing::make_trace_info(state.get_trace_state()), query::max_partitions, utils::UUID(), options.get_timestamp(state));
@@ -396,37 +397,41 @@ select_statement::do_execute(service::storage_proxy& proxy,
    // An aggregation query will never be paged for the user, but we always page it internally to avoid OOM.
    // If we user provided a page_size we'll use that to page internally (because why not), otherwise we use our default
    // Note that if there are some nodes in the cluster with a version less than 2.0, we can't use paging (CASSANDRA-6707).
-    auto aggregate = _selection->is_aggregate();
-    if (aggregate && page_size <= 0) {
+    const bool aggregate = _selection->is_aggregate();
+    const bool nonpaged_filtering = restrictions_need_filtering && page_size <= 0;
+    if (aggregate || nonpaged_filtering) {
        page_size = DEFAULT_COUNT_PAGE_SIZE;
    }

    auto key_ranges = _restrictions->get_partition_key_ranges(options);

-    if (!aggregate && (page_size <= 0
+    if (!aggregate && !restrictions_need_filtering && (page_size <= 0
            || !service::pager::query_pagers::may_need_paging(*_schema, page_size,
                    *command, key_ranges))) {
        return execute(proxy, command, std::move(key_ranges), state, options, now);
    }

    command->slice.options.set<query::partition_slice::option::allow_short_read>();
-    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
+    auto timeout_duration = options.get_timeout_config().*get_timeout_config_selector();
    auto p = service::pager::query_pagers::pager(_schema, _selection,
-            state, options, command, std::move(key_ranges), _stats, _restrictions->need_filtering() ? _restrictions : nullptr);
+            state, options, command, std::move(key_ranges), _stats, restrictions_need_filtering ? _restrictions : nullptr);

-    if (aggregate) {
+    if (aggregate || nonpaged_filtering) {
        return do_with(
                cql3::selection::result_set_builder(*_selection, now,
                        options.get_cql_serialization_format()),
-                [this, p, page_size, now, timeout](auto& builder) {
+                [this, p, page_size, now, timeout_duration, restrictions_need_filtering](auto& builder) {
                    return do_until([p] {return p->is_exhausted();},
-                            [p, &builder, page_size, now, timeout] {
+                            [p, &builder, page_size, now, timeout_duration] {
+                                auto timeout = db::timeout_clock::now() + timeout_duration;
                                return p->fetch_page(builder, page_size, now, timeout);
                            }
-                    ).then([this, &builder] {
+                    ).then([this, &builder, restrictions_need_filtering] {
                                auto rs = builder.build();
+                                if (restrictions_need_filtering) {
+                                    _stats.filtered_rows_matched_total += rs->size();
+                                }
                                update_stats_rows_read(rs->size());
-                                _stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
                                auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
                                return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
                            });
@@ -439,7 +444,8 @@ select_statement::do_execute(service::storage_proxy& proxy,
                        " you must either remove the ORDER BY or the IN and sort client side, or disable paging for this query");
    }

-    if (_selection->is_trivial() && !_restrictions->need_filtering()) {
+    auto timeout = db::timeout_clock::now() + timeout_duration;
+    if (_selection->is_trivial() && !restrictions_need_filtering) {
        return p->fetch_page_generator(page_size, now, timeout, _stats).then([this, p, limit] (result_generator generator) {
            auto meta = [&] () -> shared_ptr<const cql3::metadata> {
                if (!p->is_exhausted()) {
@@ -458,14 +464,16 @@ select_statement::do_execute(service::storage_proxy& proxy,
    }

    return p->fetch_page(page_size, now, timeout).then(
-            [this, p, &options, limit, now](std::unique_ptr<cql3::result_set> rs) {
+            [this, p, &options, now, restrictions_need_filtering](std::unique_ptr<cql3::result_set> rs) {

                if (!p->is_exhausted()) {
                    rs->get_metadata().set_paging_state(p->state());
                }

+                if (restrictions_need_filtering) {
+                    _stats.filtered_rows_matched_total += rs->size();
+                }
                update_stats_rows_read(rs->size());
-                _stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
                auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
                return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
            });
@@ -492,15 +500,9 @@ generate_base_key_from_index_pk(const partition_key& index_pk, const clustering_
    return KeyType::from_range(exploded_base_key);
 }

-future<shared_ptr<cql_transport::messages::result_message>>
-indexed_table_select_statement::execute_base_query(
-        service::storage_proxy& proxy,
-        dht::partition_range_vector&& partition_ranges,
-        service::query_state& state,
-        const query_options& options,
-        gc_clock::time_point now,
-        ::shared_ptr<const service::pager::paging_state> paging_state) {
-    auto cmd = ::make_lw_shared<query::read_command>(
+lw_shared_ptr<query::read_command>
+indexed_table_select_statement::prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging) {
+    lw_shared_ptr<query::read_command> cmd = ::make_lw_shared<query::read_command>(
            _schema->id(),
            _schema->version(),
            make_partition_slice(options),
@@ -510,9 +512,25 @@ indexed_table_select_statement::execute_base_query(
            query::max_partitions,
            utils::UUID(),
            options.get_timestamp(state));
-    if (options.get_page_size() > 0) {
+    if (use_paging) {
        cmd->slice.options.set<query::partition_slice::option::allow_short_read>();
+        cmd->slice.options.set<query::partition_slice::option::send_partition_key>();
+        if (_schema->clustering_key_size() > 0) {
+            cmd->slice.options.set<query::partition_slice::option::send_clustering_key>();
+        }
    }
+    return cmd;
+}
+
+future<shared_ptr<cql_transport::messages::result_message>>
+indexed_table_select_statement::execute_base_query(
+        service::storage_proxy& proxy,
+        dht::partition_range_vector&& partition_ranges,
+        service::query_state& state,
+        const query_options& options,
+        gc_clock::time_point now,
+        ::shared_ptr<const service::pager::paging_state> paging_state) {
+    auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state));
    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
    dht::partition_range_vector per_vnode_ranges;
    per_vnode_ranges.reserve(partition_ranges.size());
@@ -586,19 +604,7 @@ indexed_table_select_statement::execute_base_query(
        const query_options& options,
        gc_clock::time_point now,
        ::shared_ptr<const service::pager::paging_state> paging_state) {
-    auto cmd = make_lw_shared<query::read_command>(
-            _schema->id(),
-            _schema->version(),
-            make_partition_slice(options),
-            get_limit(options),
-            now,
-            tracing::make_trace_info(state.get_trace_state()),
-            query::max_partitions,
-            utils::UUID(),
-            options.get_timestamp(state));
-    if (options.get_page_size() > 0) {
-        cmd->slice.options.set<query::partition_slice::option::allow_short_read>();
-    }
+    auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state));
    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();

    struct base_query_state {
@@ -714,7 +720,8 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
                                  const query_options& options,
                                  gc_clock::time_point now)
 {
-    bool fast_path = !needs_post_query_ordering() && _selection->is_trivial() && !_restrictions->need_filtering();
+    const bool restrictions_need_filtering = _restrictions->need_filtering();
+    const bool fast_path = !needs_post_query_ordering() && _selection->is_trivial() && !restrictions_need_filtering;
    if (fast_path) {
        return make_shared<cql_transport::messages::result_message::rows>(result(
            result_generator(_schema, std::move(results), std::move(cmd), _selection, _stats),
@@ -724,12 +731,12 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu

    cql3::selection::result_set_builder builder(*_selection, now,
            options.get_cql_serialization_format());
-    if (_restrictions->need_filtering()) {
+    if (restrictions_need_filtering) {
        results->ensure_counts();
        _stats.filtered_rows_read_total += *results->row_count();
        query::result_view::consume(*results, cmd->slice,
                cql3::selection::result_set_builder::visitor(builder, *_schema,
-                        *_selection, cql3::selection::result_set_builder::restrictions_filter(_restrictions, options)));
+                        *_selection, cql3::selection::result_set_builder::restrictions_filter(_restrictions, options, cmd->row_limit)));
    } else {
        query::result_view::consume(*results, cmd->slice,
                cql3::selection::result_set_builder::visitor(builder, *_schema,
@@ -745,7 +752,7 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
        rs->trim(cmd->row_limit);
    }
    update_stats_rows_read(rs->size());
-    _stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
+    _stats.filtered_rows_matched_total += restrictions_need_filtering ? rs->size() : 0;
    return ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
 }

@@ -774,7 +781,8 @@ indexed_table_select_statement::prepare(database& db,
                                        ordering_comparator_type ordering_comparator,
                                        ::shared_ptr<term> limit, cql_stats &stats)
 {
-    auto index_opt = find_idx(db, schema, restrictions);
+    auto& sim = db.find_column_family(schema).get_index_manager();
+    auto index_opt = restrictions->find_idx(sim);
    if (!index_opt) {
        throw std::runtime_error("No index found.");
    }
@@ -798,24 +806,6 @@ indexed_table_select_statement::prepare(database& db,

 }

-
-stdx::optional<secondary_index::index> indexed_table_select_statement::find_idx(database& db,
-                                                                                schema_ptr schema,
-                                                                                ::shared_ptr<restrictions::statement_restrictions> restrictions)
-{
-    auto& sim = db.find_column_family(schema).get_index_manager();
-    for (::shared_ptr<cql3::restrictions::restrictions> restriction : restrictions->index_restrictions()) {
-        for (const auto& cdef : restriction->get_column_defs()) {
-            for (auto index : sim.list_indexes()) {
-                if (index.depends_on(*cdef)) {
-                    return stdx::make_optional<secondary_index::index>(std::move(index));
-                }
-            }
-        }
-    }
-    return stdx::nullopt;
-}
-
 indexed_table_select_statement::indexed_table_select_statement(schema_ptr schema, uint32_t bound_terms,
                                                           ::shared_ptr<parameters> parameters,
                                                           ::shared_ptr<selection::selection> selection,
@@ -882,7 +872,6 @@ static void append_base_key_to_index_ck(std::vector<bytes_view>& exploded_index_
    auto paging_state_copy = ::make_shared<service::pager::paging_state>(service::pager::paging_state(*paging_state));
    paging_state_copy->set_partition_key(std::move(index_pk));
    paging_state_copy->set_clustering_key(std::move(index_ck));
-    paging_state_copy->set_remaining(query::max_rows);
    return std::move(paging_state_copy);
 }

@@ -1219,6 +1208,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(database& db, cql_
    }

    check_needs_filtering(restrictions);
+    ensure_filtering_columns_retrieval(db, selection, restrictions);

    ::shared_ptr<cql3::statements::select_statement> stmt;
    if (restrictions->uses_secondary_indexing()) {
@@ -1357,7 +1347,7 @@ select_statement::get_ordering_comparator(schema_ptr schema,
        }
        auto index = selection->index_of(*def);
        if (index < 0) {
-            index = selection->add_column_for_ordering(*def);
+            index = selection->add_column_for_post_processing(*def);
        }

        sorters.emplace_back(index, def->type);
@@ -1444,6 +1434,23 @@ void select_statement::check_needs_filtering(::shared_ptr<restrictions::statemen
    }
 }

+/**
+ * Adds columns that are needed for the purpose of filtering to the selection.
+ * The columns that are added to the selection are columns that
+ * are needed for filtering on the coordinator but are not part of the selection.
+ * The columns are added with a meta-data indicating they are not to be returned
+ * to the user.
+ */
+void select_statement::ensure_filtering_columns_retrieval(database& db,
+                                        ::shared_ptr<selection::selection> selection,
+                                        ::shared_ptr<restrictions::statement_restrictions> restrictions) {
+    for (auto&& cdef : restrictions->get_column_defs_for_filtering(db)) {
+        if (!selection->has_column(*cdef)) {
+            selection->add_column_for_post_processing(*cdef);
+        }
+    }
+}
+
 bool select_statement::contains_alias(::shared_ptr<column_identifier> name) {
    return std::any_of(_select_clause.begin(), _select_clause.end(), [name] (auto raw) {
        return raw->alias && *name == *raw->alias;
--- a/cql3/statements/select_statement.hh
+++ b/cql3/statements/select_statement.hh
@@ -186,10 +186,6 @@ public:
                                   schema_ptr view_schema);

 private:
-    static stdx::optional<secondary_index::index> find_idx(database& db,
-                                                           schema_ptr schema,
-                                                           ::shared_ptr<restrictions::statement_restrictions> restrictions);
-
    virtual future<::shared_ptr<cql_transport::messages::result_message>> do_execute(service::storage_proxy& proxy,
                                                                                     service::query_state& state, const query_options& options) override;

@@ -214,6 +210,9 @@ private:
            gc_clock::time_point now,
            ::shared_ptr<const service::pager::paging_state> paging_state);

+    lw_shared_ptr<query::read_command>
+    prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging);
+
    future<shared_ptr<cql_transport::messages::result_message>>
    execute_base_query(
            service::storage_proxy& proxy,
--- a/database.cc
+++ b/database.cc
@@ -76,6 +76,8 @@
 #include "sstables/compaction_manager.hh"
 #include "sstables/compaction_backlog_manager.hh"
 #include "sstables/progress_monitor.hh"
+#include "auth/common.hh"
+#include "tracing/trace_keyspace_helper.hh"

 #include "checked-file-impl.hh"
 #include "disk-error-handler.hh"
@@ -178,6 +180,18 @@ bool is_system_keyspace(const sstring& name) {
    return system_keyspaces.find(name) != system_keyspaces.end();
 }

+static const std::unordered_set<sstring> internal_keyspaces = {
+        db::system_distributed_keyspace::NAME,
+        db::system_keyspace::NAME,
+        db::schema_tables::NAME,
+        auth::meta::AUTH_KS,
+        tracing::trace_keyspace_helper::KEYSPACE_NAME
+};
+
+bool is_internal_keyspace(const sstring& name) {
+    return internal_keyspaces.find(name) != internal_keyspaces.end();
+}
+
 // Used for tests where the CF exists without a database object. We need to pass a valid
 // dirty_memory manager in that case.
 thread_local dirty_memory_manager default_dirty_memory_manager;
@@ -684,9 +698,11 @@ table::make_reader(schema_ptr s,
    return make_combined_reader(s, std::move(readers), fwd, fwd_mr);
 }

-sstables::shared_sstable
-table::make_streaming_sstable_for_write() {
+sstables::shared_sstable table::make_streaming_sstable_for_write(std::optional<sstring> subdir) {
    sstring dir = _config.datadir;
+    if (subdir) {
+        dir += "/" + *subdir;
+    }
    auto newtab = sstables::make_sstable(_schema,
            dir, calculate_generation_for_new_table(),
            get_highest_supported_format(),
@@ -826,7 +842,11 @@ void table::add_sstable(sstables::shared_sstable sstable, const std::vector<unsi
    new_sstables->insert(sstable);
    _sstables = std::move(new_sstables);
    update_stats_for_new_sstable(sstable->bytes_on_disk(), shards_for_the_sstable);
-    _compaction_strategy.get_backlog_tracker().add_sstable(sstable);
+    if (sstable->is_staging()) {
+        _sstables_staging.emplace(sstable->generation(), sstable);
+    } else {
+        _compaction_strategy.get_backlog_tracker().add_sstable(sstable);
+    }
 }

 future<>
@@ -1082,12 +1102,14 @@ table::start() {
 future<>
 table::stop() {
    return _async_gate.close().then([this] {
-        return when_all(_memtables->request_flush(), _streaming_memtables->request_flush()).discard_result().finally([this] {
-            return _compaction_manager.remove(this).then([this] {
-                // Nest, instead of using when_all, so we don't lose any exceptions.
-                return _streaming_flush_gate.close();
-            }).then([this] {
-                return _sstable_deletion_gate.close();
+        return when_all(await_pending_writes(), await_pending_reads()).discard_result().finally([this] {
+            return when_all(_memtables->request_flush(), _streaming_memtables->request_flush()).discard_result().finally([this] {
+                return _compaction_manager.remove(this).then([this] {
+                    // Nest, instead of using when_all, so we don't lose any exceptions.
+                    return _streaming_flush_gate.close();
+                }).then([this] {
+                    return _sstable_deletion_gate.close();
+                });
            });
        });
    });
@@ -1346,6 +1368,7 @@ table::on_compaction_completion(const std::vector<sstables::shared_sstable>& new

    // This is done in the background, so we can consider this compaction completed.
    seastar::with_gate(_sstable_deletion_gate, [this, sstables_to_remove] {
+       return with_semaphore(_sstable_deletion_sem, 1, [this, sstables_to_remove = std::move(sstables_to_remove)] {
        return sstables::delete_atomically(sstables_to_remove, *get_large_partition_handler()).then_wrapped([this, sstables_to_remove] (future<> f) {
            std::exception_ptr eptr;
            try {
@@ -1369,6 +1392,7 @@ table::on_compaction_completion(const std::vector<sstables::shared_sstable>& new
                return make_exception_future<>(eptr);
            }
            return make_ready_future<>();
+         });
        }).then([this] {
            // refresh underlying data source in row cache to prevent it from holding reference
            // to sstables files which were previously deleted.
@@ -1613,7 +1637,9 @@ std::vector<sstables::shared_sstable> table::select_sstables(const dht::partitio

 std::vector<sstables::shared_sstable> table::candidates_for_compaction() const {
    return boost::copy_range<std::vector<sstables::shared_sstable>>(*get_sstables()
-        | boost::adaptors::filtered([this] (auto& sst) { return !_sstables_need_rewrite.count(sst->generation()); }));
+            | boost::adaptors::filtered([this] (auto& sst) {
+        return !_sstables_need_rewrite.count(sst->generation()) && !_sstables_staging.count(sst->generation());
+    }));
 }

 std::vector<sstables::shared_sstable> table::sstables_need_rewrite() const {
@@ -1671,9 +1697,9 @@ future<> distributed_loader::open_sstable(distributed<database>& db, sstables::e
    // to distribute evenly the resource usage among all shards.

    return db.invoke_on(column_family::calculate_shard_from_sstable_generation(comps.generation),
-            [&db, comps = std::move(comps), func = std::move(func), pc] (database& local) {
+            [&db, comps = std::move(comps), func = std::move(func), &pc] (database& local) {

-        return with_semaphore(local.sstable_load_concurrency_sem(), 1, [&db, &local, comps = std::move(comps), func = std::move(func), pc] {
+        return with_semaphore(local.sstable_load_concurrency_sem(), 1, [&db, &local, comps = std::move(comps), func = std::move(func), &pc] {
            auto& cf = local.find_column_family(comps.ks, comps.cf);

            auto f = sstables::sstable::load_shared_components(cf.schema(), comps.sstdir, comps.generation, comps.version, comps.format, pc);
@@ -1969,6 +1995,12 @@ future<sstables::entry_descriptor> distributed_loader::probe_file(distributed<da
    }
    auto cf_sstable_open = [sstdir, comps, fname] (column_family& cf, sstables::foreign_sstable_open_info info) {
        cf.update_sstables_known_generation(comps.generation);
+        if (shared_sstable sst = cf.get_staging_sstable(comps.generation)) {
+            dblog.warn("SSTable {} is already present in staging/ directory. Moving from staging will be retried.", sst->get_filename());
+            return seastar::async([sst = std::move(sst), comps = std::move(comps)] () {
+                sst->move_to_new_dir_in_thread(comps.sstdir, comps.generation);
+            });
+        }
        {
            auto i = boost::range::find_if(*cf._sstables->all(), [gen = comps.generation] (sstables::shared_sstable sst) { return sst->generation() == gen; });
            if (i != cf._sstables->all()->end()) {
@@ -2154,9 +2186,6 @@ database::database(const db::config& cfg, database_config dbcfg)
        [this] {
            ++_stats->sstable_read_queue_overloaded;
            return std::make_exception_ptr(std::runtime_error("sstable inactive read queue overloaded"));
-        },
-        [this] {
-            return _querier_cache.evict_one();
        })
    // No timeouts or queue length limits - a failure here can kill an entire repair.
    // Trust the caller to limit concurrency.
@@ -2168,12 +2197,11 @@ database::database(const db::config& cfg, database_config dbcfg)
    , _version(empty_version)
    , _compaction_manager(make_compaction_manager(*_cfg, dbcfg))
    , _enable_incremental_backups(cfg.incremental_backups())
-    , _querier_cache(dbcfg.available_memory * 0.04)
+    , _querier_cache(_read_concurrency_sem, dbcfg.available_memory * 0.04)
    , _large_partition_handler(std::make_unique<db::cql_table_large_partition_handler>(_cfg->compaction_large_partition_warning_threshold_mb()*1024*1024))
    , _result_memory_limiter(dbcfg.available_memory / 10)
 {
    local_schema_registry().init(*this); // TODO: we're never unbound.
-    _compaction_manager->start();
    setup_metrics();

    _row_cache_tracker.set_compaction_scheduling_group(dbcfg.memory_compaction_scheduling_group);
@@ -2299,6 +2327,9 @@ database::setup_metrics() {
                       sm::description("Counts sstables that survived the clustering key filtering. "
                                       "High value indicates that bloom filter is not very efficient and still have to access a lot of sstables to get data.")),

+        sm::make_derive("dropped_view_updates", _cf_stats.dropped_view_updates,
+                       sm::description("Counts the number of view updates that have been dropped due to cluster overload. ")),
+
        sm::make_derive("total_writes", _stats->total_writes,
                       sm::description("Counts the total number of successful write operations performed by this shard.")),

@@ -2316,6 +2347,9 @@ database::setup_metrics() {
                       sm::description("Counts the total number of failed read operations. "
                                       "Add the total_reads to this value to get the total amount of reads issued on this shard.")),

+        sm::make_current_bytes("view_update_backlog", [this] { return get_view_update_backlog().current; },
+                       sm::description("Holds the current size in bytes of the pending view updates for all tables")),
+
        sm::make_derive("querier_cache_lookups", _querier_cache.get_stats().lookups,
                       sm::description("Counts querier cache lookups (paging queries)")),

@@ -2420,6 +2454,9 @@ database::setup_metrics() {
 }

 database::~database() {
+    _read_concurrency_sem.clear_inactive_reads();
+    _streaming_concurrency_sem.clear_inactive_reads();
+    _system_read_concurrency_sem.clear_inactive_reads();
 }

 void database::update_version(const utils::UUID& version) {
@@ -2450,6 +2487,8 @@ future<> distributed_loader::populate_keyspace(distributed<database>& db, sstrin
                auto sstdir = ks.column_family_directory(ksdir, cfname, uuid);
                dblog.info("Keyspace {}: Reading CF {} id={} version={}", ks_name, cfname, uuid, s->version());
                return ks.make_directory_for_column_family(cfname, uuid).then([&db, sstdir, uuid, ks_name, cfname] {
+                    return distributed_loader::populate_column_family(db, sstdir + "/staging", ks_name, cfname);
+                }).then([&db, sstdir, uuid, ks_name, cfname] {
                    return distributed_loader::populate_column_family(db, sstdir, ks_name, cfname);
                }).handle_exception([ks_name, cfname, sstdir](std::exception_ptr eptr) {
                    std::string msg =
@@ -2903,6 +2942,7 @@ keyspace::make_column_family_config(const schema& s, const db::config& db_config
    cfg.enable_metrics_reporting = db_config.enable_keyspace_column_family_metrics();
    cfg.large_partition_handler = lp_handler;
    cfg.view_update_concurrency_semaphore = _config.view_update_concurrency_semaphore;
+    cfg.view_update_concurrency_semaphore_limit = _config.view_update_concurrency_semaphore_limit;

    return cfg;
 }
@@ -2930,6 +2970,7 @@ keyspace::make_directory_for_column_family(const sstring& name, utils::UUID uuid
            io_check(recursive_touch_directory, cfdir).get();
        }
        io_check(touch_directory, cfdirs[0] + "/upload").get();
+        io_check(touch_directory, cfdirs[0] + "/staging").get();
    });
 }

@@ -3699,6 +3740,7 @@ database::make_keyspace_config(const keyspace_metadata& ksm) {
    cfg.enable_metrics_reporting = _cfg->enable_keyspace_column_family_metrics();

    cfg.view_update_concurrency_semaphore = &_view_update_concurrency_sem;
+    cfg.view_update_concurrency_semaphore_limit = max_memory_pending_view_updates();
    return cfg;
 }

@@ -3796,6 +3838,8 @@ database::stop() {
        return parallel_for_each(_column_families, [this] (auto& val_pair) {
            return val_pair.second->stop();
        });
+    }).then([this] {
+        return _view_update_concurrency_sem.wait(max_memory_pending_view_updates());
    }).then([this] {
        if (_commitlog != nullptr) {
            return _commitlog->release();
@@ -4051,6 +4095,7 @@ seal_snapshot(sstring jsondir) {

 future<> table::snapshot(sstring name) {
    return flush().then([this, name = std::move(name)]() {
+       return with_semaphore(_sstable_deletion_sem, 1, [this, name = std::move(name)]() {
        auto tables = boost::copy_range<std::vector<sstables::shared_sstable>>(*_sstables->all());
        return do_with(std::move(tables), [this, name](std::vector<sstables::shared_sstable> & tables) {
            auto jsondir = _config.datadir + "/snapshots/" + name;
@@ -4110,6 +4155,7 @@ future<> table::snapshot(sstring name) {
                });
            });
        });
+       });
    });
 }

@@ -4239,6 +4285,7 @@ future<> table::fail_streaming_mutations(utils::UUID plan_id) {
    _streaming_memtables_big.erase(it);
    return entry->flush_in_progress.close().then([this, entry] {
        for (auto&& sst : entry->sstables) {
+            sst.monitor->write_failed();
            sst.sstable->mark_for_deletion();
        }
    });
@@ -4417,6 +4464,14 @@ std::vector<view_ptr> table::affected_views(const schema_ptr& base, const mutati
    }));
 }

+static size_t memory_usage_of(const std::vector<frozen_mutation_and_schema>& ms) {
+    // Overhead of sending a view mutation, in terms of data structures used by the storage_proxy.
+    constexpr size_t base_overhead_bytes = 256;
+    return boost::accumulate(ms | boost::adaptors::transformed([] (const frozen_mutation_and_schema& m) {
+        return m.fm.representation().size();
+    }), size_t{base_overhead_bytes * ms.size()});
+}
+
 /**
 * Given some updates on the base table and the existing values for the rows affected by that update, generates the
 * mutations to be applied to the base table's views, and sends them to the paired view replicas.
@@ -4433,75 +4488,15 @@ std::vector<view_ptr> table::affected_views(const schema_ptr& base, const mutati
 future<> table::generate_and_propagate_view_updates(const schema_ptr& base,
        std::vector<view_ptr>&& views,
        mutation&& m,
-        flat_mutation_reader_opt existings,
-        db::timeout_clock::time_point timeout) const {
+        flat_mutation_reader_opt existings) const {
    auto base_token = m.token();
-    return db::view::generate_view_updates(base,
-                        std::move(views),
-                        flat_mutation_reader_from_mutations({std::move(m)}),
-                        std::move(existings)).then([this, timeout, base_token = std::move(base_token)] (auto&& updates) mutable {
-        return seastar::get_units(*_config.view_update_concurrency_semaphore, 1, timeout).then(
-                [this, base_token = std::move(base_token), updates = std::move(updates)] (auto units) mutable {
-            db::view::mutate_MV(std::move(base_token), std::move(updates), _view_stats).handle_exception([units = std::move(units)] (auto ignored) { });
-        });
-    });
-}
-
-/**
- * Given an update for the base table, calculates the set of potentially affected views,
- * generates the relevant updates, and sends them to the paired view replicas.
- */
-future<row_locker::lock_holder> table::push_view_replica_updates(const schema_ptr& s, const frozen_mutation& fm, db::timeout_clock::time_point timeout) const {
-    //FIXME: Avoid unfreezing here.
-    auto m = fm.unfreeze(s);
-    auto& base = schema();
-    m.upgrade(base);
-    auto views = affected_views(base, m);
-    if (views.empty()) {
-        return make_ready_future<row_locker::lock_holder>();
-    }
-    auto cr_ranges = db::view::calculate_affected_clustering_ranges(*base, m.decorated_key(), m.partition(), views);
-    if (cr_ranges.empty()) {
-        return generate_and_propagate_view_updates(base, std::move(views), std::move(m), { }, timeout).then([] {
-                // In this case we are not doing a read-before-write, just a
-                // write, so no lock is needed.
-                return make_ready_future<row_locker::lock_holder>();
-        });
-    }
-    // We read the whole set of regular columns in case the update now causes a base row to pass
-    // a view's filters, and a view happens to include columns that have no value in this update.
-    // Also, one of those columns can determine the lifetime of the base row, if it has a TTL.
-    auto columns = boost::copy_range<std::vector<column_id>>(
-            base->regular_columns() | boost::adaptors::transformed(std::mem_fn(&column_definition::id)));
-    query::partition_slice::option_set opts;
-    opts.set(query::partition_slice::option::send_partition_key);
-    opts.set(query::partition_slice::option::send_clustering_key);
-    opts.set(query::partition_slice::option::send_timestamp);
-    opts.set(query::partition_slice::option::send_ttl);
-    auto slice = query::partition_slice(
-            std::move(cr_ranges), { }, std::move(columns), std::move(opts), { }, cql_serialization_format::internal(), query::max_rows);
-    // Take the shard-local lock on the base-table row or partition as needed.
-    // We'll return this lock to the caller, which will release it after
-    // writing the base-table update.
-    future<row_locker::lock_holder> lockf = local_base_lock(base, m.decorated_key(), slice.default_row_ranges(), timeout);
-    return lockf.then([m = std::move(m), slice = std::move(slice), views = std::move(views), base, this, timeout] (row_locker::lock_holder lock) {
-      return do_with(
-        dht::partition_range::make_singular(m.decorated_key()),
-        std::move(slice),
-        std::move(m),
-        [base, views = std::move(views), lock = std::move(lock), this, timeout] (auto& pk, auto& slice, auto& m) mutable {
-            auto reader = this->make_reader(
-                base,
-                pk,
-                slice,
-                service::get_local_sstable_query_read_priority());
-            return this->generate_and_propagate_view_updates(base, std::move(views), std::move(m), std::move(reader), timeout).then([lock = std::move(lock)] () mutable {
-                // return the local partition/row lock we have taken so it
-                // remains locked until the caller is done modifying this
-                // partition/row and destroys the lock object.
-                return std::move(lock);
-            });
-      });
+    return db::view::generate_view_updates(
+            base,
+            std::move(views),
+            flat_mutation_reader_from_mutations({std::move(m)}),
+            std::move(existings)).then([this, base_token = std::move(base_token)] (std::vector<frozen_mutation_and_schema>&& updates) mutable {
+        auto units = seastar::consume_units(*_config.view_update_concurrency_semaphore, memory_usage_of(updates));
+        db::view::mutate_MV(std::move(base_token), std::move(updates), _view_stats, std::move(units)).handle_exception([] (auto ignored) { });
    });
 }

@@ -4606,8 +4601,17 @@ future<> table::populate_views(
            schema,
            std::move(views),
            std::move(reader),
-            { }).then([base_token = std::move(base_token), this] (auto&& updates) {
-        return db::view::mutate_MV(std::move(base_token), std::move(updates), _view_stats);
+            { }).then([base_token = std::move(base_token), this] (std::vector<frozen_mutation_and_schema>&& updates) mutable {
+        size_t update_size = memory_usage_of(updates);
+        size_t units_to_wait_for = std::min(_config.view_update_concurrency_semaphore_limit, update_size);
+        return seastar::get_units(*_config.view_update_concurrency_semaphore, units_to_wait_for).then(
+                [base_token = std::move(base_token),
+                 updates = std::move(updates),
+                 units_to_consume = update_size - units_to_wait_for,
+                 this] (db::timeout_semaphore_units&& units) mutable {
+            units.adopt(seastar::consume_units(*_config.view_update_concurrency_semaphore, units_to_consume));
+            return db::view::mutate_MV(std::move(base_token), std::move(updates), _view_stats, std::move(units));
+        });
    });
 }

--- a/database.hh
+++ b/database.hh
@@ -77,6 +77,7 @@
 #include <seastar/core/metrics_registration.hh>
 #include "tracing/trace_state.hh"
 #include "db/view/view.hh"
+#include "db/view/view_update_backlog.hh"
 #include "db/view/row_locking.hh"
 #include "lister.hh"
 #include "utils/phased_barrier.hh"
@@ -279,6 +280,9 @@ struct cf_stats {
    int64_t clustering_filter_fast_path_count = 0;
    // how many sstables survived the clustering key checks
    int64_t surviving_sstables_after_clustering_filter = 0;
+
+    // How many view updates were dropped due to overload.
+    int64_t dropped_view_updates = 0;
 };

 class cache_temperature {
@@ -298,6 +302,8 @@ public:
 class table;
 using column_family = table;

+class database_sstable_write_monitor;
+
 class table : public enable_lw_shared_from_this<table> {
 public:
    struct config {
@@ -323,6 +329,7 @@ public:
        bool enable_metrics_reporting = false;
        db::large_partition_handler* large_partition_handler;
        db::timeout_semaphore* view_update_concurrency_semaphore;
+        size_t view_update_concurrency_semaphore_limit;
    };
    struct no_commitlog {};
    struct stats {
@@ -395,7 +402,7 @@ private:
    // plan memtables and the resulting sstables are not made visible until
    // the streaming is complete.
    struct monitored_sstable {
-        std::unique_ptr<sstables::write_monitor> monitor;
+        std::unique_ptr<database_sstable_write_monitor> monitor;
        sstables::shared_sstable sstable;
    };

@@ -432,8 +439,15 @@ private:
    // but for correct compaction we need to start the compaction only after
    // reading all sstables.
    std::unordered_map<uint64_t, sstables::shared_sstable> _sstables_need_rewrite;
+    // sstables that should not be compacted (e.g. because they need to be used
+    // to generate view updates later)
+    std::unordered_map<uint64_t, sstables::shared_sstable> _sstables_staging;
    // Control background fibers waiting for sstables to be deleted
    seastar::gate _sstable_deletion_gate;
+    // This semaphore ensures that an operation like snapshot won't have its selected
+    // sstables deleted by compaction in parallel, a race condition which could
+    // easily result in failure.
+    seastar::semaphore _sstable_deletion_sem = {1};
    // There are situations in which we need to stop writing sstables. Flushers will take
    // the read lock, and the ones that wish to stop that process will take the write lock.
    rwlock _sstables_lock;
@@ -485,6 +499,11 @@ private:
    utils::phased_barrier _pending_reads_phaser;
 public:
    future<> add_sstable_and_update_cache(sstables::shared_sstable sst);
+    void move_sstable_from_staging_in_thread(sstables::shared_sstable sst);
+    sstables::shared_sstable get_staging_sstable(uint64_t generation) {
+        auto it = _sstables_staging.find(generation);
+        return it != _sstables_staging.end() ? it->second : nullptr;
+    }
 private:
    void update_stats_for_new_sstable(uint64_t disk_space_used_by_sstable, const std::vector<unsigned>& shards_for_the_sstable) noexcept;
    // Adds new sstable to the set of sstables
@@ -618,6 +637,14 @@ public:
            tracing::trace_state_ptr trace_state = nullptr,
            streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
            mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const;
+    flat_mutation_reader make_reader_excluding_sstable(schema_ptr schema,
+            sstables::shared_sstable sst,
+            const dht::partition_range& range,
+            const query::partition_slice& slice,
+            const io_priority_class& pc = default_priority_class(),
+            tracing::trace_state_ptr trace_state = nullptr,
+            streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
+            mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const;

    flat_mutation_reader make_reader(schema_ptr schema, const dht::partition_range& range = query::full_partition_range) const {
        auto& full_slice = schema->full_slice();
@@ -632,9 +659,13 @@ public:
    flat_mutation_reader make_streaming_reader(schema_ptr schema,
            const dht::partition_range_vector& ranges) const;

-    sstables::shared_sstable make_streaming_sstable_for_write();
+    sstables::shared_sstable make_streaming_sstable_for_write(std::optional<sstring> subdir = {});
+    sstables::shared_sstable make_streaming_staging_sstable() {
+        return make_streaming_sstable_for_write("staging");
+    }

    mutation_source as_mutation_source() const;
+    mutation_source as_mutation_source_excluding(sstables::shared_sstable sst) const;

    void set_virtual_reader(mutation_source virtual_reader) {
        _virtual_reader = std::move(virtual_reader);
@@ -842,6 +873,8 @@ public:
    void clear_views();
    const std::vector<view_ptr>& views() const;
    future<row_locker::lock_holder> push_view_replica_updates(const schema_ptr& s, const frozen_mutation& fm, db::timeout_clock::time_point timeout) const;
+    future<row_locker::lock_holder> push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout) const;
+    future<row_locker::lock_holder> stream_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, sstables::shared_sstable excluded_sstable) const;
    void add_coordinator_read_latency(utils::estimated_histogram::duration latency);
    std::chrono::milliseconds get_coordinator_read_latency_percentile(double percentile);

@@ -859,13 +892,17 @@ public:
            dht::token base_token,
            flat_mutation_reader&&);

+    reader_concurrency_semaphore& read_concurrency_semaphore() {
+        return *_config.read_concurrency_semaphore;
+    }
+
 private:
+    future<row_locker::lock_holder> do_push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, mutation_source&& source) const;
    std::vector<view_ptr> affected_views(const schema_ptr& base, const mutation& update) const;
    future<> generate_and_propagate_view_updates(const schema_ptr& base,
            std::vector<view_ptr>&& views,
            mutation&& m,
-            flat_mutation_reader_opt existings,
-            db::timeout_clock::time_point timeout) const;
+            flat_mutation_reader_opt existings) const;

    mutable row_locker _row_locker;
    future<row_locker::lock_holder> local_base_lock(
@@ -1055,6 +1092,7 @@ public:
        seastar::scheduling_group streaming_scheduling_group;
        bool enable_metrics_reporting = false;
        db::timeout_semaphore* view_update_concurrency_semaphore = nullptr;
+        size_t view_update_concurrency_semaphore_limit;
    };
 private:
    std::unique_ptr<locator::abstract_replication_strategy> _replication_strategy;
@@ -1156,6 +1194,7 @@ private:
    static const size_t max_count_system_concurrent_reads{10};
    size_t max_memory_system_concurrent_reads() { return _dbcfg.available_memory * 0.02; };
    static constexpr size_t max_concurrent_sstable_loads() { return 3; }
+    size_t max_memory_pending_view_updates() const { return _dbcfg.available_memory * 0.1; }

    struct db_stats {
        uint64_t total_writes = 0;
@@ -1192,7 +1231,7 @@ private:

    semaphore _sstable_load_concurrency_sem{max_concurrent_sstable_loads()};

-    db::timeout_semaphore _view_update_concurrency_sem{100}; // Stand-in hack for #2538
+    db::timeout_semaphore _view_update_concurrency_sem{max_memory_pending_view_updates()};

    cache_tracker _row_cache_tracker;

@@ -1399,6 +1438,12 @@ public:
    std::unordered_set<sstring> get_initial_tokens();
    std::experimental::optional<gms::inet_address> get_replace_address();
    bool is_replacing();
+    reader_concurrency_semaphore& user_read_concurrency_sem() {
+        return _read_concurrency_sem;
+    }
+    reader_concurrency_semaphore& streaming_read_concurrency_sem() {
+        return _streaming_concurrency_sem;
+    }
    reader_concurrency_semaphore& system_keyspace_read_concurrency_sem() {
        return _system_read_concurrency_sem;
    }
@@ -1423,11 +1468,17 @@ public:
        return _querier_cache;
    }

+    db::view::update_backlog get_view_update_backlog() const {
+        return {max_memory_pending_view_updates() - _view_update_concurrency_sem.current(), max_memory_pending_view_updates()};
+    }
+
    friend class distributed_loader;
 };

 future<> update_schema_version_and_announce(distributed<service::storage_proxy>& proxy);

+bool is_internal_keyspace(const sstring& name);
+
 class distributed_loader {
 public:
    static void reshard(distributed<database>& db, sstring ks_name, sstring cf_name);
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -1673,14 +1673,14 @@ const db::commitlog::config& db::commitlog::active_config() const {
 // No commit_io_check needed in the log reader since the database will fail
 // on error at startup if required
 future<std::unique_ptr<subscription<temporary_buffer<char>, db::replay_position>>>
-db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func next, position_type off, const db::extensions* exts) {
+db::commitlog::read_log_file(const sstring& filename, seastar::io_priority_class read_io_prio_class, commit_load_reader_func next, position_type off, const db::extensions* exts) {
    struct work {
    private:
-        file_input_stream_options make_file_input_stream_options() {
+        file_input_stream_options make_file_input_stream_options(seastar::io_priority_class read_io_prio_class) {
            file_input_stream_options fo;
            fo.buffer_size = db::commitlog::segment::default_size;
            fo.read_ahead = 10;
-            fo.io_priority_class = service::get_local_commitlog_priority();
+            fo.io_priority_class = read_io_prio_class;
            return fo;
        }
    public:
@@ -1699,8 +1699,8 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne
        bool header = true;
        bool failed = false;

-        work(file f, position_type o = 0)
-                : f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options())), start_off(o) {
+        work(file f, seastar::io_priority_class read_io_prio_class, position_type o = 0)
+                : f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options(read_io_prio_class))), start_off(o) {
        }
        work(work&&) = default;

@@ -1918,9 +1918,9 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne
        return fut;
    });

-    return fut.then([off, next](file f) {
+    return fut.then([off, next, read_io_prio_class] (file f) {
        f = make_checked_file(commit_error_handler, std::move(f));
-        auto w = make_lw_shared<work>(std::move(f), off);
+        auto w = make_lw_shared<work>(std::move(f), read_io_prio_class, off);
        auto ret = w->s.listen(next);

        w->s.started().then(std::bind(&work::read_file, w.get())).then([w] {
--- a/db/commitlog/commitlog.hh
+++ b/db/commitlog/commitlog.hh
@@ -355,7 +355,7 @@ public:
    };

    static future<std::unique_ptr<subscription<temporary_buffer<char>, replay_position>>> read_log_file(
-            const sstring&, commit_load_reader_func, position_type = 0, const db::extensions* = nullptr);
+            const sstring&, seastar::io_priority_class read_io_prio_class, commit_load_reader_func, position_type = 0, const db::extensions* = nullptr);
 private:
    commitlog(config);

--- a/db/commitlog/commitlog_entry.hh
+++ b/db/commitlog/commitlog_entry.hh
@@ -34,7 +34,8 @@ public:
    commitlog_entry(stdx::optional<column_mapping> mapping, frozen_mutation&& mutation)
        : _mapping(std::move(mapping)), _mutation(std::move(mutation)) { }
    const stdx::optional<column_mapping>& mapping() const { return _mapping; }
-    const frozen_mutation& mutation() const { return _mutation; }
+    const frozen_mutation& mutation() const & { return _mutation; }
+    frozen_mutation&& mutation() && { return std::move(_mutation); }
 };

 class commitlog_entry_writer {
@@ -80,5 +81,6 @@ public:
    commitlog_entry_reader(const temporary_buffer<char>& buffer);

    const stdx::optional<column_mapping>& get_column_mapping() const { return _ce.mapping(); }
-    const frozen_mutation& mutation() const { return _ce.mutation(); }
+    const frozen_mutation& mutation() const & { return _ce.mutation(); }
+    frozen_mutation&& mutation() && { return std::move(_ce).mutation(); }
 };
--- a/db/commitlog/commitlog_replayer.cc
+++ b/db/commitlog/commitlog_replayer.cc
@@ -58,6 +58,7 @@
 #include "converting_mutation_partition_applier.hh"
 #include "schema_registry.hh"
 #include "commitlog_entry.hh"
+#include "service/priority_manager.hh"

 static logging::logger rlogger("commitlog_replayer");

@@ -223,7 +224,7 @@ db::commitlog_replayer::impl::recover(sstring file, const sstring& fname_prefix)
    auto s = make_lw_shared<stats>();
    auto& exts = _qp.local().db().local().get_config().extensions();

-    return db::commitlog::read_log_file(file,
+    return db::commitlog::read_log_file(file, service::get_local_commitlog_priority(),
            std::bind(&impl::process, this, s.get(), std::placeholders::_1,
                    std::placeholders::_2), p, &exts).then([](auto s) {
        auto f = s->done();
--- a/db/config.hh
+++ b/db/config.hh
@@ -155,6 +155,9 @@ public:
    val(hints_directory, sstring, "/var/lib/scylla/hints", Used,   \
            "The directory where hints files are stored if hinted handoff is enabled."   \
    )                                           \
+    val(view_hints_directory, sstring, "/var/lib/scylla/view_hints", Used,   \
+            "The directory where materialized-view updates are stored while a view replica is unreachable."   \
+    )                                           \
    val(saved_caches_directory, sstring, "/var/lib/scylla/saved_caches", Unused, \
            "The directory location where table key and row caches are stored."  \
    )                                                   \
@@ -453,7 +456,7 @@ public:
            "The maximum number of tombstones a query can scan before aborting."  \
    )   \
    /* Network timeout settings */  \
-    val(range_request_timeout_in_ms, uint32_t, 10000, Unused,     \
+    val(range_request_timeout_in_ms, uint32_t, 10000, Used,     \
            "The time in milliseconds that the coordinator waits for sequential or index scans to complete."  \
    )   \
    val(read_request_timeout_in_ms, uint32_t, 5000, Used,     \
@@ -472,7 +475,7 @@ public:
            "The time in milliseconds that the coordinator waits for write operations to complete.\n"  \
            "Related information: About hinted handoff writes"  \
    )   \
-    val(request_timeout_in_ms, uint32_t, 10000, Unused,     \
+    val(request_timeout_in_ms, uint32_t, 10000, Used,     \
            "The default timeout for other, miscellaneous operations.\n"  \
            "Related information: About hinted handoff writes"  \
    )   \
@@ -578,7 +581,7 @@ public:
    val(dynamic_snitch_update_interval_in_ms, uint32_t, 100, Unused,     \
            "The time interval for how often the snitch calculates node scores. Because score calculation is CPU intensive, be careful when reducing this interval."  \
    )   \
-    val(hinted_handoff_enabled, sstring, "false", Used,     \
+    val(hinted_handoff_enabled, sstring, "true", Used,     \
            "Enable or disable hinted handoff. To enable per data center, add data center list. For example: hinted_handoff_enabled: DC1,DC2. A hint indicates that the write needs to be replayed to an unavailable node. " \
            "Related information: About hinted handoff writes"  \
    )   \
@@ -621,7 +624,7 @@ public:
    val(thrift_framed_transport_size_in_mb, uint32_t, 15, Unused,     \
            "Frame size (maximum field length) for Thrift. The frame is the row or part of the row the application is inserting."  \
    )   \
-    val(thrift_max_message_length_in_mb, uint32_t, 16, Unused,     \
+    val(thrift_max_message_length_in_mb, uint32_t, 16, Used,     \
            "The maximum length of a Thrift message in megabytes, including all fields and internal Thrift overhead (1 byte of overhead for each frame). Message length is usually used in conjunction with batches. A frame length greater than or equal to 24 accommodates a batch with four inserts, each of which is 24 bytes. The required message length is greater than or equal to 24+24+24+24+4 (number of frames)."  \
    )   \
    /* Security properties */   \
@@ -739,7 +742,7 @@ public:
        " Performance is affected to some extent as a result. Useful to help debugging problems that may arise at another layers.") \
    val(cpu_scheduler, bool, true, Used, "Enable cpu scheduling") \
    val(view_building, bool, true, Used, "Enable view building; should only be set to false when the node is experience issues due to view building") \
-    val(enable_sstables_mc_format, bool, false, Used, "Enable SSTables 'mc' format to be used as the default file format; FOR TESTING PURPOSES ONLY - TO BE REMOVED BEFORE RELEASE") \
+    val(enable_sstables_mc_format, bool, false, Used, "Enable SSTables 'mc' format to be used as the default file format") \
    /* done! */

 #define _make_value_member(name, type, deflt, status, desc, ...)    \
--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -35,6 +35,7 @@
 #include "disk-error-handler.hh"
 #include "lister.hh"
 #include "db/timeout_clock.hh"
+#include "service/priority_manager.hh"

 using namespace std::literals::chrono_literals;

@@ -95,6 +96,7 @@ future<> manager::start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr
        return compute_hints_dir_device_id();
    }).then([this] {
        _strorage_service_anchor->register_subscriber(this);
+        set_started();
    });
 }

@@ -105,7 +107,7 @@ future<> manager::stop() {
        _strorage_service_anchor->unregister_subscriber(this);
    }

-    _stopping = true;
+    set_stopping();

    return _draining_eps_gate.close().finally([this] {
        return parallel_for_each(_ep_managers, [] (auto& pair) {
@@ -277,7 +279,7 @@ inline bool manager::have_ep_manager(ep_key_type ep) const noexcept {
 }

 bool manager::store_hint(ep_key_type ep, schema_ptr s, lw_shared_ptr<const frozen_mutation> fm, tracing::trace_state_ptr tr_state) noexcept {
-    if (_stopping || !can_hint_for(ep)) {
+    if (stopping() || !started() || !can_hint_for(ep)) {
        manager_logger.trace("Can't store a hint to {}", ep);
        ++_stats.dropped;
        return false;
@@ -380,7 +382,7 @@ future<timespec> manager::end_point_hints_manager::sender::get_last_file_modific
    });
 }

-future<> manager::end_point_hints_manager::sender::do_send_one_mutation(mutation m, const std::vector<gms::inet_address>& natural_endpoints) noexcept {
+future<> manager::end_point_hints_manager::sender::do_send_one_mutation(frozen_mutation_and_schema m, const std::vector<gms::inet_address>& natural_endpoints) noexcept {
    return futurize_apply([this, m = std::move(m), &natural_endpoints] () mutable -> future<> {
        // The fact that we send with CL::ALL in both cases below ensures that new hints are not going
        // to be generated as a result of hints sending.
@@ -392,7 +394,8 @@ future<> manager::end_point_hints_manager::sender::do_send_one_mutation(mutation
            // FIXME: using 1h as infinite timeout. If a node is down, we should get an
            // unavailable exception.
            auto timeout = db::timeout_clock::now() + 1h;
-            return _proxy.mutate({std::move(m)}, consistency_level::ALL, timeout, nullptr);
+            //FIXME: Add required frozen_mutation overloads
+            return _proxy.mutate({m.fm.unfreeze(m.s)}, consistency_level::ALL, timeout, nullptr);
        }
    });
 }
@@ -418,21 +421,19 @@ bool manager::end_point_hints_manager::sender::can_send() noexcept {
    }
 }

-mutation manager::end_point_hints_manager::sender::get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf) {
+frozen_mutation_and_schema manager::end_point_hints_manager::sender::get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf) {
    hint_entry_reader hr(buf);
    auto& fm = hr.mutation();
    auto& cm = get_column_mapping(std::move(ctx_ptr), fm, hr);
-    auto& cf = _db.find_column_family(fm.column_family_id());
+    auto schema = _db.find_schema(fm.column_family_id());

-    if (cf.schema()->version() != fm.schema_version()) {
-        mutation m(cf.schema(), fm.decorated_key(*cf.schema()));
-        converting_mutation_partition_applier v(cm, *cf.schema(), m.partition());
+    if (schema->version() != fm.schema_version()) {
+        mutation m(schema, fm.decorated_key(*schema));
+        converting_mutation_partition_applier v(cm, *schema, m.partition());
        fm.partition().accept(cm, v);
-
-        return std::move(m);
-    } else {
-        return fm.unfreeze(cf.schema());
+        return {freeze(m), std::move(schema)};
    }
+    return {std::move(hr).mutation(), std::move(schema)};
 }

 const column_mapping& manager::end_point_hints_manager::sender::get_column_mapping(lw_shared_ptr<send_one_file_ctx> ctx_ptr, const frozen_mutation& fm, const hint_entry_reader& hr) {
@@ -502,7 +503,7 @@ bool manager::check_dc_for(ep_key_type ep) const noexcept {
 }

 void manager::drain_for(gms::inet_address endpoint) {
-    if (_stopping) {
+    if (stopping()) {
        return;
    }

@@ -543,6 +544,7 @@ manager::end_point_hints_manager::sender::sender(end_point_hints_manager& parent
    , _resource_manager(_shard_manager._resource_manager)
    , _proxy(local_storage_proxy)
    , _db(local_db)
+    , _hints_cpu_sched_group(_db.get_streaming_scheduling_group())
    , _gossiper(local_gossiper)
    , _file_update_mutex(_ep_manager.file_update_mutex())
 {}
@@ -555,6 +557,7 @@ manager::end_point_hints_manager::sender::sender(const sender& other, end_point_
    , _resource_manager(_shard_manager._resource_manager)
    , _proxy(other._proxy)
    , _db(other._db)
+    , _hints_cpu_sched_group(other._hints_cpu_sched_group)
    , _gossiper(other._gossiper)
    , _file_update_mutex(_ep_manager.file_update_mutex())
 {}
@@ -610,7 +613,10 @@ manager::end_point_hints_manager::sender::clock::duration manager::end_point_hin
 }

 void manager::end_point_hints_manager::sender::start() {
-    _stopped = seastar::async([this] {
+    seastar::thread_attributes attr;
+
+    attr.sched_group = _hints_cpu_sched_group;
+    _stopped = seastar::async(std::move(attr), [this] {
        manager_logger.trace("ep_manager({})::sender: started", end_point_key());
        while (!stopping()) {
            try {
@@ -630,10 +636,11 @@ void manager::end_point_hints_manager::sender::start() {
    });
 }

-future<> manager::end_point_hints_manager::sender::send_one_mutation(mutation m) {
-    keyspace& ks = _db.find_keyspace(m.schema()->ks_name());
+future<> manager::end_point_hints_manager::sender::send_one_mutation(frozen_mutation_and_schema m) {
+    keyspace& ks = _db.find_keyspace(m.s->ks_name());
    auto& rs = ks.get_replication_strategy();
-    std::vector<gms::inet_address> natural_endpoints = rs.get_natural_endpoints(m.token());
+    auto token = dht::global_partitioner().get_token(*m.s, m.fm.key(*m.s));
+    std::vector<gms::inet_address> natural_endpoints = rs.get_natural_endpoints(std::move(token));

    return do_send_one_mutation(std::move(m), natural_endpoints);
 }
@@ -651,8 +658,8 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
                    return make_ready_future<>();
                }

-                mutation m = this->get_mutation(ctx_ptr, buf);
-                gc_clock::duration gc_grace_sec = m.schema()->gc_grace_seconds();
+                auto m = this->get_mutation(ctx_ptr, buf);
+                gc_clock::duration gc_grace_sec = m.s->gc_grace_seconds();

                // The hint is too old - drop it.
                //
@@ -693,7 +700,7 @@ bool manager::end_point_hints_manager::sender::send_one_file(const sstring& fnam
    lw_shared_ptr<send_one_file_ctx> ctx_ptr = make_lw_shared<send_one_file_ctx>();

    try {
-        auto s = commitlog::read_log_file(fname, [this, secs_since_file_mod, &fname, ctx_ptr] (temporary_buffer<char> buf, db::replay_position rp) mutable {
+        auto s = commitlog::read_log_file(fname, service::get_local_streaming_read_priority(), [this, secs_since_file_mod, &fname, ctx_ptr] (temporary_buffer<char> buf, db::replay_position rp) mutable {
            // Check that we can still send the next hint. Don't try to send it if the destination host
            // is DOWN or if we have already failed to send some of the previous hints.
            if (!draining() && ctx_ptr->state.contains(send_state::segment_replay_failed)) {
@@ -759,7 +766,7 @@ void manager::end_point_hints_manager::sender::send_hints_maybe() noexcept {
    int replayed_segments_count = 0;

    try {
-        while (have_segments()) {
+        while (replay_allowed() && have_segments()) {
            if (!send_one_file(*_segments_to_replay.begin())) {
                break;
            }
@@ -784,14 +791,24 @@ void manager::end_point_hints_manager::sender::send_hints_maybe() noexcept {
    manager_logger.trace("send_hints(): we handled {} segments", replayed_segments_count);
 }

+template<typename Func>
+static future<> scan_for_hints_dirs(const sstring& hints_directory, Func&& f) {
+    return lister::scan_dir(hints_directory, { directory_entry_type::directory }, [f = std::forward<Func>(f)] (lister::path dir, directory_entry de) {
+        try {
+            return f(std::move(dir), std::move(de), std::stoi(de.name.c_str()));
+        } catch (std::invalid_argument& ex) {
+            manager_logger.debug("Ignore invalid directory {}", de.name);
+            return make_ready_future<>();
+        }
+    });
+}
+
 // runs in seastar::async context
 manager::hints_segments_map manager::get_current_hints_segments(const sstring& hints_directory) {
    hints_segments_map current_hints_segments;

    // shards level
-    lister::scan_dir(hints_directory, { directory_entry_type::directory }, [&current_hints_segments] (lister::path dir, directory_entry de) {
-        unsigned shard_id = std::stoi(de.name.c_str());
-
+    scan_for_hints_dirs(hints_directory, [&current_hints_segments] (lister::path dir, directory_entry de, unsigned shard_id) {
        manager_logger.trace("shard_id = {}", shard_id);
        // IPs level
        return lister::scan_dir(dir / de.name.c_str(), { directory_entry_type::directory }, [&current_hints_segments, shard_id] (lister::path dir, directory_entry de) {
@@ -908,9 +925,7 @@ void manager::rebalance_segments_for(
 // runs in seastar::async context
 void manager::remove_irrelevant_shards_directories(const sstring& hints_directory) {
    // shards level
-    lister::scan_dir(hints_directory, { directory_entry_type::directory }, [] (lister::path dir, directory_entry de) {
-        unsigned shard_id = std::stoi(de.name.c_str());
-
+    scan_for_hints_dirs(hints_directory, [] (lister::path dir, directory_entry de, unsigned shard_id) {
        if (shard_id >= smp::count) {
            // IPs level
            return lister::scan_dir(dir / de.name.c_str(), { directory_entry_type::directory, directory_entry_type::regular }, lister::show_hidden::yes, [] (lister::path dir, directory_entry de) {
@@ -936,5 +951,15 @@ future<> manager::rebalance(sstring hints_directory) {
    });
 }

+void manager::update_backlog(size_t backlog, size_t max_backlog) {
+    _backlog_size = backlog;
+    _max_backlog_size = max_backlog;
+    if (backlog < max_backlog) {
+        allow_hints();
+    } else {
+        forbid_hints_for_eps_with_pending_hints();
+    }
+}
+
 }
 }
--- a/db/hints/manager.hh
+++ b/db/hints/manager.hh
@@ -69,6 +69,8 @@ private:
    class drain_tag {};
    using drain = seastar::bool_class<drain_tag>;

+    friend class space_watchdog;
+
 public:
    class end_point_hints_manager {
    public:
@@ -119,6 +121,7 @@ public:
            resource_manager& _resource_manager;
            service::storage_proxy& _proxy;
            database& _db;
+            seastar::scheduling_group _hints_cpu_sched_group;
            gms::gossiper& _gossiper;
            seastar::shared_mutex& _file_update_mutex;

@@ -179,6 +182,10 @@ public:
                return _state.contains(state::stopping);
            }

+            bool replay_allowed() const noexcept {
+                return _ep_manager.replay_allowed();
+            }
+
            /// \brief Try to send one hint read from the file.
            ///  - Limit the maximum memory size of hints "in the air" and the maximum total number of hints "in the air".
            ///  - Discard the hints that are older than the grace seconds value of the corresponding table.
@@ -210,7 +217,7 @@ public:
            /// \param ctx_ptr pointer to the send context
            /// \param buf hints file entry
            /// \return The mutation object representing the original mutation stored in the hints file.
-            mutation get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf);
+            frozen_mutation_and_schema get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf);

            /// \brief Get a reference to the column_mapping object for a given frozen mutation.
            /// \param ctx_ptr pointer to the send context
@@ -227,13 +234,13 @@ public:
            /// \param m mutation to send
            /// \param natural_endpoints current replicas for the given mutation
            /// \return future that resolves when the operation is complete
-            future<> do_send_one_mutation(mutation m, const std::vector<gms::inet_address>& natural_endpoints) noexcept;
+            future<> do_send_one_mutation(frozen_mutation_and_schema m, const std::vector<gms::inet_address>& natural_endpoints) noexcept;

            /// \brief Send one mutation out.
            ///
            /// \param m mutation to send
            /// \return future that resolves when the mutation sending processing is complete.
-            future<> send_one_mutation(mutation m);
+            future<> send_one_mutation(frozen_mutation_and_schema m);

            /// \brief Get the last modification time stamp for a given file.
            /// \param fname File name
@@ -328,6 +335,10 @@ public:
            return _hints_in_progress;
        }

+        bool replay_allowed() const noexcept {
+            return _shard_manager.replay_allowed();
+        }
+
        bool can_hint() const noexcept {
            return _state.contains(state::can_hint);
        }
@@ -393,6 +404,17 @@ public:
        }
    };

+    enum class state {
+        started,                // hinting is currently allowed (start() call is complete)
+        replay_allowed,         // replaying (hints sending) is allowed
+        stopping                // hinting is not allowed - stopping is in progress (stop() method has been called)
+    };
+
+    using state_set = enum_set<super_enum<state,
+        state::started,
+        state::replay_allowed,
+        state::stopping>>;
+
 private:
    using ep_key_type = typename end_point_hints_manager::key_type;
    using ep_managers_map_type = std::unordered_map<ep_key_type, end_point_hints_manager>;
@@ -403,6 +425,7 @@ public:
    static const std::chrono::seconds hint_file_write_timeout;

 private:
+    state_set _state;
    const boost::filesystem::path _hints_dir;
    dev_t _hints_dir_device_id = 0;

@@ -414,7 +437,7 @@ private:
    locator::snitch_ptr& _local_snitch_ptr;
    int64_t _max_hint_window_us = 0;
    database& _local_db;
-    bool _stopping = false;
+
    seastar::gate _draining_eps_gate; // gate used to control the progress of ep_managers stopping not in the context of manager::stop() call

    resource_manager& _resource_manager;
@@ -424,9 +447,14 @@ private:
    seastar::metrics::metric_groups _metrics;
    std::unordered_set<ep_key_type> _eps_with_pending_hints;

+    size_t _max_backlog_size = 1;
+    size_t _backlog_size = 0;
+
 public:
    manager(sstring hints_directory, std::vector<sstring> hinted_dcs, int64_t max_hint_window_ms, resource_manager&res_manager, distributed<database>& db);
    virtual ~manager();
+    manager(manager&&) = delete;
+    manager& operator=(manager&&) = delete;
    void register_metrics(const sstring& group_name);
    future<> start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
    future<> stop();
@@ -503,6 +531,18 @@ public:
    void forbid_hints();
    void forbid_hints_for_eps_with_pending_hints();

+    size_t max_backlog_size() const {
+        return _max_backlog_size;
+    }
+
+    size_t backlog_size() const {
+        return _backlog_size;
+    }
+
+    void allow_replaying() noexcept {
+        _state.set(state::replay_allowed);
+    }
+
    /// \brief Rebalance hints segments among all present shards.
    ///
    /// The difference between the number of segments on every two shard will be not greater than 1 after the
@@ -616,6 +656,28 @@ private:
    /// \param endpoint node that left the cluster
    void drain_for(gms::inet_address endpoint);

+    void update_backlog(size_t backlog, size_t max_backlog);
+
+    bool stopping() const noexcept {
+        return _state.contains(state::stopping);
+    }
+
+    void set_stopping() noexcept {
+        _state.set(state::stopping);
+    }
+
+    bool started() const noexcept {
+        return _state.contains(state::started);
+    }
+
+    void set_started() noexcept {
+        _state.set(state::started);
+    }
+
+    bool replay_allowed() const noexcept {
+        return _state.contains(state::replay_allowed);
+    }
+
 public:
    ep_managers_map_type::iterator find_ep_manager(ep_key_type ep_key) noexcept {
        return _ep_managers.find(ep_key);
--- a/db/hints/resource_manager.cc
+++ b/db/hints/resource_manager.cc
@@ -27,6 +27,7 @@
 #include "lister.hh"
 #include "disk-error-handler.hh"
 #include "seastarx.hh"
+#include <seastar/core/sleep.hh>

 namespace db {
 namespace hints {
@@ -65,19 +66,28 @@ const std::chrono::seconds space_watchdog::_watchdog_period = std::chrono::secon
 space_watchdog::space_watchdog(shard_managers_set& managers, per_device_limits_map& per_device_limits_map)
    : _shard_managers(managers)
    , _per_device_limits_map(per_device_limits_map)
-    , _timer([this] { on_timer(); })
 {}

 void space_watchdog::start() {
-    _timer.arm(timer_clock_type::now());
+    _started = seastar::async([this] {
+        while (!_as.abort_requested()) {
+            try {
+                on_timer();
+            } catch (...) {
+                resource_manager_logger.trace("space_watchdog: unexpected exception - stop all hints generators");
+                // Stop all hint generators if space_watchdog callback failed
+                for (manager& shard_manager : _shard_managers) {
+                    shard_manager.forbid_hints();
+                }
+            }
+            seastar::sleep_abortable(_watchdog_period, _as).get();
+        }
+    }).handle_exception_type([] (const seastar::sleep_aborted& ignored) { });
 }

 future<> space_watchdog::stop() noexcept {
-    try {
-        return _gate.close().finally([this] { _timer.cancel(); });
-    } catch (...) {
-        return make_exception_future<>(std::current_exception());
-    }
+    _as.request_abort();
+    return std::move(_started);
 }

 future<> space_watchdog::scan_one_ep_dir(boost::filesystem::path path, manager& shard_manager, ep_key_type ep_key) {
@@ -94,83 +104,62 @@ future<> space_watchdog::scan_one_ep_dir(boost::filesystem::path path, manager&
    });
 }

+// Called from the context of a seastar::thread.
 void space_watchdog::on_timer() {
-    with_gate(_gate, [this] {
-        return futurize_apply([this] {
-            _total_size = 0;
+    // The hints directories are organized as follows:
+    // <hints root>
+    //    |- <shard1 ID>
+    //    |  |- <EP1 address>
+    //    |     |- <hints file1>
+    //    |     |- <hints file2>
+    //    |     |- ...
+    //    |  |- <EP2 address>
+    //    |     |- ...
+    //    |  |-...
+    //    |- <shard2 ID>
+    //    |  |- ...
+    //    ...
+    //    |- <shardN ID>
+    //    |  |- ...
+    //

-            return do_for_each(_shard_managers, [this] (manager& shard_manager) {
-                shard_manager.clear_eps_with_pending_hints();
-
-                // The hints directories are organized as follows:
-                // <hints root>
-                //    |- <shard1 ID>
-                //    |  |- <EP1 address>
-                //    |     |- <hints file1>
-                //    |     |- <hints file2>
-                //    |     |- ...
-                //    |  |- <EP2 address>
-                //    |     |- ...
-                //    |  |-...
-                //    |- <shard2 ID>
-                //    |  |- ...
-                //    ...
-                //    |- <shardN ID>
-                //    |  |- ...
+    for (auto& per_device_limits : _per_device_limits_map | boost::adaptors::map_values) {
+        _total_size = 0;
+        for (manager& shard_manager : per_device_limits.managers) {
+            shard_manager.clear_eps_with_pending_hints();
+            lister::scan_dir(shard_manager.hints_dir(), {directory_entry_type::directory}, [this, &shard_manager] (lister::path dir, directory_entry de) {
+                _files_count = 0;
+                // Let's scan per-end-point directories and enumerate hints files...
                //
-                return lister::scan_dir(shard_manager.hints_dir(), {directory_entry_type::directory}, [this, &shard_manager] (lister::path dir, directory_entry de) {
-                    _files_count = 0;
-                    // Let's scan per-end-point directories and enumerate hints files...
-                    //
-                    // Let's check if there is a corresponding end point manager (may not exist if the corresponding DC is
-                    // not hintable).
-                    // If exists - let's take a file update lock so that files are not changed under our feet. Otherwise, simply
-                    // continue to enumeration - there is no one to change them.
-                    auto it = shard_manager.find_ep_manager(de.name);
-                    if (it != shard_manager.ep_managers_end()) {
-                        return with_lock(it->second.file_update_mutex(), [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)]() mutable {
-                             return scan_one_ep_dir(dir / ep_name.c_str(), shard_manager, ep_key_type(ep_name));
-                        });
-                    } else {
-                        return scan_one_ep_dir(dir / de.name.c_str(), shard_manager, ep_key_type(de.name));
-                    }
-                });
-            }).then([this] {
-                return do_for_each(_per_device_limits_map, [this](per_device_limits_map::value_type& per_device_limits_entry) {
-                    space_watchdog::per_device_limits& per_device_limits = per_device_limits_entry.second;
-
-                    size_t adjusted_quota = 0;
-                    size_t delta = boost::accumulate(per_device_limits.managers, 0, [] (size_t sum, manager& shard_manager) {
-                        return sum + shard_manager.ep_managers_size() * resource_manager::hint_segment_size_in_mb * 1024 * 1024;
+                // Let's check if there is a corresponding end point manager (may not exist if the corresponding DC is
+                // not hintable).
+                // If exists - let's take a file update lock so that files are not changed under our feet. Otherwise, simply
+                // continue to enumeration - there is no one to change them.
+                auto it = shard_manager.find_ep_manager(de.name);
+                if (it != shard_manager.ep_managers_end()) {
+                    return with_lock(it->second.file_update_mutex(), [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)]() mutable {
+                        return scan_one_ep_dir(dir / ep_name.c_str(), shard_manager, ep_key_type(ep_name));
                    });
-                    if (per_device_limits.max_shard_disk_space_size > delta) {
-                        adjusted_quota = per_device_limits.max_shard_disk_space_size - delta;
-                    }
+                } else {
+                    return scan_one_ep_dir(dir / de.name.c_str(), shard_manager, ep_key_type(de.name));
+                }
+            }).get();
+        }

-                    bool can_hint = _total_size < adjusted_quota;
-                    resource_manager_logger.trace("space_watchdog: total_size ({}) {} max_shard_disk_space_size ({})", _total_size, can_hint ? "<" : ">=", adjusted_quota);
-
-                    if (!can_hint) {
-                        for (manager& shard_manager : per_device_limits.managers) {
-                            shard_manager.forbid_hints_for_eps_with_pending_hints();
-                        }
-                    } else {
-                        for (manager& shard_manager : per_device_limits.managers) {
-                            shard_manager.allow_hints();
-                        }
-    }
-                });
-            });
-        }).handle_exception([this] (auto eptr) {
-            resource_manager_logger.trace("space_watchdog: unexpected exception - stop all hints generators");
-            // Stop all hint generators if space_watchdog callback failed
-            for (manager& shard_manager : _shard_managers) {
-                shard_manager.forbid_hints();
-            }
-        }).finally([this] {
-            _timer.arm(_watchdog_period);
+        // Adjust the quota to take into account the space we guarantee to every end point manager
+        size_t adjusted_quota = 0;
+        size_t delta = boost::accumulate(per_device_limits.managers, 0, [] (size_t sum, manager& shard_manager) {
+            return sum + shard_manager.ep_managers_size() * resource_manager::hint_segment_size_in_mb * 1024 * 1024;
        });
-    });
+        if (per_device_limits.max_shard_disk_space_size > delta) {
+            adjusted_quota = per_device_limits.max_shard_disk_space_size - delta;
+        }
+
+        resource_manager_logger.trace("space_watchdog: consuming {}/{} bytes", _total_size, adjusted_quota);
+        for (manager& shard_manager : per_device_limits.managers) {
+            shard_manager.update_backlog(_total_size, adjusted_quota);
+        }
+    }
 }

 future<> resource_manager::start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr) {
@@ -183,6 +172,10 @@ future<> resource_manager::start(shared_ptr<service::storage_proxy> proxy_ptr, s
    });
 }

+void resource_manager::allow_replaying() noexcept {
+    boost::for_each(_shard_managers, [] (manager& m) { m.allow_replaying(); });
+}
+
 future<> resource_manager::stop() noexcept {
    return parallel_for_each(_shard_managers, [](manager& m) {
        return m.stop();
@@ -201,14 +194,18 @@ future<> resource_manager::prepare_per_device_limits() {
        auto it = _per_device_limits_map.find(device_id);
        if (it == _per_device_limits_map.end()) {
            return is_mountpoint(shard_manager.hints_dir().parent_path()).then([this, device_id, &shard_manager](bool is_mountpoint) {
-                // By default, give each group of managers 10% of the available disk space. Give each shard an equal share of the available space.
-                size_t max_size = boost::filesystem::space(shard_manager.hints_dir().c_str()).capacity / (10 * smp::count);
-                // If hints directory is a mountpoint, we assume it's on dedicated (i.e. not shared with data/commitlog/etc) storage.
-                // Then, reserve 90% of all space instead of 10% above.
-                if (is_mountpoint) {
-                    max_size *= 9;
+                auto [it, inserted] = _per_device_limits_map.emplace(device_id, space_watchdog::per_device_limits{});
+                // Since we possibly deferred, we need to recheck the _per_device_limits_map.
+                if (inserted) {
+                    // By default, give each group of managers 10% of the available disk space. Give each shard an equal share of the available space.
+                    it->second.max_shard_disk_space_size = boost::filesystem::space(shard_manager.hints_dir().c_str()).capacity / (10 * smp::count);
+                    // If hints directory is a mountpoint, we assume it's on dedicated (i.e. not shared with data/commitlog/etc) storage.
+                    // Then, reserve 90% of all space instead of 10% above.
+                    if (is_mountpoint) {
+                        it->second.max_shard_disk_space_size *= 9;
+                    }
                }
-                _per_device_limits_map.emplace(device_id, space_watchdog::per_device_limits{{std::ref(shard_manager)}, max_size});
+                it->second.managers.emplace_back(std::ref(shard_manager));
            });
        } else {
            it->second.managers.emplace_back(std::ref(shard_manager));
--- a/db/hints/resource_manager.hh
+++ b/db/hints/resource_manager.hh
@@ -22,6 +22,7 @@
 #pragma once

 #include <cstdint>
+#include <seastar/core/abort_source.hh>
 #include <seastar/core/semaphore.hh>
 #include <seastar/core/gate.hh>
 #include <seastar/core/memory.hh>
@@ -78,8 +79,8 @@ private:
    shard_managers_set& _shard_managers;
    per_device_limits_map& _per_device_limits_map;

-    seastar::gate _gate;
-    seastar::timer<timer_clock_type> _timer;
+    future<> _started = make_ready_future<>();
+    seastar::abort_source _as;
    int _files_count = 0;

 public:
@@ -137,6 +138,9 @@ public:
        , _space_watchdog(_shard_managers, _per_device_limits_map)
    {}

+    resource_manager(resource_manager&&) = delete;
+    resource_manager& operator=(resource_manager&&) = delete;
+
    future<semaphore_units<semaphore_default_exception_factory>> get_send_units_for(size_t buf_size);

    bool too_many_hints_in_progress() const {
@@ -156,6 +160,7 @@ public:
    }

    future<> start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
+    void allow_replaying() noexcept;
    future<> stop() noexcept;
    void register_manager(manager& m);
    future<> prepare_per_device_limits();
--- a/db/system_distributed_keyspace.cc
+++ b/db/system_distributed_keyspace.cc
@@ -87,7 +87,7 @@ future<> system_distributed_keyspace::start() {
        return do_with(all_tables(), [this] (std::vector<schema_ptr>& tables) {
            return do_for_each(tables, [this] (schema_ptr table) {
                return ignore_existing([this, table = std::move(table)] {
-                    return _mm.announce_new_column_family(std::move(table), false);
+                    return _mm.announce_new_column_family(std::move(table), api::min_timestamp, false);
                });
            });
        });
--- a/db/timeout_clock.hh
+++ b/db/timeout_clock.hh
@@ -28,5 +28,6 @@
 namespace db {
 using timeout_clock = seastar::lowres_clock;
 using timeout_semaphore = seastar::basic_semaphore<seastar::default_timeout_exception_factory, timeout_clock>;
+using timeout_semaphore_units = seastar::semaphore_units<seastar::default_timeout_exception_factory, timeout_clock>;
 static constexpr timeout_clock::time_point no_timeout = timeout_clock::time_point::max();
 }
--- a/db/view/node_view_update_backlog.hh
+++ b/db/view/node_view_update_backlog.hh
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2018 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "db/view/view_update_backlog.hh"
+
+#include <seastar/core/cacheline.hh>
+#include <seastar/core/lowres_clock.hh>
+
+#include <atomic>
+#include <chrono>
+#include <new>
+
+namespace db::view {
+
+/**
+ * An atomic view update backlog representation, safe to update from multiple shards.
+ * It is legal for a stale current max value to be returned.
+ */
+class node_update_backlog {
+    using clock = seastar::lowres_clock;
+    struct per_shard_backlog {
+        // Multiply by 2 to defeat the prefetcher
+        alignas(seastar::cache_line_size * 2) std::atomic<update_backlog> backlog = update_backlog::no_backlog();
+
+        update_backlog load() const {
+            return backlog.load(std::memory_order_relaxed);
+        }
+    };
+    std::vector<per_shard_backlog> _backlogs;
+    std::chrono::milliseconds _interval;
+    std::atomic<clock::time_point> _last_update;
+    std::atomic<update_backlog> _max;
+
+public:
+    explicit node_update_backlog(size_t shards, std::chrono::milliseconds interval)
+            : _backlogs(shards)
+            , _interval(interval)
+            , _last_update(clock::now() - _interval)
+            , _max(update_backlog::no_backlog()) {
+    }
+
+    update_backlog add_fetch(unsigned shard, update_backlog backlog);
+
+    // Exposed for testing only.
+    update_backlog load() const {
+        return _max.load(std::memory_order_relaxed);
+    }
+};
+
+}
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -58,6 +58,7 @@
 #include "cql3/util.hh"
 #include "db/view/view.hh"
 #include "db/view/view_builder.hh"
+#include "frozen_mutation.hh"
 #include "gms/inet_address.hh"
 #include "keys.hh"
 #include "locator/network_topology_strategy.hh"
@@ -226,10 +227,11 @@ public:
            , _updates(8, partition_key::hashing(*_view), partition_key::equality(*_view)) {
    }

-    void move_to(std::vector<mutation>& mutations) && {
+    void move_to(std::vector<frozen_mutation_and_schema>& mutations) && {
        auto& partitioner = dht::global_partitioner();
        std::transform(_updates.begin(), _updates.end(), std::back_inserter(mutations), [&, this] (auto&& m) {
-            return mutation(_view, partitioner.decorate_key(*_view, std::move(m.first)), std::move(m.second));
+            auto mut = mutation(_view, partitioner.decorate_key(*_view, std::move(m.first)), std::move(m.second));
+            return frozen_mutation_and_schema{freeze(mut), std::move(_view)};
        });
    }

@@ -627,7 +629,7 @@ public:
            , _now(gc_clock::now()) {
    }

-    future<std::vector<mutation>> build();
+    future<std::vector<frozen_mutation_and_schema>> build();

 private:
    void generate_update(clustering_row&& update, stdx::optional<clustering_row>&& existing);
@@ -664,7 +666,7 @@ private:
    }
 };

-future<std::vector<mutation>> view_update_builder::build() {
+future<std::vector<frozen_mutation_and_schema>> view_update_builder::build() {
    return advance_all().then([this] (auto&& ignored) {
        assert(_update && _update->is_partition_start());
        _key = std::move(std::move(_update)->as_partition_start().key().key());
@@ -679,7 +681,7 @@ future<std::vector<mutation>> view_update_builder::build() {
            });
        });
    }).then([this] {
-        std::vector<mutation> mutations;
+        std::vector<frozen_mutation_and_schema> mutations;
        for (auto&& update : _view_updates) {
            std::move(update).move_to(mutations);
        }
@@ -787,7 +789,7 @@ future<stop_iteration> view_update_builder::on_results() {
    return stop();
 }

-future<std::vector<mutation>> generate_view_updates(
+future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
        const schema_ptr& base,
        std::vector<view_ptr>&& views_to_update,
        flat_mutation_reader&& updates,
@@ -924,16 +926,35 @@ get_view_natural_endpoint(const sstring& keyspace_name,
 // to a modification of a single base partition, and apply them to the
 // appropriate paired replicas. This is done asynchronously - we do not wait
 // for the writes to complete.
-// FIXME: I dropped a lot of parameters the Cassandra version had,
-// we may need them back: writeCommitLog, baseComplete, queryStartNanoTime.
-future<> mutate_MV(const dht::token& base_token, std::vector<mutation> mutations, db::view::stats& stats)
+future<> mutate_MV(
+        const dht::token& base_token,
+        std::vector<frozen_mutation_and_schema> view_updates,
+        db::view::stats& stats,
+        db::timeout_semaphore_units pending_view_updates)
 {
    auto fs = std::make_unique<std::vector<future<>>>();
-    for (auto& mut : mutations) {
-        auto view_token = mut.token();
-        auto keyspace_name = mut.schema()->ks_name();
+    fs->reserve(view_updates.size());
+    auto& partitioner = dht::global_partitioner();
+    for (frozen_mutation_and_schema& mut : view_updates) {
+        auto view_token = partitioner.get_token(*mut.s, mut.fm.key(*mut.s));
+        auto& keyspace_name = mut.s->ks_name();
        auto paired_endpoint = get_view_natural_endpoint(keyspace_name, base_token, view_token);
        auto pending_endpoints = service::get_local_storage_service().get_token_metadata().pending_endpoints_for(view_token, keyspace_name);
+        auto maybe_account_failure = [&stats, units = pending_view_updates.split(mut.fm.representation().size())] (
+                future<>&& f,
+                gms::inet_address target,
+                bool is_local,
+                size_t remotes) {
+            if (f.failed()) {
+                stats.view_updates_failed_local += is_local;
+                stats.view_updates_failed_remote += remotes;
+                auto ep = f.get_exception();
+                vlogger.error("Error applying view update to {}: {}", target, ep);
+                return make_exception_future<>(std::move(ep));
+            } else {
+                return make_ready_future<>();
+            }
+        };
        if (paired_endpoint) {
            // When paired endpoint is the local node, we can just apply
            // the mutation locally, unless there are pending endpoints, in
@@ -951,10 +972,16 @@ future<> mutate_MV(const dht::token& base_token, std::vector<mutation> mutations
                // do not wait for it to complete.
                // Note also that mutate_locally(mut) copies mut (in
                // frozen form) so don't need to increase its lifetime.
-                fs->push_back(service::get_local_storage_proxy().mutate_locally(mut).handle_exception([&stats] (auto ep) {
-                    vlogger.error("Error applying local view update: {}", ep);
-                    stats.view_updates_failed_local++;
-                    return make_exception_future<>(std::move(ep));
+                // send_to_endpoint() below updates statistics on pending
+                // writes but mutate_locally() doesn't, so we need to do that here.
+                ++stats.writes;
+                auto mut_ptr = std::make_unique<frozen_mutation>(std::move(mut.fm));
+                fs->push_back(service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr).then_wrapped(
+                        [&stats,
+                         maybe_account_failure = std::move(maybe_account_failure),
+                         mut_ptr = std::move(mut_ptr)] (future<>&& f) {
+                    --stats.writes;
+                    return maybe_account_failure(std::move(f), utils::fb_utilities::get_broadcast_address(), true, 0);
                }));
            } else {
                vlogger.debug("Sending view update to endpoint {}, with pending endpoints = {}", *paired_endpoint, pending_endpoints);
@@ -965,14 +992,17 @@ future<> mutate_MV(const dht::token& base_token, std::vector<mutation> mutations
                // to send the update there. Currently, we do this from *each* of
                // the base replicas, but this is probably excessive - see
                // See https://issues.apache.org/jira/browse/CASSANDRA-14262/
-                fs->push_back(service::get_local_storage_proxy().send_to_endpoint(std::move(mut), *paired_endpoint, std::move(pending_endpoints), db::write_type::VIEW, stats)
-                        .handle_exception([paired_endpoint, is_endpoint_local, updates_pushed_remote, &stats] (auto ep) {
-                            stats.view_updates_failed_local += is_endpoint_local;
-                            stats.view_updates_failed_remote += updates_pushed_remote;
-                            vlogger.error("Error applying view update to {}: {}", *paired_endpoint, ep);
-                            return make_exception_future<>(std::move(ep));
-                        })
-                );
+                fs->push_back(service::get_local_storage_proxy().send_to_endpoint(
+                        std::move(mut),
+                        *paired_endpoint,
+                        std::move(pending_endpoints),
+                        db::write_type::VIEW, stats).then_wrapped(
+                                [paired_endpoint,
+                                 is_endpoint_local,
+                                 updates_pushed_remote,
+                                 maybe_account_failure = std::move(maybe_account_failure)] (future<>&& f) mutable {
+                    return maybe_account_failure(std::move(f), std::move(*paired_endpoint), is_endpoint_local, updates_pushed_remote);
+                }));
            }
        } else if (!pending_endpoints.empty()) {
            // If there is no paired endpoint, it means there's a range movement going on (decommission or move),
@@ -992,10 +1022,11 @@ future<> mutate_MV(const dht::token& base_token, std::vector<mutation> mutations
                    std::move(mut),
                    target,
                    std::move(pending_endpoints),
-                    db::write_type::VIEW).handle_exception([target, updates_pushed_remote, &stats] (auto ep) {
-                stats.view_updates_failed_remote += updates_pushed_remote;
-                vlogger.error("Error applying view update to {}: {}", target, ep);
-                return make_exception_future<>(std::move(ep));
+                    db::write_type::VIEW).then_wrapped(
+                            [target,
+                             updates_pushed_remote,
+                             maybe_account_failure = std::move(maybe_account_failure)] (future<>&& f) {
+                return maybe_account_failure(std::move(f), std::move(target), false, updates_pushed_remote);
            }));
        }
    }
@@ -1226,6 +1257,20 @@ future<> view_builder::calculate_shard_build_step(
        }
    }

+    // All shards need to arrive at the same decisions on whether or not to
+    // restart a view build at some common token (reshard), and which token
+    // to restart at. So we need to wait until all shards have read the view
+    // build statuses before they can all proceed to make the (same) decision.
+    // If we don't synchronoize here, a fast shard may make a decision, start
+    // building and finish a build step - before the slowest shard even read
+    // the view build information.
+    container().invoke_on(0, [] (view_builder& builder) {
+        if (++builder._shards_finished_read == smp::count) {
+            builder._shards_finished_read_promise.set_value();
+        }
+        return builder._shards_finished_read_promise.get_shared_future();
+    }).get();
+
    std::unordered_set<utils::UUID> loaded_views;
    if (view_build_status_per_shard.size() != smp::count) {
        reshard(std::move(view_build_status_per_shard), loaded_views);
@@ -1591,12 +1636,29 @@ future<> view_builder::maybe_mark_view_as_built(view_ptr view, dht::token next_t
    });
 }

-future<> view_builder::wait_until_built(const sstring& ks_name, const sstring& view_name, lowres_clock::time_point timeout) {
-    return container().invoke_on(0, [ks_name, view_name, timeout] (view_builder& builder) {
+future<> view_builder::wait_until_built(const sstring& ks_name, const sstring& view_name) {
+    return container().invoke_on(0, [ks_name, view_name] (view_builder& builder) {
        auto v = std::pair(std::move(ks_name), std::move(view_name));
-        return builder._build_notifiers[std::move(v)].get_shared_future(timeout);
+        return builder._build_notifiers[std::move(v)].get_shared_future();
    });
 }

+update_backlog node_update_backlog::add_fetch(unsigned shard, update_backlog backlog) {
+    _backlogs[shard].backlog.store(backlog, std::memory_order_relaxed);
+    auto now = clock::now();
+    if (now >= _last_update.load(std::memory_order_relaxed) + _interval) {
+        _last_update.store(now, std::memory_order_relaxed);
+        auto new_max = boost::accumulate(
+                _backlogs,
+                update_backlog::no_backlog(),
+                [] (const update_backlog& lhs, const per_shard_backlog& rhs) {
+                    return std::max(lhs, rhs.load());
+                });
+        _max.store(new_max, std::memory_order_relaxed);
+        return new_max;
+    }
+    return std::max(backlog, _max.load(std::memory_order_relaxed));
+}
+
 } // namespace view
 } // namespace db
--- a/db/view/view.hh
+++ b/db/view/view.hh
@@ -30,6 +30,10 @@
 #include "flat_mutation_reader.hh"
 #include "stdx.hh"

+#include <seastar/core/semaphore.hh>
+
+class frozen_mutation_and_schema;
+
 namespace db {

 namespace view {
@@ -90,7 +94,7 @@ bool matches_view_filter(const schema& base, const view_info& view, const partit

 bool clustering_prefix_matches(const schema& base, const partition_key& key, const clustering_key_prefix& ck);

-future<std::vector<mutation>> generate_view_updates(
+future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
        const schema_ptr& base,
        std::vector<view_ptr>&& views_to_update,
        flat_mutation_reader&& updates,
@@ -102,7 +106,11 @@ query::clustering_row_ranges calculate_affected_clustering_ranges(
        const mutation_partition& mp,
        const std::vector<view_ptr>& views);

-future<> mutate_MV(const dht::token& base_token, std::vector<mutation> mutations, db::view::stats& stats);
+future<> mutate_MV(
+        const dht::token& base_token,
+        std::vector<frozen_mutation_and_schema> view_updates,
+        db::view::stats& stats,
+        db::timeout_semaphore_units pending_view_updates);

 /**
 * create_virtual_column() adds a "virtual column" to a schema builder.
--- a/db/view/view_builder.hh
+++ b/db/view/view_builder.hh
@@ -151,6 +151,10 @@ class view_builder final : public service::migration_listener::only_view_notific
    future<> _started = make_ready_future<>();
    // Used to coordinate between shards the conclusion of the build process for a particular view.
    std::unordered_set<utils::UUID> _built_views;
+    // Counter and promise (both on shard 0 only!) allowing to wait for all
+    // shards to have read the view build statuses
+    unsigned _shards_finished_read = 0;
+    seastar::shared_promise<> _shards_finished_read_promise;
    // Used for testing.
    std::unordered_map<std::pair<sstring, sstring>, seastar::shared_promise<>, utils::tuple_hash> _build_notifiers;

@@ -178,7 +182,7 @@ public:
    virtual void on_drop_view(const sstring& ks_name, const sstring& view_name) override;

    // For tests
-    future<> wait_until_built(const sstring& ks_name, const sstring& view_name, lowres_clock::time_point timeout);
+    future<> wait_until_built(const sstring& ks_name, const sstring& view_name);

 private:
    build_step& get_or_create_build_step(utils::UUID);
--- a/db/view/view_update_backlog.hh
+++ b/db/view/view_update_backlog.hh
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2018 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <limits>
+
+namespace db::view {
+
+/**
+ * The view update backlog represents the pending view data that a base replica
+ * maintains. It is the maximum of the memory backlog - how much memory pending
+ * view updates are consuming out of the their allocated quota - and the disk
+ * backlog - how much view hints are consuming. The size of a backlog is relative
+ * to its maximum size.
+ */
+struct update_backlog {
+    size_t current;
+    size_t max;
+
+    float relative_size() const {
+        return float(current) / float(max);
+    }
+
+    friend bool operator==(const update_backlog& lhs, const update_backlog& rhs) {
+        return lhs.relative_size() == rhs.relative_size();
+    }
+
+    friend bool operator<(const update_backlog& lhs, const update_backlog& rhs) {
+        return lhs.relative_size() < rhs.relative_size();
+    }
+
+    friend bool operator!=(const update_backlog& lhs, const update_backlog& rhs) {
+        return !(lhs == rhs);
+    }
+
+    friend bool operator<=(const update_backlog& lhs, const update_backlog& rhs) {
+        return !(rhs < lhs);
+    }
+
+    friend bool operator>(const update_backlog& lhs, const update_backlog& rhs) {
+        return rhs < lhs;
+    }
+
+    friend bool operator>=(const update_backlog& lhs, const update_backlog& rhs) {
+        return !(lhs < rhs);
+    }
+
+    static update_backlog no_backlog() {
+        return update_backlog{0, std::numeric_limits<size_t>::max()};
+    }
+};
+
+}
--- a/db/view/view_update_from_staging_generator.cc
+++ b/db/view/view_update_from_staging_generator.cc
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2018 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "view_update_from_staging_generator.hh"
+
+namespace db::view {
+
+future<> view_update_from_staging_generator::start() {
+    _started = seastar::async([this]() mutable {
+        while (!_as.abort_requested()) {
+            if (_sstables_with_tables.empty()) {
+                _pending_sstables.wait().get();
+            }
+            while (!_sstables_with_tables.empty()) {
+                auto& entry = _sstables_with_tables.front();
+                schema_ptr s = entry.t->schema();
+                flat_mutation_reader staging_sstable_reader = entry.sst->read_rows_flat(s);
+                auto result = staging_sstable_reader.consume_in_thread(view_updating_consumer(s, _proxy, entry.sst, _as), db::no_timeout);
+                if (result == stop_iteration::yes) {
+                    break;
+                }
+                entry.t->move_sstable_from_staging_in_thread(entry.sst);
+                _registration_sem.signal();
+                _sstables_with_tables.pop_front();
+            }
+        }
+    });
+    return make_ready_future<>();
+}
+
+future<> view_update_from_staging_generator::stop() {
+    _as.request_abort();
+    _pending_sstables.signal();
+    return std::move(_started).then([this] {
+        _registration_sem.broken();
+    });
+}
+
+future<> view_update_from_staging_generator::register_staging_sstable(sstables::shared_sstable sst, lw_shared_ptr<table> table) {
+    if (_as.abort_requested()) {
+        return make_ready_future<>();
+    }
+    _sstables_with_tables.emplace_back(std::move(sst), std::move(table));
+    _pending_sstables.signal();
+    return _registration_sem.wait(1);
+}
+
+}
--- a/db/view/view_update_from_staging_generator.hh
+++ b/db/view/view_update_from_staging_generator.hh
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2018 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "database.hh"
+#include "sstables/sstables.hh"
+#include "db/view/view_updating_consumer.hh"
+
+#include <seastar/core/abort_source.hh>
+#include <seastar/core/condition-variable.hh>
+#include <seastar/core/semaphore.hh>
+
+namespace db::view {
+
+class view_update_from_staging_generator {
+    static constexpr size_t registration_queue_size = 5;
+    database& _db;
+    service::storage_proxy& _proxy;
+    seastar::abort_source _as;
+    future<> _started = make_ready_future<>();
+    seastar::condition_variable _pending_sstables;
+    semaphore _registration_sem{registration_queue_size};
+    struct sstable_with_table {
+        sstables::shared_sstable sst;
+        lw_shared_ptr<table> t;
+        sstable_with_table(sstables::shared_sstable sst, lw_shared_ptr<table> t) : sst(sst), t(t) { }
+    };
+    std::deque<sstable_with_table> _sstables_with_tables;
+public:
+    view_update_from_staging_generator(database& db, service::storage_proxy& proxy) : _db(db), _proxy(proxy) { }
+
+    future<> start();
+    future<> stop();
+    future<> register_staging_sstable(sstables::shared_sstable sst, lw_shared_ptr<table> table);
+};
+
+}
--- a/db/view/view_updating_consumer.hh
+++ b/db/view/view_updating_consumer.hh
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2018 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "service/storage_proxy.hh"
+#include "dht/i_partitioner.hh"
+#include "schema.hh"
+#include "mutation_fragment.hh"
+#include "sstables/shared_sstable.hh"
+
+namespace db::view {
+
+/*
+ * A consumer that pushes materialized view updates for each consumed mutation.
+ * It is expected to be run in seastar::async threaded context through consume_in_thread()
+ */
+class view_updating_consumer {
+    schema_ptr _schema;
+    lw_shared_ptr<table> _table;
+    sstables::shared_sstable _excluded_sstable;
+    const seastar::abort_source& _as;
+    std::optional<mutation> _m;
+public:
+    view_updating_consumer(schema_ptr schema, service::storage_proxy& proxy, sstables::shared_sstable excluded_sstable, const seastar::abort_source& as)
+            : _schema(std::move(schema))
+            , _table(proxy.get_db().local().find_column_family(_schema->id()).shared_from_this())
+            , _excluded_sstable(excluded_sstable)
+            , _as(as)
+            , _m()
+    { }
+
+    void consume_new_partition(const dht::decorated_key& dk) {
+        _m = mutation(_schema, dk, mutation_partition(_schema));
+    }
+
+    void consume(tombstone t) {
+        _m->partition().apply(std::move(t));
+    }
+
+    stop_iteration consume(static_row&& sr) {
+        if (_as.abort_requested()) {
+            return stop_iteration::yes;
+        }
+        _m->partition().apply(*_schema, std::move(sr));
+        return stop_iteration::no;
+    }
+
+    stop_iteration consume(clustering_row&& cr) {
+        if (_as.abort_requested()) {
+            return stop_iteration::yes;
+        }
+        _m->partition().apply(*_schema, std::move(cr));
+        return stop_iteration::no;
+    }
+
+    stop_iteration consume(range_tombstone&& rt) {
+        if (_as.abort_requested()) {
+            return stop_iteration::yes;
+        }
+        _m->partition().apply(*_schema, std::move(rt));
+        return stop_iteration::no;
+    }
+
+    // Expected to be run in seastar::async threaded context (consume_in_thread())
+    stop_iteration consume_end_of_partition();
+
+    stop_iteration consume_end_of_stream() {
+        return stop_iteration(_as.abort_requested());
+    }
+};
+
+}
+
--- a/dht/boot_strapper.cc
+++ b/dht/boot_strapper.cc
@@ -49,7 +49,7 @@ namespace dht {
 future<> boot_strapper::bootstrap() {
    blogger.debug("Beginning bootstrap process: sorted_tokens={}", _token_metadata.sorted_tokens());

-    auto streamer = make_lw_shared<range_streamer>(_db, _token_metadata, _tokens, _address, "Bootstrap");
+    auto streamer = make_lw_shared<range_streamer>(_db, _token_metadata, _tokens, _address, "Bootstrap", streaming::stream_reason::bootstrap);
    streamer->add_source_filter(std::make_unique<range_streamer::failure_detector_source_filter>(gms::get_local_failure_detector()));
    for (const auto& keyspace_name : _db.local().get_non_system_keyspaces()) {
        auto& ks = _db.local().find_keyspace(keyspace_name);
--- a/dht/range_streamer.cc
+++ b/dht/range_streamer.cc
@@ -294,7 +294,7 @@ future<> range_streamer::do_stream_async() {
                size_t nr_ranges_per_stream_plan = nr_ranges_total / 10;
                dht::token_range_vector ranges_to_stream;
                auto do_streaming = [&] {
-                    auto sp = stream_plan(sprint("%s-%s-index-%d", description, keyspace, sp_index++));
+                    auto sp = stream_plan(sprint("%s-%s-index-%d", description, keyspace, sp_index++), _reason);
                    logger.info("{} with {} for keyspace={}, {} out of {} ranges: ranges = {}",
                            description, source, keyspace, nr_ranges_streamed, nr_ranges_total, ranges_to_stream.size());
                    if (_nr_rx_added) {
--- a/dht/range_streamer.hh
+++ b/dht/range_streamer.hh
@@ -42,6 +42,7 @@
 #include "locator/snitch_base.hh"
 #include "streaming/stream_plan.hh"
 #include "streaming/stream_state.hh"
+#include "streaming/stream_reason.hh"
 #include "gms/inet_address.hh"
 #include "gms/i_failure_detector.hh"
 #include "range.hh"
@@ -101,17 +102,18 @@ public:
        }
    };

-    range_streamer(distributed<database>& db, token_metadata& tm, std::unordered_set<token> tokens, inet_address address, sstring description)
+    range_streamer(distributed<database>& db, token_metadata& tm, std::unordered_set<token> tokens, inet_address address, sstring description, streaming::stream_reason reason)
        : _db(db)
        , _metadata(tm)
        , _tokens(std::move(tokens))
        , _address(address)
        , _description(std::move(description))
+        , _reason(reason)
        , _stream_plan(_description) {
    }

-    range_streamer(distributed<database>& db, token_metadata& tm, inet_address address, sstring description)
-        : range_streamer(db, tm, std::unordered_set<token>(), address, description) {
+    range_streamer(distributed<database>& db, token_metadata& tm, inet_address address, sstring description, streaming::stream_reason reason)
+        : range_streamer(db, tm, std::unordered_set<token>(), address, description, reason) {
    }

    void add_source_filter(std::unique_ptr<i_source_filter> filter) {
@@ -166,6 +168,7 @@ private:
    std::unordered_set<token> _tokens;
    inet_address _address;
    sstring _description;
+    streaming::stream_reason _reason;
    std::unordered_multimap<sstring, std::unordered_map<inet_address, dht::token_range_vector>> _to_stream;
    std::unordered_set<std::unique_ptr<i_source_filter>> _source_filters;
    stream_plan _stream_plan;
--- a/dist/ami/build_ami.sh
+++ b/dist/ami/build_ami.sh
@@ -78,7 +78,7 @@ if [ $LOCALRPM -eq 1 ]; then
    fi
    if [ ! -f dist/ami/files/scylla-jmx.noarch.rpm ]; then
        cd build
-        git clone --depth 1 https://github.com/scylladb/scylla-jmx.git
+        git clone -b branch-3.0 --depth 1 https://github.com/scylladb/scylla-jmx.git
        cd scylla-jmx
        dist/redhat/build_rpm.sh --target epel-7-x86_64
        cd ../..
--- a/dist/common/scripts/node_exporter_install
+++ b/dist/common/scripts/node_exporter_install
@@ -25,7 +25,7 @@ import tempfile
 import tarfile
 from scylla_util import *

-VERSION='0.14.0'
+VERSION='0.17.0'
 INSTALL_DIR='/usr/lib/scylla/Prometheus/node_exporter'

 if __name__ == '__main__':
--- a/dist/common/scripts/scylla_prepare
+++ b/dist/common/scripts/scylla_prepare
@@ -62,10 +62,9 @@ if __name__ == '__main__':
            run('hugeadm --create-mounts')
        fi
    else:
-        set_nic = cfg.get('SET_NIC')
+        set_nic_and_disks = get_set_nic_and_disks_config_value(cfg)
        ifname = cfg.get('IFNAME')
-        if set_nic  == 'yes':
+        if set_nic_and_disks == 'yes':
            create_perftune_conf(ifname)
-            run('/usr/lib/scylla/posix_net_conf.sh {IFNAME} --options-file /etc/scylla.d/perftune.yaml'.format(IFNAME=ifname))
+            run("{} --options-file /etc/scylla.d/perftune.yaml".format(perftune_base_command()))

-    run('/usr/lib/scylla/scylla-blocktune')
--- a/dist/common/scripts/scylla_setup
+++ b/dist/common/scripts/scylla_setup
@@ -122,8 +122,8 @@ if __name__ == '__main__':
                        help='specify NTP domain')
    parser.add_argument('--ami', action='store_true', default=False,
                        help='setup AMI instance')
-    parser.add_argument('--setup-nic', action='store_true', default=False,
-                        help='optimize NIC queue')
+    parser.add_argument('--setup-nic-and-disks', action='store_true', default=False,
+                        help='optimize NIC and disks')
    parser.add_argument('--developer-mode', action='store_true', default=False,
                        help='enable developer mode')
    parser.add_argument('--no-ec2-check', action='store_true', default=False,
@@ -173,7 +173,7 @@ if __name__ == '__main__':

    disks = args.disks
    nic = args.nic
-    set_nic = args.setup_nic
+    set_nic_and_disks = args.setup_nic_and_disks
    ec2_check = not args.no_ec2_check
    kernel_check = not args.no_kernel_check
    verify_package = not args.no_verify_package
@@ -336,11 +336,11 @@ if __name__ == '__main__':
    if interactive:
        sysconfig_setup = interactive_ask_service('Do you want to setup a system-wide customized configuration for Scylla?', 'Yes - setup the sysconfig file. No - skips this step.', 'yes')
    if sysconfig_setup:
-        nic = interactive_choose_nic()
        if interactive:
-            set_nic = interactive_ask_service('Do you want to enable Network Interface Card (NIC) optimization?', 'Yes - optimize the NIC queue settings. Selecting Yes greatly improves performance. No - skip this step.', 'yes')
+            nic = interactive_choose_nic()
+            set_nic_and_disks = interactive_ask_service('Do you want to enable Network Interface Card (NIC) and disk(s) optimization?', 'Yes - optimize the NIC queue and disks settings. Selecting Yes greatly improves performance. No - skip this step.', 'yes')
    if sysconfig_setup:
-        setup_args = '--setup-nic' if set_nic else ''
+        setup_args = '--setup-nic-and-disks' if set_nic_and_disks else ''
        run_setup_script('NIC queue', '/usr/lib/scylla/scylla_sysconfig_setup --nic {nic} {setup_args}'.format(nic=nic, setup_args=setup_args))

    if interactive:
--- a/dist/common/scripts/scylla_sysconfig_setup
+++ b/dist/common/scripts/scylla_sysconfig_setup
@@ -40,7 +40,7 @@ if __name__ == '__main__':
        cfg = sysconfig_parser('/etc/sysconfig/scylla-server')
    else:
        cfg = sysconfig_parser('/etc/default/scylla-server')
-    set_nic = str2bool(cfg.get('SET_NIC'))
+    set_nic_and_disks = str2bool(get_set_nic_and_disks_config_value(cfg))
    ami = str2bool(cfg.get('AMI'))

    parser = argparse.ArgumentParser(description='Setting parameters on Scylla sysconfig file.')
@@ -58,8 +58,8 @@ if __name__ == '__main__':
                        help='scylla home directory')
    parser.add_argument('--confdir',
                        help='scylla config directory')
-    parser.add_argument('--setup-nic', action='store_true', default=set_nic,
-                        help='setup NIC\'s interrupts, RPS, XPS')
+    parser.add_argument('--setup-nic-and-disks', action='store_true', default=set_nic_and_disks,
+                        help='setup NIC\'s and disks\' interrupts, RPS, XPS, nomerges and I/O scheduler')
    parser.add_argument('--ami', action='store_true', default=ami,
                        help='AMI instance mode')
    args = parser.parse_args()
@@ -71,8 +71,8 @@ if __name__ == '__main__':
    ifname = args.nic if args.nic else cfg.get('IFNAME')
    network_mode = args.mode if args.mode else cfg.get('NETWORK_MODE')

-    if args.setup_nic:
-        rps_cpus = out('/usr/lib/scylla/posix_net_conf.sh --cpu-mask {}'.format(ifname))
+    if args.setup_nic_and_disks:
+        rps_cpus = out('{} --tune net --nic {} --get-cpu-mask'.format(perftune_base_command(), ifname))
        if len(rps_cpus) > 0:
            cpuset = hex2list(rps_cpus)
            run('/usr/lib/scylla/scylla_cpuset_setup --cpuset {}'.format(cpuset))
@@ -104,8 +104,13 @@ if __name__ == '__main__':
        cfg.set('SCYLLA_HOME', args.homedir)
    if args.confdir:
        cfg.set('SCYLLA_CONF', args.confdir)
-    if str2bool(cfg.get('SET_NIC')) != args.setup_nic:
-        cfg.set('SET_NIC', bool2str(args.setup_nic))
+
+    if str2bool(get_set_nic_and_disks_config_value(cfg)) != args.setup_nic_and_disks:
+        if cfg.has_option('SET_NIC'):
+            cfg.set('SET_NIC', bool2str(args.setup_nic_and_disks))
+        else:
+            cfg.set('SET_NIC_AND_DISKS', bool2str(args.setup_nic_and_disks))
+
    if str2bool(cfg.get('AMI')) != args.ami:
        cfg.set('AMI', bool2str(args.ami))
    cfg.commit()
--- a/dist/common/scripts/scylla_util.py
+++ b/dist/common/scripts/scylla_util.py
@@ -28,6 +28,7 @@ import time
 import urllib.error
 import urllib.parse
 import urllib.request
+import yaml


 def curl(url, byte=False):
@@ -384,6 +385,37 @@ def get_mode_cpuset(nic, mode):
    except subprocess.CalledProcessError:
        return '-1'

+def get_scylla_dirs():
+    """
+    Returns a list of scylla directories configured in /etc/scylla/scylla.yaml.
+    Verifies that mandatory parameters are set.
+    """
+    scylla_yaml_name = '/etc/scylla/scylla.yaml'
+    y = yaml.load(open(scylla_yaml_name))
+
+    # Check that mandatory fields are set
+    if 'data_file_directories' not in y or \
+            not y['data_file_directories'] or \
+            not len(y['data_file_directories']) or \
+            not " ".join(y['data_file_directories']).strip():
+        raise Exception("{}: at least one directory has to be set in 'data_file_directory'".format(scylla_yaml_name))
+    if 'commitlog_directory' not in y or not y['commitlog_directory']:
+        raise Exception("{}: 'commitlog_directory' has to be set".format(scylla_yaml_name))
+
+    dirs = []
+    dirs.extend(y['data_file_directories'])
+    dirs.append(y['commitlog_directory'])
+
+    if 'hints_directory' in y and y['hints_directory']:
+        dirs.append(y['hints_directory'])
+    if 'view_hints_directory' in y and y['view_hints_directory']:
+        dirs.append(y['view_hints_directory'])
+
+    return [d for d in dirs if d is not None]
+
+def perftune_base_command():
+    disk_tune_param = "--tune disks " + " ".join("--dir {}".format(d) for d in get_scylla_dirs())
+    return '/usr/lib/scylla/perftune.py {}'.format(disk_tune_param)

 def get_cur_cpuset():
    cfg = sysconfig_parser('/etc/scylla.d/cpuset.conf')
@@ -419,6 +451,25 @@ def create_perftune_conf(nic='eth0'):
 def is_valid_nic(nic):
    return os.path.exists('/sys/class/net/{}'.format(nic))

+# Remove this when we do not support SET_NIC configuration value anymore
+def get_set_nic_and_disks_config_value(cfg):
+    """
+    Get the SET_NIC_AND_DISKS configuration value.
+    Return the SET_NIC configuration value if SET_NIC_AND_DISKS is not found (old releases case).
+    :param cfg: sysconfig_parser object
+    :return configuration value
+    :except If the configuration value is not found
+    """
+
+    # Sanity check
+    if cfg.has_option('SET_NIC_AND_DISKS') and cfg.has_option('SET_NIC'):
+        raise Exception("Only one of 'SET_NIC_AND_DISKS' and 'SET_NIC' is allowed to be present")
+
+    try:
+        return cfg.get('SET_NIC_AND_DISKS')
+    except:
+        # For backwards compatibility
+        return cfg.get('SET_NIC')

 class SystemdException(Exception):
    pass
@@ -483,8 +534,11 @@ class sysconfig_parser:
    def get(self, key):
        return self._cfg.get('global', key).strip('"')

+    def has_option(self, key):
+        return self._cfg.has_option('global', key)
+
    def set(self, key, val):
-        if not self._cfg.has_option('global', key):
+        if not self.has_option(key):
            return self.__add(key, val)
        self._data = re.sub('^{}=[^\n]*$'.format(key), '{}="{}"'.format(key, self.__escape(val)), self._data, flags=re.MULTILINE)
        self.__load()
--- a/dist/common/sysconfig/scylla-server
+++ b/dist/common/sysconfig/scylla-server
@@ -10,8 +10,8 @@ BRIDGE=virbr0
 # ethernet device name
 IFNAME=eth0

-# setup NIC's interrupts, RPS, XPS (posix)
-SET_NIC=no
+# setup NIC's and disks' interrupts, RPS, XPS, nomerges and I/O scheduler (posix)
+SET_NIC_AND_DISKS=no

 # ethernet device driver (dpdk)
 ETHDRV=
--- a/dist/common/sysctl.d/99-scylla-aio.conf
+++ b/dist/common/sysctl.d/99-scylla-aio.conf
@@ -0,0 +1,2 @@
+# Raise max AIO events
+fs.aio-max-nr = 1048576
--- a/dist/common/systemd/node-exporter.service
+++ b/dist/common/systemd/node-exporter.service
@@ -5,7 +5,7 @@ Description=Node Exporter
 Type=simple
 User=scylla
 Group=scylla
-ExecStart=/usr/bin/node_exporter -collectors.enabled interrupts,conntrack,diskstats,entropy,filefd,filesystem,loadavg,mdadm,meminfo,netdev,netstat,sockstat,stat,textfile,time,uname,vmstat
+ExecStart=/usr/bin/node_exporter  --collector.interrupts

 [Install]
 WantedBy=multi-user.target
--- a/dist/common/systemd/scylla-housekeeping-restart.service.mustache
+++ b/dist/common/systemd/scylla-housekeeping-restart.service.mustache
@@ -6,7 +6,12 @@ After=network.target
 Type=simple
 User=scylla
 Group=scylla
+{{#debian}}
+ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files '/etc/apt/sources.list.d/scylla*.list' version --mode r
+{{/debian}}
+{{#redhat}}
 ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files '/etc/yum.repos.d/scylla*.repo' version --mode r
+{{/redhat}}

 [Install]
 WantedBy=multi-user.target
--- a/dist/debian/debian/scylla-kernel-conf.install
+++ b/dist/debian/debian/scylla-kernel-conf.install
@@ -1 +1,2 @@
 dist/common/sysctl.d/99-scylla-sched.conf /etc/sysctl.d
+dist/common/sysctl.d/99-scylla-aio.conf /etc/sysctl.d
--- a/dist/debian/debian/scylla-kernel-conf.postinst
+++ b/dist/debian/debian/scylla-kernel-conf.postinst
@@ -9,6 +9,7 @@ if [[ $KVER =~ 3\.13\.0\-([0-9]+)-generic ]]; then
 else
    # expect failures in virtualized environments
    sysctl -p/etc/sysctl.d/99-scylla-sched.conf || :
+    sysctl -p/etc/sysctl.d/99-scylla-aio.conf || :
 fi

 #DEBHELPER#
--- a/dist/debian/debian/scylla-server.dirs
+++ b/dist/debian/debian/scylla-server.dirs
@@ -4,5 +4,6 @@ var/lib/scylla
 var/lib/scylla/data
 var/lib/scylla/commitlog
 var/lib/scylla/hints
+var/lib/scylla/view_hints
 var/lib/scylla/coredump
 var/lib/scylla-housekeeping
--- a/dist/debian/rules.mustache
+++ b/dist/debian/rules.mustache
@@ -4,7 +4,7 @@ export PYBUILD_DISABLE=1
 jobs := $(shell echo $$DEB_BUILD_OPTIONS | sed -r "s/.*parallel=([0-9]+).*/-j\1/")

 override_dh_auto_configure:
-	./configure.py --with=scylla --with=iotune --enable-dpdk --mode=release --static-thrift --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7 --cflags="-I/opt/scylladb/include -L/opt/scylladb/lib/x86-linux-gnu/" --ldflags="-Wl,-rpath=/opt/scylladb/lib"
+	./configure.py --with=scylla --with=iotune --enable-dpdk --mode=release --static-thrift --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7 --c-compiler=/opt/scylladb/bin/gcc-7 --cflags="-I/opt/scylladb/include -L/opt/scylladb/lib/x86-linux-gnu/" --ldflags="-Wl,-rpath=/opt/scylladb/lib"

 override_dh_auto_build:
 	PATH="/opt/scylladb/bin:$$PATH" ninja $(jobs)
--- a/dist/debian/scylla-server.install.mustache
+++ b/dist/debian/scylla-server.install.mustache
@@ -1,7 +1,6 @@
 dist/common/limits.d/scylla.conf etc/security/limits.d
 dist/common/scylla.d/*.conf etc/scylla.d
 seastar/dpdk/usertools/dpdk-devbind.py usr/lib/scylla
-seastar/scripts/posix_net_conf.sh usr/lib/scylla
 seastar/scripts/perftune.py usr/lib/scylla
 dist/common/scripts/* usr/lib/scylla
 scylla-housekeeping usr/lib/scylla
--- a/dist/docker/redhat/Dockerfile
+++ b/dist/docker/redhat/Dockerfile
@@ -26,7 +26,7 @@ ADD commandlineparser.py /commandlineparser.py
 ADD docker-entrypoint.py /docker-entrypoint.py

 # Install Scylla:
-RUN curl http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo -o /etc/yum.repos.d/scylla.repo && \
+RUN curl http://downloads.scylladb.com/rpm/centos/scylla-3.0.repo -o /etc/yum.repos.d/scylla.repo && \
    yum -y install epel-release && \
    yum -y clean expire-cache && \
    yum -y update && \
--- a/dist/docker/redhat/etc/sysconfig/scylla-server
+++ b/dist/docker/redhat/etc/sysconfig/scylla-server
@@ -10,8 +10,8 @@ BRIDGE=virbr0
 # ethernet device name
 IFNAME=eth0

-# setup NIC's interrupts, RPS, XPS (posix)
-SET_NIC=no
+# setup NIC's and disks' interrupts, RPS, XPS, nomerges and I/O scheduler (posix)
+SET_NIC_AND_DISKS=no

 # ethernet device driver (dpdk)
 ETHDRV=
--- a/dist/offline_installer/redhat/build_offline_installer.sh
+++ b/dist/offline_installer/redhat/build_offline_installer.sh
@@ -91,7 +91,27 @@ mkdir -p build/offline_installer
 cp dist/offline_installer/redhat/header build/offline_installer
 sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve scylla
 # XXX: resolve option doesn't fetch some dependencies, need to manually fetch them
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve sudo.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve ntp.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libedit.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve ntpdate.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve net-tools.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve kernel
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve grubby.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve linux-firmware
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve initscripts.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve iproute.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve iptables.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libnfnetlink.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libnetfilter_conntrack.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libmnl.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve sysvinit-tools.x86_64
 sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve yajl.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve mdadm.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libreport-filesystem.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve xfsprogs.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve PyYAML.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libyaml.x86_64
 sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libjpeg-turbo.x86_64
 sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libaio.x86_64
 sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve snappy.x86_64
--- a/dist/redhat/build_rpm.sh
+++ b/dist/redhat/build_rpm.sh
@@ -108,11 +108,11 @@ fix_ownership() {
 if [ $JOBS -gt 0 ]; then
    RPM_JOBS_OPTS=(--define="_smp_mflags -j$JOBS")
 fi
-sudo mock --buildsrpm --root=$TARGET --resultdir=`pwd`/build/srpms --spec=build/scylla.spec --sources=build/$PRODUCT-$VERSION.tar $SRPM_OPTS "${RPM_JOBS_OPTS[@]}"
+sudo mock --rootdir=`pwd`/build/mock --buildsrpm --root=$TARGET --resultdir=`pwd`/build/srpms --spec=build/scylla.spec --sources=build/$PRODUCT-$VERSION.tar $SRPM_OPTS "${RPM_JOBS_OPTS[@]}"
 fix_ownership build/srpms
 if [[ "$TARGET" =~ ^epel-7- ]]; then
    TARGET=scylla-$TARGET
    RPM_OPTS="$RPM_OPTS --configdir=dist/redhat/mock"
 fi
-sudo mock --rebuild --root=$TARGET --resultdir=`pwd`/build/rpms $RPM_OPTS "${RPM_JOBS_OPTS[@]}" build/srpms/$PRODUCT-$VERSION*.src.rpm
+sudo mock --rootdir=`pwd`/build/mock --rebuild --root=$TARGET --resultdir=`pwd`/build/rpms $RPM_OPTS "${RPM_JOBS_OPTS[@]}" build/srpms/$PRODUCT-$VERSION*.src.rpm
 fix_ownership build/rpms
--- a/dist/redhat/scylla.spec.mustache
+++ b/dist/redhat/scylla.spec.mustache
@@ -56,7 +56,7 @@ License:        AGPLv3
 URL:            http://www.scylladb.com/
 BuildRequires:  libaio-devel libstdc++-devel cryptopp-devel hwloc-devel numactl-devel libpciaccess-devel libxml2-devel zlib-devel thrift-devel yaml-cpp-devel lz4-devel snappy-devel jsoncpp-devel systemd-devel xz-devel pcre-devel elfutils-libelf-devel bzip2-devel keyutils-libs-devel xfsprogs-devel make gnutls-devel systemd-devel lksctp-tools-devel protobuf-devel protobuf-compiler systemtap-sdt-devel ninja-build cmake python ragel grep kernel-headers
 %{?fedora:BuildRequires: boost-devel antlr3-tool antlr3-C++-devel python3 gcc-c++ libasan libubsan python3-pyparsing dnf-yum python2-pystache}
-%{?rhel:BuildRequires: scylla-libstdc++73-static scylla-boost163-devel scylla-boost163-static scylla-antlr35-tool scylla-antlr35-C++-devel python34 scylla-gcc73-c++, scylla-python34-pyparsing20 yaml-cpp-static pystache python-setuptools}
+%{?rhel:BuildRequires: scylla-libstdc++73-static scylla-libatomic73-static scylla-boost163-devel scylla-boost163-static scylla-antlr35-tool scylla-antlr35-C++-devel python34 scylla-gcc73-c++, scylla-python34-pyparsing20 yaml-cpp-static pystache python-setuptools}
 Requires:       {{product}}-conf systemd-libs hwloc PyYAML python-urwid pciutils pyparsing python-requests curl util-linux python-setuptools pciutils python3-pyudev mdadm xfsprogs
 %{?rhel:Requires: python34 python34-PyYAML kernel >= 3.10.0-514}
 %{?fedora:Requires: python3 python3-PyYAML}
@@ -97,7 +97,7 @@ cflags="--cflags=${defines[*]}"
 %endif
 %if 0%{?rhel}
 . /etc/profile.d/scylla.sh
-python3.4 ./configure.py %{?configure_opt} --with=scylla --with=iotune --mode=release "$cflags" --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7.3 --python python3.4 --ldflag=-Wl,-rpath=/opt/scylladb/lib64
+python3.4 ./configure.py %{?configure_opt} --with=scylla --with=iotune --mode=release "$cflags" --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7.3 --c-compiler=/opt/scylladb/bin/gcc-7.3 --python python3.4 --ldflag=-Wl,-rpath=/opt/scylladb/lib64
 %endif
 ninja-build %{?_smp_mflags} build/release/scylla build/release/iotune

@@ -193,7 +193,6 @@ rm -rf $RPM_BUILD_ROOT
 %{_prefix}/lib/scylla/scylla_cpuscaling_setup
 %{_prefix}/lib/scylla/scylla_fstrim
 %{_prefix}/lib/scylla/scylla_fstrim_setup
-%{_prefix}/lib/scylla/posix_net_conf.sh
 %{_prefix}/lib/scylla/perftune.py
 %{_prefix}/lib/scylla/dpdk-devbind.py
 %{_prefix}/lib/scylla/hex2list.py
@@ -209,6 +208,7 @@ rm -rf $RPM_BUILD_ROOT
 %attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/data
 %attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/commitlog
 %attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/hints
+%attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/view_hints
 %attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/coredump
 %attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla-housekeeping
 %ghost /etc/systemd/system/scylla-server.service.d/
@@ -283,6 +283,7 @@ if Scylla is the main application on your server and you wish to optimize its la
 # We cannot use the sysctl_apply rpm macro because it is not present in 7.0
 # following is a "manual" expansion
 /usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
+/usr/lib/systemd/systemd-sysctl 99-scylla-aio.conf >/dev/null 2>&1 || :

 %files kernel-conf
 %defattr(-,root,root)
--- a/docs/docker-hub.md
+++ b/docs/docker-hub.md
@@ -66,7 +66,7 @@ You can use Docker volumes to improve performance of Scylla.
 Create a Scylla data directory ``/var/lib/scylla`` on the host, which is used by Scylla container to store all data:

 ```console
-$ sudo mkdir -p /var/lib/scylla/data /var/lib/scylla/commitlog /var/lib/scylla/hints
+$ sudo mkdir -p /var/lib/scylla/data /var/lib/scylla/commitlog /var/lib/scylla/hints /var/lib/scylla/view_hints
 ```

 Launch Scylla using Docker's ``--volume`` command line option to mount the created host directory as a data volume in the container and disable Scylla's developer mode to run I/O tuning before starting up the Scylla node.
--- a/encoding_stats.hh
+++ b/encoding_stats.hh
@@ -41,12 +41,11 @@ struct encoding_stats {
    //        int DELETION_TIME_EPOCH = (int)(c.getTimeInMillis() / 1000); // local deletion times are in seconds
    // Encoding stats are used for delta-encoding, so we want some default values
    // that are just good enough so we take some recent date in the past
-    static constexpr uint32_t deletion_time_epoch = 1442880000;
+    static constexpr int32_t deletion_time_epoch = 1442880000;
    static constexpr api::timestamp_type timestamp_epoch = api::timestamp_type(deletion_time_epoch) * 1000 * 1000;
-    static constexpr uint32_t ttl_epoch = 0;
+    static constexpr int32_t ttl_epoch = 0;

    api::timestamp_type min_timestamp = timestamp_epoch;
-    uint32_t min_local_deletion_time = deletion_time_epoch;
-    uint32_t min_ttl = ttl_epoch;
+    int32_t min_local_deletion_time = deletion_time_epoch;
+    int32_t min_ttl = ttl_epoch;
 };
-
--- a/frozen_mutation.hh
+++ b/frozen_mutation.hh
@@ -78,6 +78,11 @@ public:

 frozen_mutation freeze(const mutation& m);

+struct frozen_mutation_and_schema {
+    frozen_mutation fm;
+    schema_ptr s;
+};
+
 // Can receive streamed_mutation in reversed order.
 class streamed_mutation_freezer {
    const schema& _schema;
--- a/gms/application_state.cc
+++ b/gms/application_state.cc
@@ -63,6 +63,8 @@ static const std::map<application_state, sstring> application_state_names = {
    {application_state::SUPPORTED_FEATURES,     "SUPPORTED_FEATURES"},
    {application_state::CACHE_HITRATES,         "CACHE_HITRATES"},
    {application_state::SCHEMA_TABLES_VERSION,  "SCHEMA_TABLES_VERSION"},
+    {application_state::RPC_READY,              "RPC_READY"},
+    {application_state::VIEW_BACKLOG,           "VIEW_BACKLOG"},
 };

 std::ostream& operator<<(std::ostream& os, const application_state& m) {
--- a/gms/application_state.hh
+++ b/gms/application_state.hh
@@ -60,9 +60,9 @@ enum class application_state {
    SUPPORTED_FEATURES,
    CACHE_HITRATES,
    SCHEMA_TABLES_VERSION,
+    RPC_READY,
+    VIEW_BACKLOG,
    // pad to allow adding new states to existing cluster
-    X4,
-    X5,
    X6,
    X7,
    X8,
--- a/gms/endpoint_state.cc
+++ b/gms/endpoint_state.cc
@@ -61,4 +61,16 @@ std::ostream& operator<<(std::ostream& os, const endpoint_state& x) {
    return os;
 }

+bool endpoint_state::is_cql_ready() const {
+    auto* app_state = get_application_state_ptr(application_state::RPC_READY);
+    if (!app_state) {
+        return false;
+    }
+    try {
+        return boost::lexical_cast<int>(app_state->value);
+    } catch (...) {
+        return false;
+    }
+}
+
 }
--- a/gms/endpoint_state.hh
+++ b/gms/endpoint_state.hh
@@ -129,26 +129,8 @@ public:
        update_is_normal();
    }

-    void apply_application_state(application_state key, versioned_value&& value) {
-        auto&& e = _application_state[key];
-        if (e.version < value.version) {
-            e = std::move(value);
-        }
-        update_is_normal();
-    }
-
-    void apply_application_state(application_state key, const versioned_value& value) {
-        auto&& e = _application_state[key];
-        if (e.version < value.version) {
-            e = value;
-        }
-        update_is_normal();
-    }
-
-    void apply_application_state(const endpoint_state& es) {
-        for (auto&& e : es._application_state) {
-            apply_application_state(e.first, e.second);
-        }
+    void add_application_state(const endpoint_state& es) {
+        _application_state = es._application_state;
        update_is_normal();
    }

@@ -208,6 +190,8 @@ public:
        _is_normal = get_status() == sstring(versioned_value::STATUS_NORMAL);
    }

+    bool is_cql_ready() const;
+
    friend std::ostream& operator<<(std::ostream& os, const endpoint_state& x);
 };

--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -930,7 +930,7 @@ void gossiper::make_random_gossip_digest(utils::chunked_vector<gossip_digest>& g
 future<> gossiper::replicate(inet_address ep, const endpoint_state& es) {
    return container().invoke_on_all([ep, es, orig = engine().cpu_id(), self = shared_from_this()] (gossiper& g) {
        if (engine().cpu_id() != orig) {
-            g.endpoint_state_map[ep].apply_application_state(es);
+            g.endpoint_state_map[ep].add_application_state(es);
        }
    });
 }
@@ -939,7 +939,7 @@ future<> gossiper::replicate(inet_address ep, const std::map<application_state,
    return container().invoke_on_all([ep, &src, &changed, orig = engine().cpu_id(), self = shared_from_this()] (gossiper& g) {
        if (engine().cpu_id() != orig) {
            for (auto&& key : changed) {
-                g.endpoint_state_map[ep].apply_application_state(key, src.at(key));
+                g.endpoint_state_map[ep].add_application_state(key, src.at(key));
            }
        }
    });
@@ -948,7 +948,7 @@ future<> gossiper::replicate(inet_address ep, const std::map<application_state,
 future<> gossiper::replicate(inet_address ep, application_state key, const versioned_value& value) {
    return container().invoke_on_all([ep, key, &value, orig = engine().cpu_id(), self = shared_from_this()] (gossiper& g) {
        if (engine().cpu_id() != orig) {
-            g.endpoint_state_map[ep].apply_application_state(key, value);
+            g.endpoint_state_map[ep].add_application_state(key, value);
        }
    });
 }
@@ -1175,11 +1175,13 @@ stdx::optional<endpoint_state> gossiper::get_endpoint_state_for_endpoint(inet_ad
    }
 }

-void gossiper::reset_endpoint_state_map() {
-    endpoint_state_map.clear();
+future<> gossiper::reset_endpoint_state_map() {
    _unreachable_endpoints.clear();
    _live_endpoints.clear();
    _live_endpoints_just_added.clear();
+    return container().invoke_on_all([] (gossiper& g) {
+        g.endpoint_state_map.clear();
+    });
 }

 std::unordered_map<inet_address, endpoint_state>& gms::gossiper::get_endpoint_states() {
@@ -1191,6 +1193,25 @@ bool gossiper::uses_host_id(inet_address endpoint) {
            get_application_state_ptr(endpoint, application_state::NET_VERSION);
 }

+bool gossiper::is_cql_ready(const inet_address& endpoint) const {
+    // Note:
+    // - New scylla node always send application_state::RPC_READY = false when
+    // the node boots and send application_state::RPC_READY = true when cql
+    // server is up
+    // - Old scylla node that does not support the application_state::RPC_READY
+    // never has application_state::RPC_READY in the endpoint_state, we can
+    // only think their cql server is up, so we return true here if
+    // application_state::RPC_READY is not present
+    auto* eps = get_endpoint_state_for_endpoint_ptr(endpoint);
+    if (!eps) {
+        logger.debug("Node {} does not have RPC_READY application_state, return is_cql_ready=true", endpoint);
+        return true;
+    }
+    auto ready = eps->is_cql_ready();
+    logger.debug("Node {}: is_cql_ready={}",  endpoint, ready);
+    return ready;
+}
+
 utils::UUID gossiper::get_host_id(inet_address endpoint) {
    if (!uses_host_id(endpoint)) {
        throw std::runtime_error(sprint("Host %s does not use new-style tokens!", endpoint));
@@ -1298,6 +1319,14 @@ void gossiper::mark_alive(inet_address addr, endpoint_state& local_state) {
 // Runs inside seastar::async context
 void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
    logger.trace("marking as alive {}", addr);
+
+    // Do not mark a node with status shutdown as UP.
+    auto status = get_gossip_status(local_state);
+    if (status == sstring(versioned_value::SHUTDOWN)) {
+        logger.warn("Skip marking node {} with status = {} as UP", addr, status);
+        return;
+    }
+
    local_state.mark_alive();
    local_state.update_timestamp(); // prevents do_status_check from racing us and evicting if it was down > A_VERY_LONG_TIME

@@ -1319,7 +1348,7 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
    }

    if (!_in_shadow_round) {
-        logger.info("InetAddress {} is now UP, status = {}", addr, get_gossip_status(local_state));
+        logger.info("InetAddress {} is now UP, status = {}", addr, status);
    }

    _subscribers.for_each([addr, local_state] (auto& subscriber) {
@@ -1662,6 +1691,7 @@ void gossiper::maybe_initialize_local_state(int generation_nbr) {
    }
 }

+// Runs inside seastar::async context
 void gossiper::add_saved_endpoint(inet_address ep) {
    if (ep == get_broadcast_address()) {
        logger.debug("Attempt to add self as saved endpoint");
@@ -1687,6 +1717,7 @@ void gossiper::add_saved_endpoint(inet_address ep) {
    }
    ep_state.mark_dead();
    endpoint_state_map[ep] = ep_state;
+    replicate(ep, ep_state).get();
    _unreachable_endpoints[ep] = now();
    logger.trace("Adding saved endpoint {} {}", ep, ep_state.get_heart_beat_state().get_generation());
 }
@@ -1924,6 +1955,7 @@ void gossiper::mark_as_shutdown(const inet_address& endpoint) {
        auto& ep_state = *es;
        ep_state.add_application_state(application_state::STATUS, storage_service_value_factory().shutdown(true));
        ep_state.get_heart_beat_state().force_highest_possible_version_unsafe();
+        replicate(endpoint, ep_state).get();
        mark_dead(endpoint, ep_state);
        get_local_failure_detector().force_conviction(endpoint);
    }
--- a/gms/gossiper.hh
+++ b/gms/gossiper.hh
@@ -417,7 +417,7 @@ public:
    stdx::optional<endpoint_state> get_endpoint_state_for_endpoint(inet_address ep) const;

    // removes ALL endpoint states; should only be called after shadow gossip
-    void reset_endpoint_state_map();
+    future<> reset_endpoint_state_map();

    std::unordered_map<inet_address, endpoint_state>& get_endpoint_states();

@@ -548,6 +548,7 @@ public:
    bool is_seed(const inet_address& endpoint) const;
    bool is_shutdown(const inet_address& endpoint) const;
    bool is_normal(const inet_address& endpoint) const;
+    bool is_cql_ready(const inet_address& endpoint) const;
    bool is_silent_shutdown_state(const endpoint_state& ep_state) const;
    void mark_as_shutdown(const inet_address& endpoint);
    void force_newer_generation();
--- a/gms/versioned_value.hh
+++ b/gms/versioned_value.hh
@@ -246,6 +246,9 @@ public:
            return versioned_value(hitrates);
        }

+        versioned_value cql_ready(bool value) {
+            return versioned_value(to_sstring(int(value)));
+        }
    };
 }; // class versioned_value

--- a/idl/streaming.idl.hh
+++ b/idl/streaming.idl.hh
@@ -42,4 +42,13 @@ class prepare_message {
    uint32_t dst_cpu_id;
 };

+enum class stream_reason : uint8_t {
+    unspecified,
+    bootstrap,
+    decommission,
+    removenode,
+    rebuild,
+    repair,
+};
+
 }
--- a/idl/view.idl.hh
+++ b/idl/view.idl.hh
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2018 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+namespace db {
+namespace view {
+class update_backlog {
+    size_t current;
+    size_t max;
+};
+}
+}
--- a/install.sh
+++ b/install.sh
@@ -93,7 +93,6 @@ install -m644 build/*.service -Dt "$rprefix"/lib/systemd/system
 install -m644 dist/common/systemd/*.service -Dt "$rprefix"/lib/systemd/system
 install -m644 dist/common/systemd/*.timer -Dt "$rprefix"/lib/systemd/system
 install -m755 dist/common/scripts/* -Dt "$rprefix"/lib/scylla/
-install -m755 seastar/scripts/posix_net_conf.sh "$rprefix"/lib/scylla/
 install -m755 seastar/scripts/perftune.py -Dt "$rprefix"/lib/scylla/
 install -m755 seastar/dpdk/usertools/dpdk-devbind.py -Dt "$rprefix"/lib/scylla/
 install -m755 build/release/scylla -Dt "$rprefix/bin"
@@ -116,6 +115,7 @@ install -m755 -d "$root"/var/lib/scylla/
 install -m755 -d "$root"/var/lib/scylla/data
 install -m755 -d "$root"/var/lib/scylla/commitlog
 install -m755 -d "$root"/var/lib/scylla/hints
+install -m755 -d "$root"/var/lib/scylla/view_hints
 install -m755 -d "$root"/var/lib/scylla/coredump
 install -m755 -d "$root"/var/lib/scylla-housekeeping
 install -m755 -d "$rprefix"/lib/scylla/swagger-ui
--- a/1
+++ b/1
--- a/licenses/libdeflate-license.txt
+++ b/licenses/libdeflate-license.txt
@@ -0,0 +1,21 @@
+Copyright 2016 Eric Biggers
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation files
+(the "Software"), to deal in the Software without restriction,
+including without limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of the Software,
+and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/locator/abstract_replication_strategy.cc
+++ b/locator/abstract_replication_strategy.cc
@@ -119,9 +119,17 @@ insert_token_range_to_sorted_container_while_unwrapping(
        const dht::token& tok,
        dht::token_range_vector& ret) {
    if (prev_tok < tok) {
-        ret.emplace_back(
-                dht::token_range::bound(prev_tok, false),
-                dht::token_range::bound(tok, true));
+        auto pos = ret.end();
+        if (!ret.empty() && !std::prev(pos)->end()) {
+            // We inserted a wrapped range (a, b] previously as
+            // (-inf, b], (a, +inf). So now we insert in the next-to-last
+            // position to keep the last range (a, +inf) at the end.
+            pos = std::prev(pos);
+        }
+        ret.insert(pos,
+                dht::token_range{
+                        dht::token_range::bound(prev_tok, false),
+                        dht::token_range::bound(tok, true)});
    } else {
        ret.emplace_back(
                dht::token_range::bound(prev_tok, false),
--- a/main.cc
+++ b/main.cc
@@ -33,6 +33,7 @@
 #include "service/storage_service.hh"
 #include "service/migration_manager.hh"
 #include "service/load_broadcaster.hh"
+#include "service/view_update_backlog_broker.hh"
 #include "streaming/stream_session.hh"
 #include "db/system_keyspace.hh"
 #include "db/system_distributed_keyspace.hh"
@@ -62,6 +63,7 @@
 #include "service/cache_hitrate_calculator.hh"
 #include "sstables/compaction_manager.hh"
 #include "sstables/sstables.hh"
+#include <db/view/view_update_from_staging_generator.hh>

 seastar::metrics::metric_groups app_metrics;

@@ -548,7 +550,7 @@ int main(int ac, char** av) {
                    directories.insert(std::move(shard_dir));
                }
            }
-            boost::filesystem::path view_pending_updates_base_dir = boost::filesystem::path(db.local().get_config().data_file_directories()[0]) / "view_pending_updates";
+            boost::filesystem::path view_pending_updates_base_dir = boost::filesystem::path(db.local().get_config().view_hints_directory());
            sstring view_pending_updates_base_dir_str = view_pending_updates_base_dir.native();
            dirs.touch_and_lock(view_pending_updates_base_dir_str).get();
            directories.insert(view_pending_updates_base_dir_str);
@@ -618,7 +620,8 @@ int main(int ac, char** av) {
            service::storage_proxy::config spcfg;
            spcfg.hinted_handoff_enabled = hinted_handoff_enabled;
            spcfg.available_memory = memory::stats().total_memory();
-            proxy.start(std::ref(db), spcfg).get();
+            static db::view::node_update_backlog node_backlog(smp::count, 10ms);
+            proxy.start(std::ref(db), spcfg, std::ref(node_backlog)).get();
            // #293 - do not stop anything
            // engine().at_exit([&proxy] { return proxy.stop(); });
            supervisor::notify("starting migration manager");
@@ -647,6 +650,21 @@ int main(int ac, char** av) {

            supervisor::notify("loading sstables");
            distributed_loader::init_non_system_keyspaces(db, proxy).get();
+
+            static sharded<db::view::view_update_from_staging_generator> view_update_from_staging_generator;
+            view_update_from_staging_generator.start(std::ref(db), std::ref(proxy)).get();
+            supervisor::notify("discovering staging sstables");
+            db.invoke_on_all([] (database& db) {
+                for (auto& x : db.get_column_families()) {
+                    table& t = *(x.second);
+                    for (sstables::shared_sstable sst : *t.get_sstables()) {
+                        if (sst->is_staging()) {
+                            view_update_from_staging_generator.local().register_staging_sstable(std::move(sst), t.shared_from_this());
+                        }
+                    }
+                }
+            }).get();
+
            // register connection drop notification to update cf's cache hit rate data
            db.invoke_on_all([] (database& db) {
                db.register_connection_drop_notifier(netw::get_local_messaging_service());
@@ -669,6 +687,11 @@ int main(int ac, char** av) {
                    cl->delete_segments(std::move(paths));
                }
            }
+
+            db.invoke_on_all([&proxy] (database& db) {
+                db.get_compaction_manager().start();
+            }).get();
+
            // If the same sstable is shared by several shards, it cannot be
            // deleted until all shards decide to compact it. So we want to
            // start these compactions now. Note we start compacting only after
@@ -700,9 +723,21 @@ int main(int ac, char** av) {
            proxy.invoke_on_all([] (service::storage_proxy& p) {
                p.init_messaging_service();
            }).get();
+
            supervisor::notify("starting streaming service");
-            streaming::stream_session::init_streaming_service(db).get();
+            streaming::stream_session::init_streaming_service(db, sys_dist_ks, view_update_from_staging_generator).get();
            api::set_server_stream_manager(ctx).get();
+
+            supervisor::notify("starting hinted handoff manager");
+            if (hinted_handoff_enabled) {
+                db::hints::manager::rebalance(cfg->hints_directory()).get();
+            }
+            db::hints::manager::rebalance(cfg->view_hints_directory()).get();
+
+            proxy.invoke_on_all([] (service::storage_proxy& local_proxy) {
+                local_proxy.start_hints_manager(gms::get_local_gossiper().shared_from_this(), service::get_local_storage_service().shared_from_this());
+            }).get();
+
            supervisor::notify("starting messaging service");
            // Start handling REPAIR_CHECKSUM_RANGE messages
            netw::get_messaging_service().invoke_on_all([&db] (auto& ms) {
@@ -735,20 +770,29 @@ int main(int ac, char** av) {
            cf_cache_hitrate_calculator.start(std::ref(db), std::ref(cf_cache_hitrate_calculator)).get();
            engine().at_exit([&cf_cache_hitrate_calculator] { return cf_cache_hitrate_calculator.stop(); });
            cf_cache_hitrate_calculator.local().run_on(engine().cpu_id());
+
+            supervisor::notify("starting view update backlog broker");
+            static sharded<service::view_update_backlog_broker> view_backlog_broker;
+            view_backlog_broker.start(std::ref(proxy), std::ref(gms::get_gossiper())).get();
+            view_backlog_broker.invoke_on_all(&service::view_update_backlog_broker::start).get();
+            engine().at_exit([] {
+                return view_backlog_broker.stop();
+            });
+
            api::set_server_cache(ctx);
            gms::get_local_gossiper().wait_for_gossip_to_settle().get();
            api::set_server_gossip_settle(ctx).get();

-            supervisor::notify("starting hinted handoff manager");
-            if (hinted_handoff_enabled) {
-                db::hints::manager::rebalance(cfg->hints_directory()).get();
-            }
-            db::hints::manager::rebalance(cfg->data_file_directories()[0] + "/view_pending_updates").get();
-
+            supervisor::notify("allow replaying hints");
            proxy.invoke_on_all([] (service::storage_proxy& local_proxy) {
-                local_proxy.start_hints_manager(gms::get_local_gossiper().shared_from_this(), service::get_local_storage_service().shared_from_this());
+                local_proxy.allow_replaying_hints();
            }).get();

+            if (cfg->view_building()) {
+                supervisor::notify("Launching generate_mv_updates for non system tables");
+                view_update_from_staging_generator.invoke_on_all(&db::view::view_update_from_staging_generator::start).get();
+            }
+
            static sharded<db::view::view_builder> view_builder;
            if (cfg->view_building()) {
                supervisor::notify("starting the view builder");
@@ -786,6 +830,11 @@ int main(int ac, char** av) {
            engine().at_exit([] {
                return repair_shutdown(service::get_local_storage_service().db());
            });
+
+            engine().at_exit([] {
+                return view_update_from_staging_generator.stop();
+            });
+
            engine().at_exit([] {
                return service::get_local_storage_service().drain_on_shutdown();
            });
--- a/memtable.hh
+++ b/memtable.hh
@@ -145,8 +145,8 @@ private:
    class encoding_stats_collector {
    private:
        min_max_tracker<api::timestamp_type> timestamp;
-        min_tracker<uint32_t> min_local_deletion_time;
-        min_tracker<uint32_t> min_ttl;
+        min_tracker<int32_t> min_local_deletion_time;
+        min_tracker<int32_t> min_ttl;

        void update_timestamp(api::timestamp_type ts) {
            if (ts != api::missing_timestamp) {
@@ -214,7 +214,9 @@ private:

        void update(const schema& s, const deletable_row& dr) {
            update(dr.marker());
-            update(dr.deleted_at().tomb());
+            row_tombstone row_tomb = dr.deleted_at();
+            update(row_tomb.regular());
+            update(row_tomb.tomb());
            update(s, dr.cells(), column_kind::regular_column);
        }

--- a/message/messaging_service.cc
+++ b/message/messaging_service.cc
@@ -34,6 +34,7 @@
 #include "rpc/rpc.hh"
 #include "db/config.hh"
 #include "db/system_keyspace.hh"
+#include "db/view/view_update_backlog.hh"
 #include "dht/i_partitioner.hh"
 #include "range.hh"
 #include "frozen_schema.hh"
@@ -56,6 +57,7 @@
 #include "idl/partition_checksum.dist.hh"
 #include "idl/query.dist.hh"
 #include "idl/cache_temperature.dist.hh"
+#include "idl/view.dist.hh"
 #include "serializer_impl.hh"
 #include "serialization_visitors.hh"
 #include "idl/consistency_level.dist.impl.hh"
@@ -77,6 +79,7 @@
 #include "idl/cache_temperature.dist.impl.hh"
 #include "rpc/lz4_compressor.hh"
 #include "rpc/multi_algo_compressor_factory.hh"
+#include "idl/view.dist.impl.hh"
 #include "partition_range_compat.hh"
 #include "stdx.hh"
 #include <boost/range/adaptor/filtered.hpp>
@@ -135,12 +138,14 @@ struct messaging_service::rpc_protocol_wrapper : public rpc_protocol { using rpc
 // This should be integrated into messaging_service proper.
 class messaging_service::rpc_protocol_client_wrapper {
    std::unique_ptr<rpc_protocol::client> _p;
+    ::shared_ptr<seastar::tls::server_credentials> _credentials;
 public:
    rpc_protocol_client_wrapper(rpc_protocol& proto, rpc::client_options opts, ipv4_addr addr, ipv4_addr local = ipv4_addr())
            : _p(std::make_unique<rpc_protocol::client>(proto, std::move(opts), addr, local)) {
    }
    rpc_protocol_client_wrapper(rpc_protocol& proto, rpc::client_options opts, ipv4_addr addr, ipv4_addr local, ::shared_ptr<seastar::tls::server_credentials> c)
            : _p(std::make_unique<rpc_protocol::client>(proto, std::move(opts), seastar::tls::socket(c), addr, local))
+            , _credentials(c)
    {}
    auto get_stats() const { return _p->get_stats(); }
    future<> stop() { return _p->stop(); }
@@ -148,6 +153,19 @@ public:
        return _p->error();
    }
    operator rpc_protocol::client&() { return *_p; }
+
+    /**
+     * #3787 Must ensure we use the right type of socker. I.e. tls or not.
+     * See above, we retain credentials object so we here can know if we
+     * are tls or not.
+     */
+    template<typename Serializer, typename... Out>
+    future<rpc::sink<Out...>> make_stream_sink() {
+        if (_credentials) {
+            return _p->make_stream_sink<Serializer, Out...>(seastar::tls::socket(_credentials));
+        }
+        return _p->make_stream_sink<Serializer, Out...>();
+    }
 };

 struct messaging_service::rpc_protocol_server_wrapper : public rpc_protocol::server { using rpc_protocol::server::server; };
@@ -638,17 +656,18 @@ rpc::sink<int32_t> messaging_service::make_sink_for_stream_mutation_fragments(rp
 }

 future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>>
-messaging_service::make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, msg_addr id) {
-    rpc_protocol::client& rpc_client = *get_rpc_client(messaging_verb::STREAM_MUTATION_FRAGMENTS, id);
-    return rpc_client.make_stream_sink<netw::serializer, frozen_mutation_fragment>().then([this, plan_id, schema_id, cf_id, estimated_partitions, &rpc_client] (rpc::sink<frozen_mutation_fragment> sink) mutable {
-        auto rpc_handler = rpc()->make_client<rpc::source<int32_t> (utils::UUID, utils::UUID, utils::UUID, uint64_t, rpc::sink<frozen_mutation_fragment>)>(messaging_verb::STREAM_MUTATION_FRAGMENTS);
-        return rpc_handler(rpc_client , plan_id, schema_id, cf_id, estimated_partitions, sink).then([sink] (rpc::source<int32_t> source) mutable {
+messaging_service::make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id) {
+    auto wrapper = get_rpc_client(messaging_verb::STREAM_MUTATION_FRAGMENTS, id);
+    rpc_protocol::client& rpc_client = *wrapper;
+    return wrapper->make_stream_sink<netw::serializer, frozen_mutation_fragment>().then([this, plan_id, schema_id, cf_id, estimated_partitions, reason, &rpc_client] (rpc::sink<frozen_mutation_fragment> sink) mutable {
+        auto rpc_handler = rpc()->make_client<rpc::source<int32_t> (utils::UUID, utils::UUID, utils::UUID, uint64_t, streaming::stream_reason, rpc::sink<frozen_mutation_fragment>)>(messaging_verb::STREAM_MUTATION_FRAGMENTS);
+        return rpc_handler(rpc_client , plan_id, schema_id, cf_id, estimated_partitions, reason, sink).then([sink] (rpc::source<int32_t> source) mutable {
            return make_ready_future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>>(std::move(sink), std::move(source));
        });
    });
 }

-void messaging_service::register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::source<frozen_mutation_fragment> source)>&& func) {
+void messaging_service::register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason>, rpc::source<frozen_mutation_fragment> source)>&& func) {
    register_handler(this, messaging_verb::STREAM_MUTATION_FRAGMENTS, std::move(func));
 }

@@ -726,13 +745,13 @@ auto send_message_oneway_timeout(messaging_service* ms, Timeout timeout, messagi

 // PREPARE_MESSAGE
 void messaging_service::register_prepare_message(std::function<future<streaming::prepare_message> (const rpc::client_info& cinfo,
-        streaming::prepare_message msg, UUID plan_id, sstring description)>&& func) {
+        streaming::prepare_message msg, UUID plan_id, sstring description, rpc::optional<streaming::stream_reason> reason)>&& func) {
    register_handler(this, messaging_verb::PREPARE_MESSAGE, std::move(func));
 }
 future<streaming::prepare_message> messaging_service::send_prepare_message(msg_addr id, streaming::prepare_message msg, UUID plan_id,
-        sstring description) {
+        sstring description, streaming::stream_reason reason) {
    return send_message<streaming::prepare_message>(this, messaging_verb::PREPARE_MESSAGE, id,
-        std::move(msg), plan_id, std::move(description));
+        std::move(msg), plan_id, std::move(description), reason);
 }

 // PREPARE_DONE_MESSAGE
@@ -745,12 +764,12 @@ future<> messaging_service::send_prepare_done_message(msg_addr id, UUID plan_id,
 }

 // STREAM_MUTATION
-void messaging_service::register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool> fragmented)>&& func) {
+void messaging_service::register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool> fragmented, rpc::optional<streaming::stream_reason> reason)>&& func) {
    register_handler(this, messaging_verb::STREAM_MUTATION, std::move(func));
 }
-future<> messaging_service::send_stream_mutation(msg_addr id, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, bool fragmented) {
+future<> messaging_service::send_stream_mutation(msg_addr id, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, bool fragmented, streaming::stream_reason reason) {
    return send_message<void>(this, messaging_verb::STREAM_MUTATION, id,
-        plan_id, std::move(fm), dst_cpu_id, fragmented);
+        plan_id, std::move(fm), dst_cpu_id, fragmented, reason);
 }

 // STREAM_MUTATION_DONE
@@ -873,24 +892,24 @@ future<> messaging_service::send_counter_mutation(msg_addr id, clock_type::time_
    return send_message_timeout<void>(this, messaging_verb::COUNTER_MUTATION, std::move(id), timeout, std::move(fms), cl, std::move(trace_info));
 }

-void messaging_service::register_mutation_done(std::function<future<rpc::no_wait_type> (const rpc::client_info& cinfo, unsigned shard, response_id_type response_id)>&& func) {
+void messaging_service::register_mutation_done(std::function<future<rpc::no_wait_type> (const rpc::client_info& cinfo, unsigned shard, response_id_type response_id, rpc::optional<db::view::update_backlog> backlog)>&& func) {
    register_handler(this, netw::messaging_verb::MUTATION_DONE, std::move(func));
 }
 void messaging_service::unregister_mutation_done() {
    _rpc->unregister_handler(netw::messaging_verb::MUTATION_DONE);
 }
-future<> messaging_service::send_mutation_done(msg_addr id, unsigned shard, response_id_type response_id) {
-    return send_message_oneway(this, messaging_verb::MUTATION_DONE, std::move(id), std::move(shard), std::move(response_id));
+future<> messaging_service::send_mutation_done(msg_addr id, unsigned shard, response_id_type response_id, db::view::update_backlog backlog) {
+    return send_message_oneway(this, messaging_verb::MUTATION_DONE, std::move(id), std::move(shard), std::move(response_id), std::move(backlog));
 }

-void messaging_service::register_mutation_failed(std::function<future<rpc::no_wait_type> (const rpc::client_info& cinfo, unsigned shard, response_id_type response_id, size_t num_failed)>&& func) {
+void messaging_service::register_mutation_failed(std::function<future<rpc::no_wait_type> (const rpc::client_info& cinfo, unsigned shard, response_id_type response_id, size_t num_failed, rpc::optional<db::view::update_backlog> backlog)>&& func) {
    register_handler(this, netw::messaging_verb::MUTATION_FAILED, std::move(func));
 }
 void messaging_service::unregister_mutation_failed() {
    _rpc->unregister_handler(netw::messaging_verb::MUTATION_FAILED);
 }
-future<> messaging_service::send_mutation_failed(msg_addr id, unsigned shard, response_id_type response_id, size_t num_failed) {
-    return send_message_oneway(this, messaging_verb::MUTATION_FAILED, std::move(id), std::move(shard), std::move(response_id), num_failed);
+future<> messaging_service::send_mutation_failed(msg_addr id, unsigned shard, response_id_type response_id, size_t num_failed, db::view::update_backlog backlog) {
+    return send_message_oneway(this, messaging_verb::MUTATION_FAILED, std::move(id), std::move(shard), std::move(response_id), num_failed, std::move(backlog));
 }

 void messaging_service::register_read_data(std::function<future<foreign_ptr<lw_shared_ptr<query::result>>, cache_temperature> (const rpc::client_info&, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional<query::digest_algorithm> oda)>&& func) {
--- a/message/messaging_service.hh
+++ b/message/messaging_service.hh
@@ -35,6 +35,7 @@
 #include "repair/repair.hh"
 #include "tracing/tracing.hh"
 #include "digest_algorithm.hh"
+#include "streaming/stream_reason.hh"

 #include <seastar/net/tls.hh>

@@ -57,6 +58,10 @@ namespace db {
 class seed_provider_type;
 }

+namespace db::view {
+class update_backlog;
+}
+
 class frozen_mutation;
 class frozen_schema;
 class partition_checksum;
@@ -237,23 +242,23 @@ public:

    // Wrapper for PREPARE_MESSAGE verb
    void register_prepare_message(std::function<future<streaming::prepare_message> (const rpc::client_info& cinfo,
-            streaming::prepare_message msg, UUID plan_id, sstring description)>&& func);
+            streaming::prepare_message msg, UUID plan_id, sstring description, rpc::optional<streaming::stream_reason> reason)>&& func);
    future<streaming::prepare_message> send_prepare_message(msg_addr id, streaming::prepare_message msg, UUID plan_id,
-            sstring description);
+            sstring description, streaming::stream_reason);

    // Wrapper for PREPARE_DONE_MESSAGE verb
    void register_prepare_done_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func);
    future<> send_prepare_done_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id);

    // Wrapper for STREAM_MUTATION verb
-    void register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool>)>&& func);
-    future<> send_stream_mutation(msg_addr id, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, bool fragmented);
+    void register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool>, rpc::optional<streaming::stream_reason>)>&& func);
+    future<> send_stream_mutation(msg_addr id, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, bool fragmented, streaming::stream_reason reason);

    // Wrapper for STREAM_MUTATION_FRAGMENTS
    // The receiver of STREAM_MUTATION_FRAGMENTS sends status code to the sender to notify any error on the receiver side. The status code is of type int32_t. 0 means successful, -1 means error, other status code value are reserved for future use.
-    void register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::source<frozen_mutation_fragment> source)>&& func);
+    void register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason> reason_opt, rpc::source<frozen_mutation_fragment> source)>&& func);
    rpc::sink<int32_t> make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment>& source);
-    future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>> make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, msg_addr id);
+    future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>> make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id);

    void register_stream_mutation_done(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id)>&& func);
    future<> send_stream_mutation_done(msg_addr id, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id);
@@ -316,14 +321,14 @@ public:
    future<> send_counter_mutation(msg_addr id, clock_type::time_point timeout, std::vector<frozen_mutation> fms, db::consistency_level cl, stdx::optional<tracing::trace_info> trace_info = std::experimental::nullopt);

    // Wrapper for MUTATION_DONE
-    void register_mutation_done(std::function<future<rpc::no_wait_type> (const rpc::client_info& cinfo, unsigned shard, response_id_type response_id)>&& func);
+    void register_mutation_done(std::function<future<rpc::no_wait_type> (const rpc::client_info& cinfo, unsigned shard, response_id_type response_id, rpc::optional<db::view::update_backlog> backlog)>&& func);
    void unregister_mutation_done();
-    future<> send_mutation_done(msg_addr id, unsigned shard, response_id_type response_id);
+    future<> send_mutation_done(msg_addr id, unsigned shard, response_id_type response_id, db::view::update_backlog backlog);

    // Wrapper for MUTATION_FAILED
-    void register_mutation_failed(std::function<future<rpc::no_wait_type> (const rpc::client_info& cinfo, unsigned shard, response_id_type response_id, size_t num_failed)>&& func);
+    void register_mutation_failed(std::function<future<rpc::no_wait_type> (const rpc::client_info& cinfo, unsigned shard, response_id_type response_id, size_t num_failed, rpc::optional<db::view::update_backlog> backlog)>&& func);
    void unregister_mutation_failed();
-    future<> send_mutation_failed(msg_addr id, unsigned shard, response_id_type response_id, size_t num_failed);
+    future<> send_mutation_failed(msg_addr id, unsigned shard, response_id_type response_id, size_t num_failed, db::view::update_backlog backlog);

    // Wrapper for READ_DATA
    // Note: WTH is future<foreign_ptr<lw_shared_ptr<query::result>>
--- a/multishard_mutation_query.cc
+++ b/multishard_mutation_query.cc
@@ -25,6 +25,8 @@

 #include <boost/range/adaptor/reversed.hpp>

+#include <fmt/ostream.h>
+
 logging::logger mmq_log("multishard_mutation_query");

 template <typename T>
@@ -60,7 +62,7 @@ using foreign_unique_ptr = foreign_ptr<std::unique_ptr<T>>;
 /// 3) Both, `read_context::lookup_readers()` and `read_context::save_readers()`
 ///    knows to do nothing when the query is not stateful and just short
 ///    circuit.
-class read_context {
+class read_context : public reader_lifecycle_policy {
    struct reader_params {
        std::unique_ptr<const dht::partition_range> range;
        std::unique_ptr<const query::partition_slice> slice;
@@ -79,6 +81,21 @@ class read_context {
        foreign_unique_ptr<reader_params> params;
        foreign_unique_ptr<utils::phased_barrier::operation> read_operation;
        foreign_unique_ptr<flat_mutation_reader> reader;
+        reader_concurrency_semaphore* semaphore;
+    };
+    struct paused_reader {
+        shard_id shard;
+        reader_concurrency_semaphore::inactive_read_handle handle;
+        bool has_pending_next_partition;
+    };
+    struct inactive_read : public reader_concurrency_semaphore::inactive_read {
+        foreign_unique_ptr<flat_mutation_reader> reader;
+        explicit inactive_read(foreign_unique_ptr<flat_mutation_reader> reader)
+            : reader(std::move(reader)) {
+        }
+        virtual void evict() override {
+            reader.reset();
+        }
    };

    using inexistent_state = std::monostate;
@@ -94,66 +111,72 @@ class read_context {
    struct dismantling_state {
        foreign_unique_ptr<reader_params> params;
        foreign_unique_ptr<utils::phased_barrier::operation> read_operation;
-        future<stopped_foreign_reader> reader_fut;
+        std::variant<foreign_unique_ptr<flat_mutation_reader>, paused_reader> reader;
        circular_buffer<mutation_fragment> buffer;
    };
    struct ready_to_save_state {
        foreign_unique_ptr<reader_params> params;
        foreign_unique_ptr<utils::phased_barrier::operation> read_operation;
-        foreign_unique_ptr<flat_mutation_reader> reader;
+        std::variant<foreign_unique_ptr<flat_mutation_reader>, paused_reader> reader;
        circular_buffer<mutation_fragment> buffer;
    };
-    struct future_used_state {
-        future<used_state> fut;
+    struct paused_state {
+        foreign_unique_ptr<reader_params> params;
+        foreign_unique_ptr<utils::phased_barrier::operation> read_operation;
+        reader_concurrency_semaphore::inactive_read_handle handle;
    };
-    struct future_dismantling_state {
-        future<dismantling_state> fut;
+    struct evicted_state {
    };

-    //                           ( )
+    //              ( )    (O)
+    //               |      ^
+    //               |      |
+    //         +--- inexistent ---+
+    //         |                  |
+    //     (1) |              (3) |    (3)
+    //         |                  |  +------ evicted -> (O)
+    //  successful_lookup         |  |          ^
+    //     |         |            |  |  (7)     |
+    //     |         |            |  +-------+  | (8)
+    //     |         |    (4)     |  |       |  |
+    //     |         +----------> used      paused
+    //     |                      |  |  (6)  ^  |
+    // (2) |                      |  +-------+  |
+    //     |                  (5) |             | (5)
+    //     |                      |             |
+    //     |                      |             |
+    //     |                 dismantling <------+
+    //     |                      |
+    //     |                  (2) |
+    //     |                      |
+    //     +---------------> ready_to_save
    //                            |
-    //            +------ inexistent_state -----+
-    //            |                             |
-    //        (1) |                         (6) |
-    //            |                             |
-    //  successful_lookup_state         future_used_state
-    //     |              |               |           |
-    // (2) |          (3) |           (7) |       (8) |
-    //     |              |               |           |
-    //     |         used_state <---------+  future_dismantling_state
-    //     |              |                           |
-    //     |          (4) |                       (9) |
-    //     |              |                           |
-    //     |      dismantling_state <-----------------+
-    //     |              |
-    //     |          (5) |
-    //     |              |
-    //     +----> ready_to_save_state
-    //                    |
-    //                   (O)
+    //                           (O)
    //
    //  1) lookup_readers()
    //  2) save_readers()
-    //  3) make_remote_reader()
-    //  4) dismantle_reader()
-    //  5) prepare_reader_for_saving()
-    //  6) do_make_remote_reader()
-    //  7) reader is created
-    //  8) dismantle_reader()
-    //  9) reader is created
+    //  3) do_make_remote_reader()
+    //  4) make_remote_reader()
+    //  5) dismantle_reader()
+    //  6) pause_reader()
+    //  7) try_resume() - success
+    //  8) try_resume() - failure
    using reader_state = std::variant<
        inexistent_state,
        successful_lookup_state,
        used_state,
+        paused_state,
+        evicted_state,
        dismantling_state,
-        ready_to_save_state,
-        future_used_state,
-        future_dismantling_state>;
+        ready_to_save_state>;

    struct dismantle_buffer_stats {
        size_t partitions = 0;
        size_t fragments = 0;
        size_t bytes = 0;
+        size_t discarded_partitions = 0;
+        size_t discarded_fragments = 0;
+        size_t discarded_bytes = 0;

        void add(const schema& s, const mutation_fragment& mf) {
            partitions += unsigned(mf.is_partition_start());
@@ -173,6 +196,35 @@ class read_context {
            ++fragments;
            bytes += ps.memory_usage(s);
        }
+        void add_discarded(const schema& s, const mutation_fragment& mf) {
+            discarded_partitions += unsigned(mf.is_partition_start());
+            ++discarded_fragments;
+            discarded_bytes += mf.memory_usage(s);
+        }
+        void add_discarded(const schema& s, const range_tombstone& rt) {
+            ++discarded_fragments;
+            discarded_bytes += rt.memory_usage(s);
+        }
+        void add_discarded(const schema& s, const static_row& sr) {
+            ++discarded_fragments;
+            discarded_bytes += sr.memory_usage(s);
+        }
+        void add_discarded(const schema& s, const partition_start& ps) {
+            ++discarded_partitions;
+            ++discarded_fragments;
+            discarded_bytes += ps.memory_usage(s);
+        }
+        friend std::ostream& operator<<(std::ostream& os, const dismantle_buffer_stats& s) {
+            os << format(
+                    "kept {} partitions/{} fragments/{} bytes, discarded {} partitions/{} fragments/{} bytes",
+                    s.partitions,
+                    s.fragments,
+                    s.bytes,
+                    s.discarded_partitions,
+                    s.discarded_fragments,
+                    s.discarded_bytes);
+            return os;
+        }
    };

    distributed<database>& _db;
@@ -183,6 +235,15 @@ class read_context {

    // One for each shard. Index is shard id.
    std::vector<reader_state> _readers;
+    std::vector<reader_concurrency_semaphore*> _semaphores;
+
+    gate _dismantling_gate;
+
+    reader_concurrency_semaphore& semaphore() {
+        return *_semaphores[engine().cpu_id()];
+    }
+
+    static std::string_view reader_state_to_string(const reader_state& rs);

    static future<bundled_remote_reader> do_make_remote_reader(
            distributed<database>& db,
@@ -200,13 +261,10 @@ class read_context {
            const query::partition_slice& ps,
            const io_priority_class& pc,
            tracing::trace_state_ptr trace_state,
-            streamed_mutation::forwarding fwd_sm,
            mutation_reader::forwarding fwd_mr);

-    void dismantle_reader(shard_id shard, future<stopped_foreign_reader>&& stopped_reader_fut);
+    void dismantle_reader(shard_id shard, future<paused_or_stopped_reader>&& reader_fut);

-    ready_to_save_state* prepare_reader_for_saving(dismantling_state& current_state, future<stopped_foreign_reader>&& stopped_reader_fut,
-            const dht::decorated_key& last_pkey, const std::optional<clustering_key_prefix>& last_ckey);
    dismantle_buffer_stats dismantle_combined_buffer(circular_buffer<mutation_fragment> combined_buffer, const dht::decorated_key& pkey);
    dismantle_buffer_stats dismantle_compaction_state(detached_compaction_state compaction_state);
    future<> save_reader(ready_to_save_state& current_state, const dht::decorated_key& last_pkey,
@@ -221,6 +279,7 @@ public:
            , _ranges(ranges)
            , _trace_state(std::move(trace_state)) {
        _readers.resize(smp::count);
+        _semaphores.resize(smp::count);
    }

    read_context(read_context&&) = delete;
@@ -229,26 +288,24 @@ public:
    read_context& operator=(read_context&&) = delete;
    read_context& operator=(const read_context&) = delete;

-    remote_reader_factory factory() {
-        return [this] (
-                shard_id shard,
-                schema_ptr schema,
-                const dht::partition_range& pr,
-                const query::partition_slice& ps,
-                const io_priority_class& pc,
-                tracing::trace_state_ptr trace_state,
-                streamed_mutation::forwarding fwd_sm,
-                mutation_reader::forwarding fwd_mr) {
-            return make_remote_reader(shard, std::move(schema), pr, ps, pc, std::move(trace_state), fwd_sm, fwd_mr);
-        };
+    virtual future<foreign_unique_ptr<flat_mutation_reader>> create_reader(
+            shard_id shard,
+            schema_ptr schema,
+            const dht::partition_range& pr,
+            const query::partition_slice& ps,
+            const io_priority_class& pc,
+            tracing::trace_state_ptr trace_state,
+            mutation_reader::forwarding fwd_mr) override {
+        return make_remote_reader(shard, std::move(schema), pr, ps, pc, std::move(trace_state), fwd_mr);
    }

-    foreign_reader_dismantler dismantler() {
-        return [this] (shard_id shard, future<stopped_foreign_reader>&& stopped_reader_fut) {
-            dismantle_reader(shard, std::move(stopped_reader_fut));
-        };
+    virtual void destroy_reader(shard_id shard, future<paused_or_stopped_reader> reader_fut) noexcept override {
+        dismantle_reader(shard, std::move(reader_fut));
    }

+    virtual future<> pause(foreign_unique_ptr<flat_mutation_reader> reader) override;
+    virtual future<foreign_unique_ptr<flat_mutation_reader>> try_resume(shard_id shard) override;
+
    future<> lookup_readers();

    future<> save_readers(circular_buffer<mutation_fragment> unconsumed_buffer, detached_compaction_state compaction_state,
@@ -257,6 +314,28 @@ public:
    future<> stop();
 };

+// Deliberatly not using the `reader_state` alias here, so that we can enlist
+// the help of the compiler to keep this up-to-date.
+std::string_view read_context::reader_state_to_string(const std::variant<
+        inexistent_state,
+        successful_lookup_state,
+        used_state,
+        paused_state,
+        evicted_state,
+        dismantling_state,
+        ready_to_save_state>& rs) {
+    static const std::array<const char*, 7> reader_state_names{
+        "inexistent",
+        "successful_lookup",
+        "used",
+        "paused",
+        "evicted",
+        "dismantling",
+        "ready_to_save",
+    };
+    return reader_state_names.at(rs.index());
+}
+
 future<read_context::bundled_remote_reader> read_context::do_make_remote_reader(
        distributed<database>& db,
        shard_id shard,
@@ -278,7 +357,8 @@ future<read_context::bundled_remote_reader> read_context::do_make_remote_reader(
        return make_ready_future<bundled_remote_reader>(bundled_remote_reader{
                make_foreign(std::make_unique<reader_params>(std::move(params))),
                make_foreign(std::make_unique<utils::phased_barrier::operation>(std::move(read_operation))),
-                make_foreign(std::make_unique<flat_mutation_reader>(std::move(reader)))});
+                make_foreign(std::make_unique<flat_mutation_reader>(std::move(reader))),
+                &table.read_concurrency_semaphore()});
    });
 }

@@ -289,117 +369,95 @@ future<foreign_unique_ptr<flat_mutation_reader>> read_context::make_remote_reade
        const query::partition_slice& ps,
        const io_priority_class& pc,
        tracing::trace_state_ptr trace_state,
-        streamed_mutation::forwarding,
        mutation_reader::forwarding) {
    auto& rs = _readers[shard];

-    if (!std::holds_alternative<successful_lookup_state>(rs) && !std::holds_alternative<inexistent_state>(rs)) {
-        mmq_log.warn("Unexpected request to create reader for shard {}. A reader for this shard was already created.", shard);
-        throw std::logic_error(sprint("Unexpected request to create reader for shard {}."
-                    " A reader for this shard was already created in the context of this read.", shard));
+    if (!std::holds_alternative<successful_lookup_state>(rs) && !std::holds_alternative<inexistent_state>(rs)
+            && !std::holds_alternative<evicted_state>(rs)) {
+        auto msg = format("Unexpected request to create reader for shard {}."
+                " The reader is expected to be in either `successful_lookup`, `inexistent` or `evicted` state,"
+                " but is in `{}` state instead.", shard, reader_state_to_string(rs));
+        mmq_log.warn(msg.c_str());
+        throw std::logic_error(msg.c_str());
    }

-    // The reader is either in inexistent or successful lookup state.
+    // The reader is either in inexistent, evicted or successful lookup state.
    if (auto current_state = std::get_if<successful_lookup_state>(&rs)) {
        auto reader = std::move(current_state->reader);
        rs = used_state{std::move(current_state->params), std::move(current_state->read_operation)};
        return make_ready_future<foreign_unique_ptr<flat_mutation_reader>>(std::move(reader));
    }

-    auto created = promise<used_state>();
-    rs = future_used_state{created.get_future()};
-    return do_make_remote_reader(_db, shard, std::move(schema), pr, ps, pc, std::move(trace_state)).then_wrapped([this, &rs,
-            created = std::move(created)] (future<bundled_remote_reader>&& bundled_reader_fut) mutable {
-        if (bundled_reader_fut.failed()) {
-            auto ex = bundled_reader_fut.get_exception();
-            if (!std::holds_alternative<future_used_state>(rs)) {
-                created.set_exception(ex);
-            }
-            return make_exception_future<foreign_unique_ptr<flat_mutation_reader>>(std::move(ex));
-        }
-
-        auto bundled_reader = bundled_reader_fut.get0();
-        auto new_state = used_state{std::move(bundled_reader.params), std::move(bundled_reader.read_operation)};
-        if (std::holds_alternative<future_used_state>(rs)) {
-            rs = std::move(new_state);
-        } else {
-            created.set_value(std::move(new_state));
-        }
+    return do_make_remote_reader(_db, shard, std::move(schema), pr, ps, pc, std::move(trace_state)).then(
+            [this, &rs, shard] (bundled_remote_reader&& bundled_reader) mutable {
+        _semaphores[shard] = bundled_reader.semaphore;
+        rs = used_state{std::move(bundled_reader.params), std::move(bundled_reader.read_operation)};
        return make_ready_future<foreign_unique_ptr<flat_mutation_reader>>(std::move(bundled_reader.reader));
    });
 }

-void read_context::dismantle_reader(shard_id shard, future<stopped_foreign_reader>&& stopped_reader_fut) {
-    auto& rs = _readers[shard];
+void read_context::dismantle_reader(shard_id shard, future<paused_or_stopped_reader>&& reader_fut) {
+    with_gate(_dismantling_gate, [this, shard, reader_fut = std::move(reader_fut)] () mutable {
+        return reader_fut.then_wrapped([this, shard] (future<paused_or_stopped_reader>&& reader_fut) {
+            auto& rs = _readers[shard];

-    if (auto* maybe_used_state = std::get_if<used_state>(&rs)) {
-        auto read_operation = std::move(maybe_used_state->read_operation);
-        auto params = std::move(maybe_used_state->params);
-        rs = dismantling_state{std::move(params), std::move(read_operation), std::move(stopped_reader_fut), circular_buffer<mutation_fragment>{}};
-    } else if (auto* maybe_future_used_state = std::get_if<future_used_state>(&rs)) {
-        auto f = maybe_future_used_state->fut.then([stopped_reader_fut = std::move(stopped_reader_fut)] (used_state&& current_state) mutable {
-            auto read_operation = std::move(current_state.read_operation);
-            auto params = std::move(current_state.params);
-            return dismantling_state{std::move(params), std::move(read_operation), std::move(stopped_reader_fut),
-                circular_buffer<mutation_fragment>{}};
+            if (reader_fut.failed()) {
+                mmq_log.debug("Failed to stop reader on shard {}: {}", shard, reader_fut.get_exception());
+                ++_db.local().get_stats().multishard_query_failed_reader_stops;
+                rs = inexistent_state{};
+                return;
+            }
+
+            auto reader = reader_fut.get0();
+            if (auto* maybe_used_state = std::get_if<used_state>(&rs)) {
+                auto read_operation = std::move(maybe_used_state->read_operation);
+                auto params = std::move(maybe_used_state->params);
+                rs = dismantling_state{std::move(params), std::move(read_operation), std::move(reader.remote_reader),
+                        std::move(reader.unconsumed_fragments)};
+            } else if (auto* maybe_paused_state = std::get_if<paused_state>(&rs)) {
+                auto read_operation = std::move(maybe_paused_state->read_operation);
+                auto params = std::move(maybe_paused_state->params);
+                auto handle = maybe_paused_state->handle;
+                rs = dismantling_state{std::move(params), std::move(read_operation), paused_reader{shard, handle, reader.has_pending_next_partition},
+                        std::move(reader.unconsumed_fragments)};
+            // Do nothing for evicted readers.
+            } else if (!std::holds_alternative<evicted_state>(rs)) {
+                mmq_log.warn(
+                        "Unexpected request to dismantle reader in state `{}` for shard {}."
+                        " Reader was not created nor is in the process of being created.",
+                        reader_state_to_string(rs),
+                        shard);
+            }
        });
-        rs = future_dismantling_state{std::move(f)};
-    } else {
-        mmq_log.warn("Unexpected request to dismantle reader for shard {}. Reader was not created nor is in the process of being created.", shard);
-    }
+    });
 }

 future<> read_context::stop() {
-    auto cleanup = [db = &_db.local()] (shard_id shard, dismantling_state state) {
-        return state.reader_fut.then_wrapped([db, shard, params = std::move(state.params),
-                read_operation = std::move(state.read_operation)] (future<stopped_foreign_reader>&& fut) mutable {
-            if (fut.failed()) {
-                mmq_log.debug("Failed to stop reader on shard {}: {}", shard, fut.get_exception());
-                ++db->get_stats().multishard_query_failed_reader_stops;
-            } else {
-                smp::submit_to(shard, [reader = fut.get0().remote_reader, params = std::move(params),
-                        read_operation = std::move(read_operation)] () mutable {
-                    reader.release();
+    auto pr = promise<>();
+    auto fut = pr.get_future();
+    auto gate_fut = _dismantling_gate.is_closed() ? make_ready_future<>() : _dismantling_gate.close();
+    gate_fut.then([this] {
+        for (shard_id shard = 0; shard != smp::count; ++shard) {
+            if (auto* maybe_dismantling_state = std::get_if<dismantling_state>(&_readers[shard])) {
+                _db.invoke_on(shard, [schema = global_schema_ptr(_schema), reader = std::move(maybe_dismantling_state->reader),
+                        params = std::move(maybe_dismantling_state->params),
+                        read_operation = std::move(maybe_dismantling_state->read_operation)] (database& db) mutable {
+                    if (auto* maybe_stopped_reader = std::get_if<foreign_unique_ptr<flat_mutation_reader>>(&reader)) {
+                        maybe_stopped_reader->release();
+                    } else {
+                        // We cannot use semaphore() here, as this can be already destroyed.
+                        auto& table = db.find_column_family(schema);
+                        table.read_concurrency_semaphore().unregister_inactive_read(std::get<paused_reader>(reader).handle);
+                    }
                    params.release();
                    read_operation.release();
                });
            }
-        });
-    };
-
-    std::vector<future<>> futures;
-    auto immediate_cleanup = size_t(0);
-    auto future_cleanup = size_t(0);
-
-    // Wait for pending read-aheads in the background.
-    for (shard_id shard = 0; shard != smp::count; ++shard) {
-        auto& rs = _readers[shard];
-
-        if (auto maybe_dismantling_state = std::get_if<dismantling_state>(&rs)) {
-            ++immediate_cleanup;
-            cleanup(shard, std::move(*maybe_dismantling_state));
-        } else if (auto maybe_future_dismantling_state = std::get_if<future_dismantling_state>(&rs)) {
-            ++future_cleanup;
-            futures.emplace_back(maybe_future_dismantling_state->fut.then_wrapped([=] (future<dismantling_state>&& current_state_fut) {
-                if (current_state_fut.failed()) {
-                    mmq_log.debug("Failed to stop reader on shard {}: {}", shard, current_state_fut.get_exception());
-                    ++_db.local().get_stats().multishard_query_failed_reader_stops;
-                } else {
-                    cleanup(shard, current_state_fut.get0());
-                }
-            }));
        }
-    }
-
-    if (const auto total = immediate_cleanup + future_cleanup) {
-        tracing::trace(_trace_state,
-                "Stopping {} shard readers, {} ready for immediate cleanup, {} will be cleaned up after finishes read-ahead",
-                total,
-                immediate_cleanup,
-                future_cleanup);
-    }
-
-    return when_all(futures.begin(), futures.end()).discard_result();
+    }).finally([pr = std::move(pr)] () mutable {
+        pr.set_value();
+    });
+    return fut;
 }

 read_context::dismantle_buffer_stats read_context::dismantle_combined_buffer(circular_buffer<mutation_fragment> combined_buffer,
@@ -414,7 +472,21 @@ read_context::dismantle_buffer_stats read_context::dismantle_combined_buffer(cir
    for (;rit != rend; ++rit) {
        if (rit->is_partition_start()) {
            const auto shard = partitioner.shard_of(rit->as_partition_start().key().token());
-            auto& shard_buffer = std::get<dismantling_state>(_readers[shard]).buffer;
+            auto maybe_dismantling_state = std::get_if<dismantling_state>(&_readers[shard]);
+
+            // It is possible that the reader this partition originates from
+            // does not exist anymore. Either because we failed stopping it or
+            // because it was evicted.
+            if (!maybe_dismantling_state) {
+                for (auto& smf : tmp_buffer) {
+                    stats.add_discarded(*_schema, smf);
+                }
+                stats.add_discarded(*_schema, *rit);
+                tmp_buffer.clear();
+                continue;
+            }
+
+            auto& shard_buffer = maybe_dismantling_state->buffer;
            for (auto& smf : tmp_buffer) {
                stats.add(*_schema, smf);
                shard_buffer.emplace_front(std::move(smf));
@@ -438,10 +510,26 @@ read_context::dismantle_buffer_stats read_context::dismantle_combined_buffer(cir
 }

 read_context::dismantle_buffer_stats read_context::dismantle_compaction_state(detached_compaction_state compaction_state) {
+    auto stats = dismantle_buffer_stats();
    auto& partitioner = dht::global_partitioner();
    const auto shard = partitioner.shard_of(compaction_state.partition_start.key().token());
-    auto& shard_buffer = std::get<dismantling_state>(_readers[shard]).buffer;
-    auto stats = dismantle_buffer_stats();
+    auto maybe_dismantling_state = std::get_if<dismantling_state>(&_readers[shard]);
+
+    // It is possible that the reader this partition originates from does not
+    // exist anymore. Either because we failed stopping it or because it was
+    // evicted.
+    if (!maybe_dismantling_state) {
+        for (auto& rt : compaction_state.range_tombstones) {
+            stats.add_discarded(*_schema, rt);
+        }
+        if (compaction_state.static_row) {
+            stats.add_discarded(*_schema, *compaction_state.static_row);
+        }
+        stats.add_discarded(*_schema, compaction_state.partition_start);
+        return stats;
+    }
+
+    auto& shard_buffer = maybe_dismantling_state->buffer;

    for (auto& rt : compaction_state.range_tombstones | boost::adaptors::reversed) {
        stats.add(*_schema, rt);
@@ -459,46 +547,35 @@ read_context::dismantle_buffer_stats read_context::dismantle_compaction_state(de
    return stats;
 }

-read_context::ready_to_save_state* read_context::prepare_reader_for_saving(
-        dismantling_state& current_state,
-        future<stopped_foreign_reader>&& stopped_reader_fut,
-        const dht::decorated_key& last_pkey,
-        const std::optional<clustering_key_prefix>& last_ckey) {
-    const auto shard = current_state.params.get_owner_shard();
-    auto& rs = _readers[shard];
-
-    if (stopped_reader_fut.failed()) {
-        mmq_log.debug("Failed to stop reader on shard {}: {}", shard, stopped_reader_fut.get_exception());
-        ++_db.local().get_stats().multishard_query_failed_reader_stops;
-        return nullptr;
-    }
-
-    auto stopped_reader = stopped_reader_fut.get0();
-
-    // If the buffer is empty just overwrite it.
-    // If it has some data in it append the fragments to the back.
-    // The unconsumed fragments appended here come from the
-    // foreign_reader which is at the lowest layer, hence its
-    // fragments need to be at the back of the buffer.
-    if (current_state.buffer.empty()) {
-        current_state.buffer = std::move(stopped_reader.unconsumed_fragments);
-    } else {
-        std::move(stopped_reader.unconsumed_fragments.begin(), stopped_reader.unconsumed_fragments.end(), std::back_inserter(current_state.buffer));
-    }
-    rs = ready_to_save_state{std::move(current_state.params), std::move(current_state.read_operation), std::move(stopped_reader.remote_reader),
-        std::move(current_state.buffer)};
-    return &std::get<ready_to_save_state>(rs);
-}
-
 future<> read_context::save_reader(ready_to_save_state& current_state, const dht::decorated_key& last_pkey,
        const std::optional<clustering_key_prefix>& last_ckey) {
-    const auto shard = current_state.reader.get_owner_shard();
-    return _db.invoke_on(shard, [shard, query_uuid = _cmd.query_uuid, query_ranges = _ranges, &current_state, &last_pkey, &last_ckey,
-            gts = tracing::global_trace_state_ptr(_trace_state)] (database& db) mutable {
+    auto* maybe_stopped_reader = std::get_if<foreign_unique_ptr<flat_mutation_reader>>(&current_state.reader);
+    const auto shard = maybe_stopped_reader
+        ? maybe_stopped_reader->get_owner_shard()
+        : std::get<paused_reader>(current_state.reader).shard;
+
+    return _db.invoke_on(shard, [this, shard, query_uuid = _cmd.query_uuid, query_ranges = _ranges, &current_state,
+            &last_pkey, &last_ckey, gts = tracing::global_trace_state_ptr(_trace_state)] (database& db) mutable {
        try {
            auto params = current_state.params.release();
            auto read_operation = current_state.read_operation.release();
-            auto reader = current_state.reader.release();
+
+            flat_mutation_reader_opt reader;
+            if (auto* maybe_paused_reader = std::get_if<paused_reader>(&current_state.reader)) {
+                if (auto inactive_read_ptr = semaphore().unregister_inactive_read(maybe_paused_reader->handle)) {
+                    reader = std::move(*static_cast<inactive_read&>(*inactive_read_ptr).reader);
+                    if (maybe_paused_reader->has_pending_next_partition) {
+                        reader->next_partition();
+                    }
+                }
+            } else {
+                reader = std::move(*std::get<foreign_unique_ptr<flat_mutation_reader>>(current_state.reader));
+            }
+
+            if (!reader) {
+                return;
+            }
+
            auto& buffer = current_state.buffer;
            const auto fragments = buffer.size();
            const auto size_before = reader->buffer_size();
@@ -541,6 +618,33 @@ future<> read_context::save_reader(ready_to_save_state& current_state, const dht
    });
 }

+future<> read_context::pause(foreign_unique_ptr<flat_mutation_reader> reader) {
+    const auto shard = reader.get_owner_shard();
+    return _db.invoke_on(shard, [this, reader = std::move(reader)] (database& db) mutable {
+        return semaphore().register_inactive_read(std::make_unique<inactive_read>(std::move(reader)));
+    }).then([this, shard] (reader_concurrency_semaphore::inactive_read_handle handle) {
+        auto& current_state = std::get<used_state>(_readers[shard]);
+        _readers[shard] = paused_state{std::move(current_state.params), std::move(current_state.read_operation), handle};
+    });
+}
+
+future<foreign_unique_ptr<flat_mutation_reader>> read_context::try_resume(shard_id shard) {
+    return _db.invoke_on(shard, [this, handle = std::get<paused_state>(_readers[shard]).handle] (database& db) mutable {
+        if (auto inactive_read_ptr = semaphore().unregister_inactive_read(handle)) {
+            return std::move(static_cast<inactive_read&>(*inactive_read_ptr).reader);
+        }
+        return foreign_unique_ptr<flat_mutation_reader>();
+    }).then([this, shard] (foreign_unique_ptr<flat_mutation_reader> reader) {
+        if (reader) {
+            auto& current_state = std::get<paused_state>(_readers[shard]);
+            _readers[shard] = used_state{std::move(current_state.params), std::move(current_state.read_operation)};
+        } else {
+            _readers[shard] = evicted_state{};
+        }
+        return std::move(reader);
+    });
+}
+
 future<> read_context::lookup_readers() {
    if (_cmd.query_uuid == utils::UUID{} || _cmd.is_first_page) {
        return make_ready_future<>();
@@ -549,21 +653,24 @@ future<> read_context::lookup_readers() {
    return parallel_for_each(boost::irange(0u, smp::count), [this] (shard_id shard) {
        return _db.invoke_on(shard,
                [shard, cmd = &_cmd, ranges = &_ranges, gs = global_schema_ptr(_schema), gts = tracing::global_trace_state_ptr(_trace_state)] (
-                        database& db) mutable -> reader_state {
+                        database& db) mutable -> future<reader_state, reader_concurrency_semaphore*> {
            auto schema = gs.get();
            auto querier_opt = db.get_querier_cache().lookup_shard_mutation_querier(cmd->query_uuid, *schema, *ranges, cmd->slice, gts.get());
+            auto& table = db.find_column_family(schema);
            if (!querier_opt) {
-                return inexistent_state{};
+                return make_ready_future<reader_state, reader_concurrency_semaphore*>(inexistent_state{}, &table.read_concurrency_semaphore());
            }

            auto& q = *querier_opt;
-            auto& table = db.find_column_family(schema);
            auto params = make_foreign(std::make_unique<reader_params>(std::move(q).reader_range(), std::move(q).reader_slice()));
            auto read_operation = make_foreign(std::make_unique<utils::phased_barrier::operation>(table.read_in_progress()));
            auto reader = make_foreign(std::make_unique<flat_mutation_reader>(std::move(q).reader()));
-            return successful_lookup_state{std::move(params), std::move(read_operation), std::move(reader)};
-        }).then([this, shard] (reader_state&& state) {
+            return make_ready_future<reader_state, reader_concurrency_semaphore*>(
+                    successful_lookup_state{std::move(params), std::move(read_operation), std::move(reader)},
+                    &table.read_concurrency_semaphore());
+        }).then([this, shard] (reader_state&& state, reader_concurrency_semaphore* sem) {
            _readers[shard] = std::move(state);
+            _semaphores[shard] = sem;
        });
    });
 }
@@ -574,49 +681,35 @@ future<> read_context::save_readers(circular_buffer<mutation_fragment> unconsume
        return make_ready_future<>();
    }

-    auto last_pkey = compaction_state.partition_start.key();
+    return _dismantling_gate.close().then([this, unconsumed_buffer = std::move(unconsumed_buffer), compaction_state = std::move(compaction_state),
+          last_ckey = std::move(last_ckey)] () mutable {
+        auto last_pkey = compaction_state.partition_start.key();

-    const auto cb_stats = dismantle_combined_buffer(std::move(unconsumed_buffer), last_pkey);
-    tracing::trace(_trace_state, "Dismantled combined buffer: {} partitions/{} fragments/{} bytes", cb_stats.partitions, cb_stats.fragments,
-            cb_stats.bytes);
+        const auto cb_stats = dismantle_combined_buffer(std::move(unconsumed_buffer), last_pkey);
+        tracing::trace(_trace_state, "Dismantled combined buffer: {}", cb_stats);

-    const auto cs_stats = dismantle_compaction_state(std::move(compaction_state));
-    tracing::trace(_trace_state, "Dismantled compaction state: {} partitions/{} fragments/{} bytes", cs_stats.partitions, cs_stats.fragments,
-            cs_stats.bytes);
+        const auto cs_stats = dismantle_compaction_state(std::move(compaction_state));
+        tracing::trace(_trace_state, "Dismantled compaction state: {}", cs_stats);

-    return do_with(std::move(last_pkey), std::move(last_ckey), [this] (const dht::decorated_key& last_pkey,
+        return do_with(std::move(last_pkey), std::move(last_ckey), [this] (const dht::decorated_key& last_pkey,
                const std::optional<clustering_key_prefix>& last_ckey) {
-        return parallel_for_each(_readers, [this, &last_pkey, &last_ckey] (reader_state& rs) {
-            if (auto* maybe_successful_lookup_state = std::get_if<successful_lookup_state>(&rs)) {
-                auto& current_state = *maybe_successful_lookup_state;
-                rs = ready_to_save_state{std::move(current_state.params), std::move(current_state.read_operation),
-                        std::move(current_state.reader), circular_buffer<mutation_fragment>{}};
-                return save_reader(std::get<ready_to_save_state>(rs), last_pkey, last_ckey);
-            }
+            return parallel_for_each(_readers, [this, &last_pkey, &last_ckey] (reader_state& rs) {
+                if (auto* maybe_successful_lookup_state = std::get_if<successful_lookup_state>(&rs)) {
+                    auto& current_state = *maybe_successful_lookup_state;
+                    rs = ready_to_save_state{std::move(current_state.params), std::move(current_state.read_operation),
+                            std::move(current_state.reader), circular_buffer<mutation_fragment>{}};
+                    return save_reader(std::get<ready_to_save_state>(rs), last_pkey, last_ckey);
+                }

-            auto finish_saving = [this, &last_pkey, &last_ckey] (dismantling_state& current_state) {
-                return current_state.reader_fut.then_wrapped([this, &current_state, &last_pkey, &last_ckey] (
-                            future<stopped_foreign_reader>&& stopped_reader_fut) mutable {
-                    if (auto* ready_state = prepare_reader_for_saving(current_state, std::move(stopped_reader_fut), last_pkey, last_ckey)) {
-                        return save_reader(*ready_state, last_pkey, last_ckey);
-                    }
-                    return make_ready_future<>();
-                });
-            };
+                if (auto* maybe_dismantling_state = std::get_if<dismantling_state>(&rs)) {
+                    auto& current_state = *maybe_dismantling_state;
+                    rs = ready_to_save_state{std::move(current_state.params), std::move(current_state.read_operation),
+                            std::move(current_state.reader), std::move(current_state.buffer)};
+                    return save_reader(std::get<ready_to_save_state>(rs), last_pkey, last_ckey);
+                }

-            if (auto* maybe_dismantling_state = std::get_if<dismantling_state>(&rs)) {
-                return finish_saving(*maybe_dismantling_state);
-            }
-
-            if (auto* maybe_future_dismantling_state = std::get_if<future_dismantling_state>(&rs)) {
-                return maybe_future_dismantling_state->fut.then([this, &rs,
-                        finish_saving = std::move(finish_saving)] (dismantling_state&& next_state) mutable {
-                    rs = std::move(next_state);
-                    return finish_saving(std::get<dismantling_state>(rs));
-                });
-            }
-
-            return make_ready_future<>();
+                return make_ready_future<>();
+            });
        });
    });
 }
@@ -629,8 +722,8 @@ static future<reconcilable_result> do_query_mutations(
        tracing::trace_state_ptr trace_state,
        db::timeout_clock::time_point timeout,
        query::result_memory_accounter&& accounter) {
-    return do_with(std::make_unique<read_context>(db, s, cmd, ranges, trace_state), [s, &cmd, &ranges, trace_state, timeout,
-            accounter = std::move(accounter)] (std::unique_ptr<read_context>& ctx) mutable {
+    return do_with(seastar::make_shared<read_context>(db, s, cmd, ranges, trace_state), [s, &cmd, &ranges, trace_state, timeout,
+            accounter = std::move(accounter)] (shared_ptr<read_context>& ctx) mutable {
        return ctx->lookup_readers().then([&ctx, s = std::move(s), &cmd, &ranges, trace_state, timeout,
                accounter = std::move(accounter)] () mutable {
            auto ms = mutation_source([&] (schema_ptr s,
@@ -638,10 +731,9 @@ static future<reconcilable_result> do_query_mutations(
                    const query::partition_slice& ps,
                    const io_priority_class& pc,
                    tracing::trace_state_ptr trace_state,
-                    streamed_mutation::forwarding fwd_sm,
+                    streamed_mutation::forwarding,
                    mutation_reader::forwarding fwd_mr) {
-                return make_multishard_combining_reader(std::move(s), pr, ps, pc, dht::global_partitioner(), ctx->factory(), std::move(trace_state),
-                        fwd_sm, fwd_mr, ctx->dismantler());
+                return make_multishard_combining_reader(ctx, dht::global_partitioner(), std::move(s), pr, ps, pc, std::move(trace_state), fwd_mr);
            });
            auto reader = make_flat_multi_range_reader(s, std::move(ms), ranges, cmd.slice, service::get_local_sstable_query_read_priority(),
                    trace_state, mutation_reader::forwarding::no);
--- a/Show More
+++ b/Show More