sstable/compaction: Use correct schema in the writing consumer

Introduced in 2a437ab427. regular_compaction::select_sstable_writer() creates the sstable writer when the first partition is consumed from the combined mutation fragment stream. It gets the schema directly from the table object. That may be a different schema than the one used by the readers if there was a concurrent schema alter during that small time window. As a result, the writing consumer attached to readers will interpret fragments using the wrong version of the schema. One effect of this is storing values of some columns under a different column. This patch replaces all column_family::schema() accesses with accesses to the _schema member which is obtained once per compaction and is the same schema which readers use. Fixes #4304. Tests: - manual tests with hard-coded schema change injection to reproduce the bug - build/dev/scylla boot - tests/sstable_mutation_test Message-Id: <1551698056-23386-1-git-send-email-tgrabiec@scylladb.com> (cherry picked from commit 58e7ad20eb)
Merge "Fix commitlog chunks overwriting each other" from Paweł
2019-03-04 18:16:43 +02:00 · 2019-03-04 17:58:46 +02:00 · 2019-03-04 10:14:33 +02:00 · 2019-02-27 22:17:44 +02:00 · 2019-02-25 23:22:09 +02:00 · 2019-02-24 15:45:32 +02:00
350 changed files with 12007 additions and 5362 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
@@ -9,3 +9,6 @@
 [submodule "xxHash"]
 	path = xxHash
 	url = ../xxHash
+[submodule "libdeflate"]
+	path = libdeflate
+	url = ../libdeflate
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -138,4 +138,5 @@ target_include_directories(scylla PUBLIC
        ${SEASTAR_INCLUDE_DIRS}
        ${Boost_INCLUDE_DIRS}
        xxhash
+        libdeflate
        build/release/gen)
--- a/README.md
+++ b/README.md
@@ -50,12 +50,12 @@ Then, to build an RPM, run:
 ./dist/redhat/build_rpm.sh
 ```

-The built RPM is stored in ``/var/lib/mock/<configuration>/result`` directory.
+The built RPM is stored in the ``build/mock/<configuration>/result`` directory.
 For example, on Fedora 21 mock reports the following:

 ```
 INFO: Done(scylla-server-0.00-1.fc21.src.rpm) Config(default) 20 minutes 7 seconds
-INFO: Results and/or logs in: /var/lib/mock/fedora-21-x86_64/result
+INFO: Results and/or logs in: build/mock/fedora-21-x86_64/result
 ```

 ## Building Fedora-based Docker image
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 #!/bin/sh

-VERSION=666.development
+VERSION=3.0.4

 if test -f version
 then
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -2228,11 +2228,11 @@
               "description":"The column family"
            },
            "total":{
-               "type":"int",
+               "type":"long",
               "description":"The total snapshot size"
            },
            "live":{
-               "type":"int",
+               "type":"long",
               "description":"The live snapshot size"
            }
         }
--- a/auth/common.cc
+++ b/auth/common.cc
@@ -87,11 +87,17 @@ future<> create_metadata_table_if_missing(
    return mm.announce_new_column_family(b.build(), false);
 }

-future<> wait_for_schema_agreement(::service::migration_manager& mm, const database& db) {
+future<> wait_for_schema_agreement(::service::migration_manager& mm, const database& db, seastar::abort_source& as) {
    static const auto pause = [] { return sleep(std::chrono::milliseconds(500)); };

-    return do_until([&db] { return db.get_version() != database::empty_version; }, pause).then([&mm] {
-        return do_until([&mm] { return mm.have_schema_agreement(); }, pause);
+    return do_until([&db, &as] {
+        as.check();
+        return db.get_version() != database::empty_version;
+    }, pause).then([&mm, &as] {
+        return do_until([&mm, &as] {
+            as.check();
+            return mm.have_schema_agreement();
+        }, pause);
    });
 }

--- a/auth/common.hh
+++ b/auth/common.hh
@@ -81,7 +81,7 @@ future<> create_metadata_table_if_missing(
        stdx::string_view cql,
        ::service::migration_manager&);

-future<> wait_for_schema_agreement(::service::migration_manager&, const database&);
+future<> wait_for_schema_agreement(::service::migration_manager&, const database&, seastar::abort_source&);

 ///
 /// Time-outs for internal, non-local CQL queries.
--- a/auth/default_authorizer.cc
+++ b/auth/default_authorizer.cc
@@ -160,7 +160,7 @@ future<> default_authorizer::start() {
                _migration_manager).then([this] {
            _finished = do_after_system_ready(_as, [this] {
                return async([this] {
-                    wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
+                    wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();

                    if (legacy_metadata_exists()) {
                        if (!any_granted().get0()) {
@@ -178,7 +178,7 @@ future<> default_authorizer::start() {

 future<> default_authorizer::stop() {
    _as.request_abort();
-    return _finished.handle_exception_type([](const sleep_aborted&) {});
+    return _finished.handle_exception_type([](const sleep_aborted&) {}).handle_exception_type([](const abort_requested_exception&) {});
 }

 future<permission_set>
--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -157,7 +157,7 @@ future<> password_authenticator::start() {

         _stopped = do_after_system_ready(_as, [this] {
             return async([this] {
-                 wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
+                 wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();

                 if (any_nondefault_role_row_satisfies(_qp, &has_salted_hash).get0()) {
                     if (legacy_metadata_exists()) {
@@ -182,7 +182,7 @@ future<> password_authenticator::start() {

 future<> password_authenticator::stop() {
    _as.request_abort();
-    return _stopped.handle_exception_type([] (const sleep_aborted&) { });
+    return _stopped.handle_exception_type([] (const sleep_aborted&) { }).handle_exception_type([](const abort_requested_exception&) {});
 }

 db::consistency_level password_authenticator::consistency_for_user(stdx::string_view role_name) {
@@ -241,7 +241,11 @@ future<authenticated_user> password_authenticator::authenticate(
    }).then_wrapped([=](future<::shared_ptr<cql3::untyped_result_set>> f) {
        try {
            auto res = f.get0();
-            if (res->empty() || !passwords::check(password, res->one().get_as<sstring>(SALTED_HASH))) {
+            auto salted_hash = std::experimental::optional<sstring>();
+            if (!res->empty()) {
+                salted_hash = res->one().get_opt<sstring>(SALTED_HASH);
+            }
+            if (!salted_hash || !passwords::check(password, *salted_hash)) {
                throw exceptions::authentication_exception("Username and/or password are incorrect");
            }
            return make_ready_future<authenticated_user>(username);
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -196,6 +196,10 @@ future<> service::start() {
 }

 future<> service::stop() {
+    // Only one of the shards has the listener registered, but let's try to
+    // unregister on each one just to make sure.
+    _migration_manager.unregister_listener(_migration_listener.get());
+
    return _permissions_cache->stop().then([this] {
        return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop());
    });
--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -227,7 +227,7 @@ future<> standard_role_manager::start() {
        return this->create_metadata_tables_if_missing().then([this] {
            _stopped = auth::do_after_system_ready(_as, [this] {
                return seastar::async([this] {
-                    wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
+                    wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();

                    if (any_nondefault_role_row_satisfies(_qp, &has_can_login).get0()) {
                        if (this->legacy_metadata_exists()) {
@@ -251,7 +251,7 @@ future<> standard_role_manager::start() {

 future<> standard_role_manager::stop() {
     _as.request_abort();
-    return _stopped.handle_exception_type([] (const sleep_aborted&) { });
+    return _stopped.handle_exception_type([] (const sleep_aborted&) { }).handle_exception_type([](const abort_requested_exception&) {});
 }

 future<> standard_role_manager::create_or_replace(stdx::string_view role_name, const role_config& c) const {
--- a/backlog_controller.hh
+++ b/backlog_controller.hh
@@ -77,7 +77,7 @@ protected:
        , _io_priority(iop)
        , _interval(interval)
        , _update_timer([this] { adjust(); })
-        , _control_points({{0,0}})
+        , _control_points()
        , _current_backlog(std::move(backlog))
        , _inflight_update(make_ready_future<>())
    {
@@ -125,7 +125,7 @@ public:
    flush_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, float static_shares) : backlog_controller(sg, iop, static_shares) {}
    flush_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, std::chrono::milliseconds interval, float soft_limit, std::function<float()> current_dirty)
        : backlog_controller(sg, iop, std::move(interval),
-          std::vector<backlog_controller::control_point>({{soft_limit, 10}, {soft_limit + (hard_dirty_limit - soft_limit) / 2, 200} , {hard_dirty_limit, 1000}}),
+          std::vector<backlog_controller::control_point>({{0.0, 0.0}, {soft_limit, 10}, {soft_limit + (hard_dirty_limit - soft_limit) / 2, 200} , {hard_dirty_limit, 1000}}),
          std::move(current_dirty)
        )
    {}
@@ -139,7 +139,7 @@ public:
    compaction_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, float static_shares) : backlog_controller(sg, iop, static_shares) {}
    compaction_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, std::chrono::milliseconds interval, std::function<float()> current_backlog)
        : backlog_controller(sg, iop, std::move(interval),
-          std::vector<backlog_controller::control_point>({{0.5, 10}, {1.5, 100} , {normalization_factor, 1000}}),
+          std::vector<backlog_controller::control_point>({{0.0, 50}, {1.5, 100} , {normalization_factor, 1000}}),
          std::move(current_backlog)
        )
    {}
--- a/bytes_ostream.hh
+++ b/bytes_ostream.hh
@@ -57,12 +57,12 @@ private:
        value_type data[0];
        void operator delete(void* ptr) { free(ptr); }
    };
-    // FIXME: consider increasing chunk size as the buffer grows
-    static constexpr size_type chunk_size{512};
+    static constexpr size_type default_chunk_size{512};
 private:
    std::unique_ptr<chunk> _begin;
    chunk* _current;
    size_type _size;
+    size_type _initial_chunk_size = default_chunk_size;
 public:
    class fragment_iterator : public std::iterator<std::input_iterator_tag, bytes_view> {
        chunk* _current = nullptr;
@@ -102,13 +102,13 @@ private:
    }
    // Figure out next chunk size.
    //   - must be enough for data_size
-    //   - must be at least chunk_size
+    //   - must be at least _initial_chunk_size
    //   - try to double each time to prevent too many allocations
    //   - do not exceed max_chunk_size
    size_type next_alloc_size(size_t data_size) const {
        auto next_size = _current
                ? _current->size * 2
-                : chunk_size;
+                : _initial_chunk_size;
        next_size = std::min(next_size, max_chunk_size());
        // FIXME: check for overflow?
        return std::max<size_type>(next_size, data_size + sizeof(chunk));
@@ -116,13 +116,19 @@ private:
    // Makes room for a contiguous region of given size.
    // The region is accounted for as already written.
    // size must not be zero.
+    [[gnu::always_inline]]
    value_type* alloc(size_type size) {
-        if (size <= current_space_left()) {
+        if (__builtin_expect(size <= current_space_left(), true)) {
            auto ret = _current->data + _current->offset;
            _current->offset += size;
            _size += size;
            return ret;
        } else {
+            return alloc_new(size);
+        }
+    }
+    [[gnu::noinline]]
+    value_type* alloc_new(size_type size) {
            auto alloc_size = next_alloc_size(size);
            auto space = malloc(alloc_size);
            if (!space) {
@@ -140,19 +146,22 @@ private:
            }
            _size += size;
            return _current->data;
-        };
    }
 public:
-    bytes_ostream() noexcept
+    explicit bytes_ostream(size_t initial_chunk_size) noexcept
        : _begin()
        , _current(nullptr)
        , _size(0)
+        , _initial_chunk_size(initial_chunk_size)
    { }

+    bytes_ostream() noexcept : bytes_ostream(default_chunk_size) {}
+
    bytes_ostream(bytes_ostream&& o) noexcept
        : _begin(std::move(o._begin))
        , _current(o._current)
        , _size(o._size)
+        , _initial_chunk_size(o._initial_chunk_size)
    {
        o._current = nullptr;
        o._size = 0;
@@ -162,6 +171,7 @@ public:
        : _begin()
        , _current(nullptr)
        , _size(0)
+        , _initial_chunk_size(o._initial_chunk_size)
    {
        append(o);
    }
@@ -199,18 +209,20 @@ public:
        return place_holder<T>{alloc(sizeof(T))};
    }

+    [[gnu::always_inline]]
    value_type* write_place_holder(size_type size) {
        return alloc(size);
    }

    // Writes given sequence of bytes
+    [[gnu::always_inline]]
    inline void write(bytes_view v) {
        if (v.empty()) {
            return;
        }

        auto this_size = std::min(v.size(), size_t(current_space_left()));
-        if (this_size) {
+        if (__builtin_expect(this_size, true)) {
            memcpy(_current->data + _current->offset, v.begin(), this_size);
            _current->offset += this_size;
            _size += this_size;
@@ -219,11 +231,12 @@ public:

        while (!v.empty()) {
            auto this_size = std::min(v.size(), size_t(max_chunk_size()));
-            std::copy_n(v.begin(), this_size, alloc(this_size));
+            std::copy_n(v.begin(), this_size, alloc_new(this_size));
            v.remove_prefix(this_size);
        }
    }

+    [[gnu::always_inline]]
    void write(const char* ptr, size_t size) {
        write(bytes_view(reinterpret_cast<const signed char*>(ptr), size));
    }
@@ -393,6 +406,21 @@ public:
    bool operator!=(const bytes_ostream& other) const {
        return !(*this == other);
    }
+
+    // Makes this instance empty.
+    //
+    // The first buffer is not deallocated, so callers may rely on the
+    // fact that if they write less than the initial chunk size between
+    // the clear() calls then writes will not involve any memory allocations,
+    // except for the first write made on this instance.
+    void clear() {
+        if (_begin) {
+            _begin->offset = 0;
+            _size = 0;
+            _current = _begin.get();
+            _begin->next.reset();
+        }
+    }
 };

 template<>
--- a/clustering_ranges_walker.hh
+++ b/clustering_ranges_walker.hh
@@ -200,8 +200,9 @@ public:
        return _current_start;
    }

-    position_in_partition_view upper_bound() const {
-        return _current_end;
+    // Returns the upper bound of the last range in provided ranges set
+    position_in_partition_view uppermost_bound() const {
+        return position_in_partition_view::for_range_end(_ranges.back());
    }

    // When lower_bound() changes, this also does
--- a/compress.cc
+++ b/compress.cc
@@ -112,7 +112,7 @@ const sstring compression_parameters::CHUNK_LENGTH_KB = "chunk_length_kb";
 const sstring compression_parameters::CRC_CHECK_CHANCE = "crc_check_chance";

 compression_parameters::compression_parameters()
-    : compression_parameters(nullptr)
+    : compression_parameters(compressor::lz4)
 {}

 compression_parameters::~compression_parameters()
--- a/compress.hh
+++ b/compress.hh
@@ -118,6 +118,10 @@ public:
    std::map<sstring, sstring> get_options() const;
    bool operator==(const compression_parameters& other) const;
    bool operator!=(const compression_parameters& other) const;
+
+    static compression_parameters no_compression() {
+        return compression_parameters(nullptr);
+    }
 private:
    void validate_options(const std::map<sstring, sstring>&);
 };
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -242,6 +242,9 @@ batch_size_fail_threshold_in_kb: 50

 # The directory where hints files are stored if hinted handoff is enabled.
 # hints_directory: /var/lib/scylla/hints
+ 
+# The directory where hints files are stored for materialized-view updates
+# view_hints_directory: /var/lib/scylla/view_hints

 # See http://wiki.apache.org/cassandra/HintedHandoff
 # May either be "true" or "false" to enable globally, or contain a list
--- a/configure.py
+++ b/configure.py
@@ -197,7 +197,9 @@ class Thrift(object):

 def default_target_arch():
    if platform.machine() in ['i386', 'i686', 'x86_64']:
-        return 'nehalem'
+        return 'westmere'   # support PCLMUL
+    elif platform.machine() == 'aarch64':
+        return 'armv8-a+crc+crypto'
    else:
        return ''

@@ -271,6 +273,8 @@ scylla_tests = [
    'tests/perf/perf_sstable',
    'tests/cql_query_test',
    'tests/secondary_index_test',
+    'tests/json_cql_query_test',
+    'tests/filtering_test',
    'tests/storage_proxy_test',
    'tests/schema_change_test',
    'tests/mutation_reader_test',
@@ -306,6 +310,7 @@ scylla_tests = [
    'tests/log_heap_test',
    'tests/managed_vector_test',
    'tests/crc_test',
+    'tests/checksum_utils_test',
    'tests/flush_queue_test',
    'tests/dynamic_bitset_test',
    'tests/auth_test',
@@ -356,6 +361,7 @@ scylla_tests = [

 perf_tests = [
    'tests/perf/perf_mutation_readers',
+    'tests/perf/perf_checksum',
    'tests/perf/perf_mutation_fragment',
    'tests/perf/perf_idl',
 ]
@@ -431,6 +437,7 @@ extra_cxxflags = {}
 cassandra_interface = Thrift(source='interface/cassandra.thrift', service='Cassandra')

 scylla_core = (['database.cc',
+                'table.cc',
                'atomic_cell.cc',
                'schema.cc',
                'frozen_schema.cc',
@@ -461,6 +468,7 @@ scylla_core = (['database.cc',
                'compress.cc',
                'sstables/mp_row_consumer.cc',
                'sstables/sstables.cc',
+                'sstables/mc/writer.cc',
                'sstables/sstable_version.cc',
                'sstables/compress.cc',
                'sstables/row.cc',
@@ -470,7 +478,6 @@ scylla_core = (['database.cc',
                'sstables/compaction_manager.cc',
                'sstables/integrity_checked_file_impl.cc',
                'sstables/prepended_input_stream.cc',
-                'sstables/m_format_write_helpers.cc',
                'sstables/m_format_read_helpers.cc',
                'transport/event.cc',
                'transport/event_notifier.cc',
@@ -579,6 +586,7 @@ scylla_core = (['database.cc',
                'db/marshal/type_parser.cc',
                'db/batchlog_manager.cc',
                'db/view/view.cc',
+                'db/view/view_update_from_staging_generator.cc',
                'db/view/row_locking.cc',
                'index/secondary_index_manager.cc',
                'index/secondary_index.cc',
@@ -592,6 +600,7 @@ scylla_core = (['database.cc',
                'utils/managed_bytes.cc',
                'utils/exceptions.cc',
                'utils/config_file.cc',
+                'utils/gz/crc_combine.cc',
                'gms/version_generator.cc',
                'gms/versioned_value.cc',
                'gms/gossiper.cc',
@@ -682,6 +691,7 @@ scylla_core = (['database.cc',
                'data/cell.cc',
                'multishard_writer.cc',
                'multishard_mutation_query.cc',
+                'reader_concurrency_semaphore.cc',
                ] + [Antlr3Grammar('cql3/Cql.g')] + [Thrift('interface/cassandra.thrift', 'Cassandra')]
               )

@@ -744,6 +754,7 @@ idls = ['idl/gossip_digest.idl.hh',
        'idl/tracing.idl.hh',
        'idl/consistency_level.idl.hh',
        'idl/cache_temperature.idl.hh',
+        'idl/view.idl.hh',
        ]

 scylla_tests_dependencies = scylla_core + idls + [
@@ -773,6 +784,7 @@ pure_boost_tests = set([
    'tests/test-serialization',
    'tests/range_test',
    'tests/crc_test',
+    'tests/checksum_utils_test',
    'tests/managed_vector_test',
    'tests/dynamic_bitset_test',
    'tests/idl_test',
@@ -1001,6 +1013,8 @@ seastar_ldflags = args.user_ldflags
 seastar_flags += ['--compiler', args.cxx, '--c-compiler', args.cc, '--cflags=%s' % (seastar_cflags), '--ldflags=%s' % (seastar_ldflags),
                  '--c++-dialect=gnu++1z', '--optflags=%s' % (modes['release']['opt']), ]

+libdeflate_cflags = seastar_cflags
+
 status = subprocess.call([args.python, './configure.py'] + seastar_flags, cwd='seastar')

 if status != 0:
@@ -1100,6 +1114,9 @@ with open(buildfile, 'w') as f:
            command = {ninja} -C $subdir $target
            restat = 1
            description = NINJA $out
+        rule run
+            command = $in > $out
+            description = GEN $out
        rule copy
            command = cp $in $out
            description = COPY $out
@@ -1172,6 +1189,10 @@ with open(buildfile, 'w') as f:
            if binary.endswith('.a'):
                f.write('build $builddir/{}/{}: ar.{} {}\n'.format(mode, binary, mode, str.join(' ', objs)))
            else:
+                objs.extend(['$builddir/' + mode + '/' + artifact for artifact in [
+                    'libdeflate/libdeflate.a'
+                ]])
+                objs.append('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o')
                if binary.startswith('tests/'):
                    local_libs = '$libs'
                    if binary not in tests_not_using_seastar_test_framework or binary in pure_boost_tests:
@@ -1213,6 +1234,12 @@ with open(buildfile, 'w') as f:
                    antlr3_grammars.add(src)
                else:
                    raise Exception('No rule for ' + src)
+        compiles['$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o'] = '$builddir/' + mode + '/gen/utils/gz/crc_combine_table.cc'
+        compiles['$builddir/' + mode + '/utils/gz/gen_crc_combine_table.o'] = 'utils/gz/gen_crc_combine_table.cc'
+        f.write('build {}: run {}\n'.format('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.cc',
+                                            '$builddir/' + mode + '/utils/gz/gen_crc_combine_table'))
+        f.write('build {}: link.{} {}\n'.format('$builddir/' + mode + '/utils/gz/gen_crc_combine_table', mode,
+                                                '$builddir/' + mode + '/utils/gz/gen_crc_combine_table.o'))
        for obj in compiles:
            src = compiles[obj]
            gen_headers = list(ragels.keys())
@@ -1262,6 +1289,10 @@ with open(buildfile, 'w') as f:
            ''').format(**locals()))
        f.write('build build/$mode/scylla-package.tar: package build/{mode}/scylla build/{mode}/iotune\n'.format(**locals()))
        f.write('    mode = {mode}\n'.format(**locals()))
+        f.write('rule libdeflate.{mode}\n'.format(**locals()))
+        f.write('    command = make -C libdeflate BUILD_DIR=../build/{mode}/libdeflate/ CFLAGS="{libdeflate_cflags}" CC={args.cc} ../build/{mode}/libdeflate//libdeflate.a\n'.format(**locals()))
+        f.write('build build/{mode}/libdeflate/libdeflate.a: libdeflate.{mode}\n'.format(**locals()))
+
    f.write('build {}: phony\n'.format(seastar_deps))
    f.write(textwrap.dedent('''\
        rule configure
--- a/converting_mutation_partition_applier.hh
+++ b/converting_mutation_partition_applier.hh
@@ -38,44 +38,44 @@ private:
    static bool is_compatible(const column_definition& new_def, const data_type& old_type, column_kind kind) {
        return ::is_compatible(new_def.kind, kind) && new_def.type->is_value_compatible_with(*old_type);
    }
+    static atomic_cell upgrade_cell(const abstract_type& new_type, const abstract_type& old_type, atomic_cell_view cell,
+                                    atomic_cell::collection_member cm = atomic_cell::collection_member::no) {
+        if (cell.is_live() && !old_type.is_counter()) {
+            if (cell.is_live_and_has_ttl()) {
+                return atomic_cell::make_live(new_type, cell.timestamp(), cell.value().linearize(), cell.expiry(), cell.ttl(), cm);
+            }
+            return atomic_cell::make_live(new_type, cell.timestamp(), cell.value().linearize(), cm);
+        } else {
+            return atomic_cell(new_type, cell);
+        }
+    }
    static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, atomic_cell_view cell) {
        if (!is_compatible(new_def, old_type, kind) || cell.timestamp() <= new_def.dropped_at()) {
            return;
        }
-        auto new_cell = [&] {
-            if (cell.is_live() && !old_type->is_counter()) {
-                if (cell.is_live_and_has_ttl()) {
-                    return atomic_cell_or_collection(
-                        atomic_cell::make_live(*new_def.type, cell.timestamp(), cell.value().linearize(), cell.expiry(), cell.ttl())
-                    );
-                }
-                return atomic_cell_or_collection(
-                    atomic_cell::make_live(*new_def.type, cell.timestamp(), cell.value().linearize())
-                );
-            } else {
-                return atomic_cell_or_collection(*new_def.type, cell);
-            }
-        }();
-        dst.apply(new_def, std::move(new_cell));
+        dst.apply(new_def, upgrade_cell(*new_def.type, *old_type, cell));
    }
    static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, collection_mutation_view cell) {
        if (!is_compatible(new_def, old_type, kind)) {
            return;
        }
      cell.data.with_linearized([&] (bytes_view cell_bv) {
-        auto&& ctype = static_pointer_cast<const collection_type_impl>(old_type);
-        auto old_view = ctype->deserialize_mutation_form(cell_bv);
+        auto new_ctype = static_pointer_cast<const collection_type_impl>(new_def.type);
+        auto old_ctype = static_pointer_cast<const collection_type_impl>(old_type);
+        auto old_view = old_ctype->deserialize_mutation_form(cell_bv);

-        collection_type_impl::mutation_view new_view;
+        collection_type_impl::mutation new_view;
        if (old_view.tomb.timestamp > new_def.dropped_at()) {
            new_view.tomb = old_view.tomb;
        }
        for (auto& c : old_view.cells) {
            if (c.second.timestamp() > new_def.dropped_at()) {
-                new_view.cells.emplace_back(std::move(c));
+                new_view.cells.emplace_back(c.first, upgrade_cell(*new_ctype->value_comparator(), *old_ctype->value_comparator(), c.second, atomic_cell::collection_member::yes));
            }
        }
-        dst.apply(new_def, ctype->serialize_mutation_form(std::move(new_view)));
+        if (new_view.tomb || !new_view.cells.empty()) {
+            dst.apply(new_def, new_ctype->serialize_mutation_form(std::move(new_view)));
+        }
      });
    }
 public:
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -470,6 +470,7 @@ insertStatement returns [::shared_ptr<raw::modification_statement> expr]
        std::vector<::shared_ptr<cql3::column_identifier::raw>> column_names;
        std::vector<::shared_ptr<cql3::term::raw>> values;
        bool if_not_exists = false;
+        bool default_unset = false;
        ::shared_ptr<cql3::term::raw> json_value;
    }
    : K_INSERT K_INTO cf=columnFamilyName
@@ -487,13 +488,15 @@ insertStatement returns [::shared_ptr<raw::modification_statement> expr]
              }
        | K_JSON
          json_token=jsonValue { json_value = $json_token.value; }
+            ( K_DEFAULT K_UNSET { default_unset = true; } | K_DEFAULT K_NULL )?
            ( K_IF K_NOT K_EXISTS { if_not_exists = true; } )?
            ( usingClause[attrs] )?
              {
              $expr = ::make_shared<raw::insert_json_statement>(std::move(cf),
                                                       std::move(attrs),
                                                       std::move(json_value),
-                                                       if_not_exists);
+                                                       if_not_exists,
+                                                       default_unset);
              }
        )
    ;
@@ -1835,6 +1838,8 @@ K_OR:          O R;
 K_REPLACE:     R E P L A C E;
 K_DETERMINISTIC: D E T E R M I N I S T I C;
 K_JSON:        J S O N;
+K_DEFAULT:     D E F A U L T;
+K_UNSET:       U N S E T;

 K_EMPTY:       E M P T Y;

--- a/cql3/error_collector.hh
+++ b/cql3/error_collector.hh
@@ -67,6 +67,12 @@ class error_collector : public error_listener<RecognizerType, ExceptionBaseType>
     */
    const sstring_view _query;

+    /**
+     * An empty bitset to be used as a workaround for AntLR null dereference
+     * bug.
+     */
+    static typename ExceptionBaseType::BitsetListType _empty_bit_list;
+
 public:

    /**
@@ -144,6 +150,14 @@ private:
            break;
        }
        default:
+            // AntLR Exception class has a bug of dereferencing a null
+            // pointer in the displayRecognitionError. The following
+            // if statement makes sure it will not be null before the
+            // call to that function (displayRecognitionError).
+            // bug reference: https://github.com/antlr/antlr3/issues/191
+            if (!ex->get_expectingSet()) {
+                ex->set_expectingSet(&_empty_bit_list);
+            }
            ex->displayRecognitionError(token_names, msg);
        }
        return msg.str();
@@ -345,4 +359,8 @@ private:
 #endif
 };

+template<typename RecognizerType, typename TokenType, typename ExceptionBaseType>
+typename ExceptionBaseType::BitsetListType
+error_collector<RecognizerType,TokenType,ExceptionBaseType>::_empty_bit_list = typename ExceptionBaseType::BitsetListType();
+
 }
--- a/cql3/restrictions/primary_key_restrictions.hh
+++ b/cql3/restrictions/primary_key_restrictions.hh
@@ -100,12 +100,28 @@ public:
    bool has_unrestricted_components(const schema& schema) const;

    virtual bool needs_filtering(const schema& schema) const;
+
+    // How long a prefix of the restrictions could have resulted in
+    // needs_filtering() == false. These restrictions do not need to be
+    // applied during filtering.
+    // For example, if we have the filter "c1 < 3 and c2 > 3", c1 does
+    // not need filtering (just a read stopping at c1=3) but c2 does,
+    // so num_prefix_columns_that_need_not_be_filtered() will be 1.
+    virtual unsigned int num_prefix_columns_that_need_not_be_filtered() const {
+        return 0;
+    }
+
    virtual bool is_all_eq() const {
        return false;
    }
    virtual size_t prefix_size() const {
        return 0;
    }
+
+    size_t prefix_size(const schema_ptr schema) const {
+        return 0;
+    }
+
 };

 template<>
@@ -129,5 +145,23 @@ inline bool primary_key_restrictions<clustering_key>::needs_filtering(const sche
    return false;
 }

+template<>
+inline size_t primary_key_restrictions<clustering_key>::prefix_size(const schema_ptr schema) const {
+    size_t count = 0;
+    if (schema->clustering_key_columns().empty()) {
+        return count;
+    }
+    auto column_defs = get_column_defs();
+    column_id expected_column_id = schema->clustering_key_columns().begin()->id;
+    for (auto&& cdef : column_defs) {
+        if (schema->position(*cdef) != expected_column_id) {
+            return count;
+        }
+        expected_column_id++;
+        count++;
+    }
+    return count;
+}
+
 }
 }
--- a/cql3/restrictions/single_column_primary_key_restrictions.hh
+++ b/cql3/restrictions/single_column_primary_key_restrictions.hh
@@ -166,19 +166,7 @@ public:
    }

    virtual size_t prefix_size() const override {
-        size_t count = 0;
-        if (_schema->clustering_key_columns().empty()) {
-            return count;
-        }
-        column_id expected_column_id = _schema->clustering_key_columns().begin()->id;
-        for (const auto& restriction_entry : _restrictions->restrictions()) {
-            if (_schema->position(*restriction_entry.first) != expected_column_id) {
-                return count;
-            }
-            expected_column_id++;
-            count++;
-        }
-        return count;
+        return primary_key_restrictions<ValueType>::prefix_size(_schema);
    }

    ::shared_ptr<single_column_primary_key_restrictions<clustering_key>> get_longest_prefix_restrictions() {
@@ -419,6 +407,7 @@ public:
    }

    virtual bool needs_filtering(const schema& schema) const override;
+    virtual unsigned int num_prefix_columns_that_need_not_be_filtered() const override;
 };

 template<>
@@ -499,6 +488,39 @@ inline bool single_column_primary_key_restrictions<clustering_key>::needs_filter
    return false;
 }

+// How many of the restrictions (in column order) do not need filtering
+// because they are implemented as a slice (potentially, a contiguous disk
+// read). For example, if we have the filter "c1 < 3 and c2 > 3", c1 does not
+// need filtering but c2 does so num_prefix_columns_that_need_not_be_filtered
+// will be 1.
+// The implementation of num_prefix_columns_that_need_not_be_filtered() is
+// closely tied to that of needs_filtering() above - basically, if only the
+// first num_prefix_columns_that_need_not_be_filtered() restrictions existed,
+// then needs_filtering() would have returned false.
+template<>
+inline unsigned single_column_primary_key_restrictions<clustering_key>::num_prefix_columns_that_need_not_be_filtered() const {
+    column_id position = 0;
+    unsigned int count = 0;
+    for (const auto& restriction : _restrictions->restrictions() | boost::adaptors::map_values) {
+        if (restriction->is_contains() || position != restriction->get_column_def().id) {
+            return count;
+        }
+        if (!restriction->is_slice()) {
+            position = restriction->get_column_def().id + 1;
+        }
+        count++;
+    }
+    return count;
+}
+
+template<>
+inline unsigned single_column_primary_key_restrictions<partition_key>::num_prefix_columns_that_need_not_be_filtered() const {
+    // skip_filtering() is currently called only for clustering key
+    // restrictions, so it doesn't matter what we return here.
+    return 0;
+}
+
+
 }
 }

--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -337,6 +337,52 @@ const std::vector<::shared_ptr<restrictions>>& statement_restrictions::index_res
    return _index_restrictions;
 }

+std::optional<secondary_index::index> statement_restrictions::find_idx(secondary_index::secondary_index_manager& sim) const {
+    for (::shared_ptr<cql3::restrictions::restrictions> restriction : index_restrictions()) {
+        for (const auto& cdef : restriction->get_column_defs()) {
+            for (auto index : sim.list_indexes()) {
+                if (index.depends_on(*cdef)) {
+                    return std::make_optional<secondary_index::index>(std::move(index));
+                }
+            }
+        }
+    }
+    return std::nullopt;
+}
+
+std::vector<const column_definition*> statement_restrictions::get_column_defs_for_filtering(database& db) const {
+    std::vector<const column_definition*> column_defs_for_filtering;
+    if (need_filtering()) {
+        auto& sim = db.find_column_family(_schema).get_index_manager();
+        std::optional<secondary_index::index> opt_idx = find_idx(sim);
+        auto column_uses_indexing = [&opt_idx] (const column_definition* cdef) {
+            return opt_idx && opt_idx->depends_on(*cdef);
+        };
+        if (_partition_key_restrictions->needs_filtering(*_schema)) {
+            for (auto&& cdef : _partition_key_restrictions->get_column_defs()) {
+                if (!column_uses_indexing(cdef)) {
+                    column_defs_for_filtering.emplace_back(cdef);
+                }
+            }
+        }
+        if (_clustering_columns_restrictions->needs_filtering(*_schema)) {
+            column_id first_filtering_id = _schema->clustering_key_columns().begin()->id +
+                    _clustering_columns_restrictions->num_prefix_columns_that_need_not_be_filtered();
+            for (auto&& cdef : _clustering_columns_restrictions->get_column_defs()) {
+                if (cdef->id >= first_filtering_id && !column_uses_indexing(cdef)) {
+                    column_defs_for_filtering.emplace_back(cdef);
+                }
+            }
+        }
+        for (auto&& cdef : _nonprimary_key_restrictions->get_column_defs()) {
+            if (!column_uses_indexing(cdef)) {
+                column_defs_for_filtering.emplace_back(cdef);
+            }
+        }
+    }
+    return column_defs_for_filtering;
+}
+
 void statement_restrictions::process_partition_key_restrictions(bool has_queriable_index, bool for_view, bool allow_filtering) {
    // If there is a queriable index, no special condition are required on the other restrictions.
    // But we still need to know 2 things:
--- a/cql3/restrictions/statement_restrictions.hh
+++ b/cql3/restrictions/statement_restrictions.hh
@@ -163,6 +163,20 @@ public:
        return _clustering_columns_restrictions;
    }

+    /**
+     * Builds a possibly empty collection of column definitions that will be used for filtering
+     * @param db - the database context
+     * @return A list with the column definitions needed for filtering.
+     */
+    std::vector<const column_definition*> get_column_defs_for_filtering(database& db) const;
+
+    /**
+     * Determines the index to be used with the restriction.
+     * @param sim - the secondary index manager whose indexes are searched
+     * @return If an index can be used, an optional containing this index, otherwise an empty optional.
+     */
+    std::optional<secondary_index::index> find_idx(secondary_index::secondary_index_manager& sim) const;
+
    /**
     * Checks if the partition key has some unrestricted components.
     * @return <code>true</code> if the partition key has some unrestricted components, <code>false</code> otherwise.
--- a/cql3/selection/selection.cc
+++ b/cql3/selection/selection.cc
@@ -156,9 +156,9 @@ public:
        return _factories->uses_function(ks_name, function_name);
    }

-    virtual uint32_t add_column_for_ordering(const column_definition& c) override {
-        uint32_t index = selection::add_column_for_ordering(c);
-        _factories->add_selector_for_ordering(c, index);
+    virtual uint32_t add_column_for_post_processing(const column_definition& c) override {
+        uint32_t index = selection::add_column_for_post_processing(c);
+        _factories->add_selector_for_post_processing(c, index);
        return index;
    }

@@ -227,7 +227,7 @@ protected:
    return simple_selection::make(schema, std::move(columns), false);
 }

-uint32_t selection::add_column_for_ordering(const column_definition& c) {
+uint32_t selection::add_column_for_post_processing(const column_definition& c) {
    _columns.push_back(&c);
    _metadata->add_non_serialized_column(c.column_specification);
    return _columns.size() - 1;
@@ -339,14 +339,14 @@ std::unique_ptr<result_set> result_set_builder::build() {
    return std::move(_result_set);
 }

-bool result_set_builder::restrictions_filter::operator()(const selection& selection,
+bool result_set_builder::restrictions_filter::do_filter(const selection& selection,
                                                         const std::vector<bytes>& partition_key,
                                                         const std::vector<bytes>& clustering_key,
                                                         const query::result_row_view& static_row,
                                                         const query::result_row_view& row) const {
    static logging::logger rlogger("restrictions_filter");

-    if (_current_partition_key_does_not_match || _current_static_row_does_not_match) {
+    if (_current_partition_key_does_not_match || _current_static_row_does_not_match || _remaining == 0) {
        return false;
    }

@@ -427,6 +427,20 @@ bool result_set_builder::restrictions_filter::operator()(const selection& select
    return true;
 }

+bool result_set_builder::restrictions_filter::operator()(const selection& selection,
+                                                         const std::vector<bytes>& partition_key,
+                                                         const std::vector<bytes>& clustering_key,
+                                                         const query::result_row_view& static_row,
+                                                         const query::result_row_view& row) const {
+    const bool accepted = do_filter(selection, partition_key, clustering_key, static_row, row);
+    if (!accepted) {
+        ++_rows_dropped;
+    } else if (_remaining > 0) {
+        --_remaining;
+    }
+    return accepted;
+}
+
 api::timestamp_type result_set_builder::timestamp_of(size_t idx) {
    return _timestamps[idx];
 }
--- a/cql3/selection/selection.hh
+++ b/cql3/selection/selection.hh
@@ -176,7 +176,7 @@ public:
    static ::shared_ptr<selection> wildcard(schema_ptr schema);
    static ::shared_ptr<selection> for_columns(schema_ptr schema, std::vector<const column_definition*> columns);

-    virtual uint32_t add_column_for_ordering(const column_definition& c);
+    virtual uint32_t add_column_for_post_processing(const column_definition& c);

    virtual bool uses_function(const sstring &ks_name, const sstring& function_name) const {
        return false;
@@ -259,20 +259,31 @@ public:
        }
        void reset() {
        }
+        uint32_t get_rows_dropped() const {
+            return 0;
+        }
    };
    class restrictions_filter {
        ::shared_ptr<restrictions::statement_restrictions> _restrictions;
        const query_options& _options;
        mutable bool _current_partition_key_does_not_match = false;
        mutable bool _current_static_row_does_not_match = false;
+        mutable uint32_t _rows_dropped = 0;
+        mutable uint32_t _remaining = 0;
    public:
        restrictions_filter() = default;
-        explicit restrictions_filter(::shared_ptr<restrictions::statement_restrictions> restrictions, const query_options& options) : _restrictions(restrictions), _options(options) {}
+        explicit restrictions_filter(::shared_ptr<restrictions::statement_restrictions> restrictions, const query_options& options, uint32_t remaining) : _restrictions(restrictions), _options(options), _remaining(remaining) {}
        bool operator()(const selection& selection, const std::vector<bytes>& pk, const std::vector<bytes>& ck, const query::result_row_view& static_row, const query::result_row_view& row) const;
        void reset() {
            _current_partition_key_does_not_match = false;
            _current_static_row_does_not_match = false;
+            _rows_dropped = 0;
        }
+        uint32_t get_rows_dropped() const {
+            return _rows_dropped;
+        }
+    private:
+        bool do_filter(const selection& selection, const std::vector<bytes>& pk, const std::vector<bytes>& ck, const query::result_row_view& static_row, const query::result_row_view& row) const;
    };

    result_set_builder(const selection& s, gc_clock::time_point now, cql_serialization_format sf);
@@ -372,7 +383,7 @@ public:
            }
        }

-        void accept_partition_end(const query::result_row_view& static_row) {
+        uint32_t accept_partition_end(const query::result_row_view& static_row) {
            if (_row_count == 0) {
                _builder.new_row();
                auto static_row_iterator = static_row.iterator();
@@ -386,6 +397,7 @@ public:
                    }
                }
            }
+            return _filter.get_rows_dropped();
        }
    };

--- a/cql3/selection/selector_factories.cc
+++ b/cql3/selection/selector_factories.cc
@@ -53,6 +53,7 @@ selector_factories::selector_factories(std::vector<::shared_ptr<selectable>> sel
    : _contains_write_time_factory(false)
    , _contains_ttl_factory(false)
    , _number_of_aggregate_factories(0)
+    , _number_of_factories_for_post_processing(0)
 {
    _factories.reserve(selectables.size());

@@ -76,8 +77,9 @@ bool selector_factories::uses_function(const sstring& ks_name, const sstring& fu
    return false;
 }

-void selector_factories::add_selector_for_ordering(const column_definition& def, uint32_t index) {
+void selector_factories::add_selector_for_post_processing(const column_definition& def, uint32_t index) {
    _factories.emplace_back(simple_selector::new_factory(def.name_as_text(), index, def.type));
+    ++_number_of_factories_for_post_processing;
 }

 std::vector<::shared_ptr<selector>> selector_factories::new_instances() const {
--- a/cql3/selection/selector_factories.hh
+++ b/cql3/selection/selector_factories.hh
@@ -74,6 +74,11 @@ private:
     */
    uint32_t _number_of_aggregate_factories;

+    /**
+     * The number of factories that are only for post processing.
+     */
+    uint32_t _number_of_factories_for_post_processing;
+
 public:
    /**
     * Creates a new <code>SelectorFactories</code> instance and collect the column definitions.
@@ -97,11 +102,12 @@ public:
    bool uses_function(const sstring& ks_name, const sstring& function_name) const;

    /**
-     * Adds a new <code>Selector.Factory</code> for a column that is needed only for ORDER BY purposes.
+     * Adds a new <code>Selector.Factory</code> for a column that is needed only for ORDER BY or post
+     * processing purposes.
     * @param def the column that is needed for ordering
     * @param index the index of the column definition in the Selection's list of columns
     */
-    void add_selector_for_ordering(const column_definition& def, uint32_t index);
+    void add_selector_for_post_processing(const column_definition& def, uint32_t index);

    /**
     * Checks if this <code>SelectorFactories</code> contains only factories for aggregates.
@@ -111,7 +117,7 @@ public:
     */
    bool contains_only_aggregate_functions() const {
        auto size = _factories.size();
-        return size != 0 && _number_of_aggregate_factories == size;
+        return size != 0 && _number_of_aggregate_factories  == (size - _number_of_factories_for_post_processing);
    }

    /**
--- a/cql3/statements/alter_table_statement.cc
+++ b/cql3/statements/alter_table_statement.cc
@@ -276,7 +276,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a

        auto type = validate_alter(schema, *def, *validator);
        // In any case, we update the column definition
-        cfm.with_altered_column_type(column_name->name(), type);
+        cfm.alter_column_type(column_name->name(), type);

        // We also have to validate the view types here. If we have a view which includes a column as part of
        // the clustering key, we need to make sure that it is indeed compatible.
@@ -285,7 +285,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
            if (view_def) {
                schema_builder builder(view);
                auto view_type = validate_alter(view, *view_def, *validator);
-                builder.with_altered_column_type(column_name->name(), std::move(view_type));
+                builder.alter_column_type(column_name->name(), std::move(view_type));
                view_updates.push_back(view_ptr(builder.build()));
            }
        }
@@ -306,7 +306,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
        } else {
            for (auto&& column_def : boost::range::join(schema->static_columns(), schema->regular_columns())) { // find
                if (column_def.name() == column_name->name()) {
-                    cfm.without_column(column_name->name());
+                    cfm.remove_column(column_name->name());
                    break;
                }
            }
@@ -349,7 +349,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
            auto to = entry.second->prepare_column_identifier(schema);

            validate_column_rename(db, *schema, *from, *to);
-            cfm.with_column_rename(from->name(), to->name());
+            cfm.rename_column(from->name(), to->name());

            // If the view includes a renamed column, it must be renamed in
            // the view table and the definition.
@@ -360,7 +360,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
                    auto view_from = entry.first->prepare_column_identifier(view);
                    auto view_to = entry.second->prepare_column_identifier(view);
                    validate_column_rename(db, *view, *view_from, *view_to);
-                    builder.with_column_rename(view_from->name(), view_to->name());
+                    builder.rename_column(view_from->name(), view_to->name());

                    auto new_where = util::rename_column_in_where_clause(
                            view->view_info()->where_clause(),
--- a/cql3/statements/alter_type_statement.cc
+++ b/cql3/statements/alter_type_statement.cc
@@ -110,7 +110,7 @@ void alter_type_statement::do_announce_migration(database& db, ::keyspace& ks, b
            if (t_opt) {
                modified = true;
                // We need to update this column
-                cfm.with_altered_column_type(column.name(), *t_opt);
+                cfm.alter_column_type(column.name(), *t_opt);
            }
        }
        if (modified) {
--- a/cql3/statements/create_index_statement.cc
+++ b/cql3/statements/create_index_statement.cc
@@ -88,6 +88,11 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
        throw exceptions::invalid_request_exception("Secondary indexes are not supported on materialized views");
    }

+    if (schema->is_dense()) {
+        throw exceptions::invalid_request_exception(
+                "Secondary indexes are not supported on COMPACT STORAGE tables that have clustering columns");
+    }
+
    std::vector<::shared_ptr<index_target>> targets;
    for (auto& raw_target : _raw_targets) {
        targets.emplace_back(raw_target->prepare(schema));
@@ -109,6 +114,11 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
                    sprint("No column definition found for column %s", *target->column));
        }

+        //NOTICE(sarna): Should be lifted after resolving issue #2963
+        if (cd->is_static()) {
+            throw exceptions::invalid_request_exception("Indexing static columns is not implemented yet.");
+        }
+
        if (cd->type->references_duration()) {
            using request_validations::check_false;
            const auto& ty = *cd->type;
@@ -122,8 +132,7 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
        }

        // Origin TODO: we could lift that limitation
-        if ((schema->is_dense() || !schema->thrift().has_compound_comparator()) &&
-            cd->kind != column_kind::regular_column) {
+        if ((schema->is_dense() || !schema->thrift().has_compound_comparator()) && cd->is_primary_key()) {
            throw exceptions::invalid_request_exception(
                    "Secondary indexes are not supported on PRIMARY KEY columns in COMPACT STORAGE tables");
        }
@@ -137,10 +146,15 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c

        bool is_map = dynamic_cast<const collection_type_impl *>(cd->type.get()) != nullptr
                      && dynamic_cast<const collection_type_impl *>(cd->type.get())->is_map();
-        bool is_frozen_collection = cd->type->is_collection() && !cd->type->is_multi_cell();
+        bool is_collection = cd->type->is_collection();
+        bool is_frozen_collection = is_collection && !cd->type->is_multi_cell();

        if (is_frozen_collection) {
            validate_for_frozen_collection(target);
+        } else if (is_collection) {
+            // NOTICE(sarna): should be lifted after #2962 (indexes on non-frozen collections) is implemented
+            throw exceptions::invalid_request_exception(
+                    sprint("Cannot create secondary index on non-frozen collection column %s", cd->name_as_text()));
        } else {
            validate_not_full_index(target);
            validate_is_values_index_if_target_column_not_collection(cd, target);
--- a/cql3/statements/create_view_statement.cc
+++ b/cql3/statements/create_view_statement.cc
@@ -84,7 +84,6 @@ create_view_statement::create_view_statement(
    , _clustering_keys{clustering_keys}
    , _if_not_exists{if_not_exists}
 {
-    service::get_local_storage_proxy().get_db().local().get_config().check_experimental("Creating materialized views");
    if (!service::get_local_storage_service().cluster_supports_materialized_views()) {
        throw exceptions::invalid_request_exception("Can't create materialized views until the whole cluster has been upgraded");
    }
@@ -315,6 +314,27 @@ future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::a
        throw exceptions::invalid_request_exception(sprint("No columns are defined for Materialized View other than primary key"));
    }

+    // The unique feature of a filter by a non-key column is that the
+    // value of such column can be updated - and also be expired with TTL
+    // and cause the view row to appear and disappear. We don't currently
+    // support this case - see issue #3430, and neither does
+    // Cassandra - see CASSANDRA-13798 and CASSANDRA-13832.
+    // Actually, as CASSANDRA-13798 explains, the problem is "the liveness of
+    // view row is now depending on multiple base columns (multiple filtered
+    // non-pk base column + base column used in view pk)". When the filtered
+    // column *is* the base column added to the view pk, we don't have this
+    // problem. And this case actually works correctly.
+    auto non_pk_restrictions = restrictions->get_non_pk_restriction();
+    if (non_pk_restrictions.size() == 1 && has_non_pk_column &&
+            std::find(target_primary_keys.begin(), target_primary_keys.end(), non_pk_restrictions.cbegin()->first) != target_primary_keys.end()) {
+        // This case (filter by new PK column of the view) works, as explained above
+    } else if (!non_pk_restrictions.empty()) {
+        auto column_names = ::join(", ", non_pk_restrictions | boost::adaptors::map_keys | boost::adaptors::transformed(std::mem_fn(&column_definition::name_as_text)));
+        throw exceptions::invalid_request_exception(sprint(
+                "Non-primary key columns cannot be restricted in the SELECT statement used for materialized view %s creation (got restrictions on: %s)",
+                column_family(), column_names));
+    }
+
    schema_builder builder{keyspace(), column_family()};
    auto add_columns = [this, &builder] (std::vector<const column_definition*>& defs, column_kind kind) mutable {
        for (auto* def : defs) {
--- a/cql3/statements/index_prop_defs.cc
+++ b/cql3/statements/index_prop_defs.cc
@@ -49,7 +49,7 @@ void cql3::statements::index_prop_defs::validate() {
    property_definitions::validate(keywords);

    if (is_custom && !custom_class) {
-        throw exceptions::invalid_request_exception("CUSTOM index requires specifiying the index class");
+        throw exceptions::invalid_request_exception("CUSTOM index requires specifying the index class");
    }

    if (!is_custom && custom_class) {
@@ -64,6 +64,16 @@ void cql3::statements::index_prop_defs::validate() {
                sprint("Cannot specify %s as a CUSTOM option",
                        db::index::secondary_index::custom_index_option_name));
    }
+
+    // Currently, Scylla does not support *any* class of custom index
+    // implementation. If in the future we do (e.g., SASI, or something
+    // new), we'll need to check for valid values here.
+    if (is_custom && custom_class) {
+        throw exceptions::invalid_request_exception(
+                format("Unsupported CUSTOM INDEX class {}. Note that currently, Scylla does not support SASI or any other CUSTOM INDEX class.",
+                        *custom_class));
+
+    }
 }

 index_options_map
--- a/cql3/statements/raw/insert_statement.hh
+++ b/cql3/statements/raw/insert_statement.hh
@@ -87,6 +87,7 @@ private:
    ::shared_ptr<attributes::raw> _attrs;
    ::shared_ptr<term::raw> _json_value;
    bool _if_not_exists;
+    bool _default_unset;
 public:
    /**
     * A parsed <code>INSERT JSON</code> statement.
@@ -95,7 +96,7 @@ public:
     * @param json_value JSON string representing names and values
     * @param attrs additional attributes for statement (CL, timestamp, timeToLive)
     */
-    insert_json_statement(::shared_ptr<cf_name> name, ::shared_ptr<attributes::raw> attrs, ::shared_ptr<term::raw> json_value, bool if_not_exists);
+    insert_json_statement(::shared_ptr<cf_name> name, ::shared_ptr<attributes::raw> attrs, ::shared_ptr<term::raw> json_value, bool if_not_exists, bool default_unset);

    virtual ::shared_ptr<cql3::statements::modification_statement> prepare_internal(database& db, schema_ptr schema,
                ::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs, cql_stats& stats) override;
--- a/cql3/statements/raw/select_statement.hh
+++ b/cql3/statements/raw/select_statement.hh
@@ -141,6 +141,10 @@ private:
    /** If ALLOW FILTERING was not specified, this verifies that it is not needed */
    void check_needs_filtering(::shared_ptr<restrictions::statement_restrictions> restrictions);

+    void ensure_filtering_columns_retrieval(database& db,
+                                            ::shared_ptr<selection::selection> selection,
+                                            ::shared_ptr<restrictions::statement_restrictions> restrictions);
+
    bool contains_alias(::shared_ptr<column_identifier> name);

    ::shared_ptr<column_specification> limit_receiver();
--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -383,8 +383,9 @@ select_statement::do_execute(service::storage_proxy& proxy,
    int32_t limit = get_limit(options);
    auto now = gc_clock::now();

+    const bool restrictions_need_filtering = _restrictions->need_filtering();
    ++_stats.reads;
-    _stats.filtered_reads += _restrictions->need_filtering();
+    _stats.filtered_reads += restrictions_need_filtering;

    auto command = ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(),
        make_partition_slice(options), limit, now, tracing::make_trace_info(state.get_trace_state()), query::max_partitions, utils::UUID(), options.get_timestamp(state));
@@ -396,37 +397,41 @@ select_statement::do_execute(service::storage_proxy& proxy,
    // An aggregation query will never be paged for the user, but we always page it internally to avoid OOM.
    // If we user provided a page_size we'll use that to page internally (because why not), otherwise we use our default
    // Note that if there are some nodes in the cluster with a version less than 2.0, we can't use paging (CASSANDRA-6707).
-    auto aggregate = _selection->is_aggregate();
-    if (aggregate && page_size <= 0) {
+    const bool aggregate = _selection->is_aggregate();
+    const bool nonpaged_filtering = restrictions_need_filtering && page_size <= 0;
+    if (aggregate || nonpaged_filtering) {
        page_size = DEFAULT_COUNT_PAGE_SIZE;
    }

    auto key_ranges = _restrictions->get_partition_key_ranges(options);

-    if (!aggregate && (page_size <= 0
+    if (!aggregate && !restrictions_need_filtering && (page_size <= 0
            || !service::pager::query_pagers::may_need_paging(*_schema, page_size,
                    *command, key_ranges))) {
        return execute(proxy, command, std::move(key_ranges), state, options, now);
    }

    command->slice.options.set<query::partition_slice::option::allow_short_read>();
-    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
+    auto timeout_duration = options.get_timeout_config().*get_timeout_config_selector();
    auto p = service::pager::query_pagers::pager(_schema, _selection,
-            state, options, command, std::move(key_ranges), _stats, _restrictions->need_filtering() ? _restrictions : nullptr);
+            state, options, command, std::move(key_ranges), _stats, restrictions_need_filtering ? _restrictions : nullptr);

-    if (aggregate) {
+    if (aggregate || nonpaged_filtering) {
        return do_with(
                cql3::selection::result_set_builder(*_selection, now,
                        options.get_cql_serialization_format()),
-                [this, p, page_size, now, timeout](auto& builder) {
+                [this, p, page_size, now, timeout_duration, restrictions_need_filtering](auto& builder) {
                    return do_until([p] {return p->is_exhausted();},
-                            [p, &builder, page_size, now, timeout] {
+                            [p, &builder, page_size, now, timeout_duration] {
+                                auto timeout = db::timeout_clock::now() + timeout_duration;
                                return p->fetch_page(builder, page_size, now, timeout);
                            }
-                    ).then([this, &builder] {
+                    ).then([this, &builder, restrictions_need_filtering] {
                                auto rs = builder.build();
+                                if (restrictions_need_filtering) {
+                                    _stats.filtered_rows_matched_total += rs->size();
+                                }
                                update_stats_rows_read(rs->size());
-                                _stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
                                auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
                                return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
                            });
@@ -439,7 +444,8 @@ select_statement::do_execute(service::storage_proxy& proxy,
                        " you must either remove the ORDER BY or the IN and sort client side, or disable paging for this query");
    }

-    if (_selection->is_trivial() && !_restrictions->need_filtering()) {
+    auto timeout = db::timeout_clock::now() + timeout_duration;
+    if (_selection->is_trivial() && !restrictions_need_filtering) {
        return p->fetch_page_generator(page_size, now, timeout, _stats).then([this, p, limit] (result_generator generator) {
            auto meta = [&] () -> shared_ptr<const cql3::metadata> {
                if (!p->is_exhausted()) {
@@ -458,14 +464,16 @@ select_statement::do_execute(service::storage_proxy& proxy,
    }

    return p->fetch_page(page_size, now, timeout).then(
-            [this, p, &options, limit, now](std::unique_ptr<cql3::result_set> rs) {
+            [this, p, &options, now, restrictions_need_filtering](std::unique_ptr<cql3::result_set> rs) {

                if (!p->is_exhausted()) {
                    rs->get_metadata().set_paging_state(p->state());
                }

+                if (restrictions_need_filtering) {
+                    _stats.filtered_rows_matched_total += rs->size();
+                }
                update_stats_rows_read(rs->size());
-                _stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
                auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
                return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
            });
@@ -492,15 +500,9 @@ generate_base_key_from_index_pk(const partition_key& index_pk, const clustering_
    return KeyType::from_range(exploded_base_key);
 }

-future<shared_ptr<cql_transport::messages::result_message>>
-indexed_table_select_statement::execute_base_query(
-        service::storage_proxy& proxy,
-        dht::partition_range_vector&& partition_ranges,
-        service::query_state& state,
-        const query_options& options,
-        gc_clock::time_point now,
-        ::shared_ptr<const service::pager::paging_state> paging_state) {
-    auto cmd = ::make_lw_shared<query::read_command>(
+lw_shared_ptr<query::read_command>
+indexed_table_select_statement::prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging) {
+    lw_shared_ptr<query::read_command> cmd = ::make_lw_shared<query::read_command>(
            _schema->id(),
            _schema->version(),
            make_partition_slice(options),
@@ -510,9 +512,25 @@ indexed_table_select_statement::execute_base_query(
            query::max_partitions,
            utils::UUID(),
            options.get_timestamp(state));
-    if (options.get_page_size() > 0) {
+    if (use_paging) {
        cmd->slice.options.set<query::partition_slice::option::allow_short_read>();
+        cmd->slice.options.set<query::partition_slice::option::send_partition_key>();
+        if (_schema->clustering_key_size() > 0) {
+            cmd->slice.options.set<query::partition_slice::option::send_clustering_key>();
+        }
    }
+    return cmd;
+}
+
+future<shared_ptr<cql_transport::messages::result_message>>
+indexed_table_select_statement::execute_base_query(
+        service::storage_proxy& proxy,
+        dht::partition_range_vector&& partition_ranges,
+        service::query_state& state,
+        const query_options& options,
+        gc_clock::time_point now,
+        ::shared_ptr<const service::pager::paging_state> paging_state) {
+    auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state));
    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
    dht::partition_range_vector per_vnode_ranges;
    per_vnode_ranges.reserve(partition_ranges.size());
@@ -586,19 +604,7 @@ indexed_table_select_statement::execute_base_query(
        const query_options& options,
        gc_clock::time_point now,
        ::shared_ptr<const service::pager::paging_state> paging_state) {
-    auto cmd = make_lw_shared<query::read_command>(
-            _schema->id(),
-            _schema->version(),
-            make_partition_slice(options),
-            get_limit(options),
-            now,
-            tracing::make_trace_info(state.get_trace_state()),
-            query::max_partitions,
-            utils::UUID(),
-            options.get_timestamp(state));
-    if (options.get_page_size() > 0) {
-        cmd->slice.options.set<query::partition_slice::option::allow_short_read>();
-    }
+    auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state));
    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();

    struct base_query_state {
@@ -714,7 +720,8 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
                                  const query_options& options,
                                  gc_clock::time_point now)
 {
-    bool fast_path = !needs_post_query_ordering() && _selection->is_trivial() && !_restrictions->need_filtering();
+    const bool restrictions_need_filtering = _restrictions->need_filtering();
+    const bool fast_path = !needs_post_query_ordering() && _selection->is_trivial() && !restrictions_need_filtering;
    if (fast_path) {
        return make_shared<cql_transport::messages::result_message::rows>(result(
            result_generator(_schema, std::move(results), std::move(cmd), _selection, _stats),
@@ -724,12 +731,12 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu

    cql3::selection::result_set_builder builder(*_selection, now,
            options.get_cql_serialization_format());
-    if (_restrictions->need_filtering()) {
+    if (restrictions_need_filtering) {
        results->ensure_counts();
        _stats.filtered_rows_read_total += *results->row_count();
        query::result_view::consume(*results, cmd->slice,
                cql3::selection::result_set_builder::visitor(builder, *_schema,
-                        *_selection, cql3::selection::result_set_builder::restrictions_filter(_restrictions, options)));
+                        *_selection, cql3::selection::result_set_builder::restrictions_filter(_restrictions, options, cmd->row_limit)));
    } else {
        query::result_view::consume(*results, cmd->slice,
                cql3::selection::result_set_builder::visitor(builder, *_schema,
@@ -745,7 +752,7 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
        rs->trim(cmd->row_limit);
    }
    update_stats_rows_read(rs->size());
-    _stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
+    _stats.filtered_rows_matched_total += restrictions_need_filtering ? rs->size() : 0;
    return ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
 }

@@ -774,7 +781,8 @@ indexed_table_select_statement::prepare(database& db,
                                        ordering_comparator_type ordering_comparator,
                                        ::shared_ptr<term> limit, cql_stats &stats)
 {
-    auto index_opt = find_idx(db, schema, restrictions);
+    auto& sim = db.find_column_family(schema).get_index_manager();
+    auto index_opt = restrictions->find_idx(sim);
    if (!index_opt) {
        throw std::runtime_error("No index found.");
    }
@@ -798,24 +806,6 @@ indexed_table_select_statement::prepare(database& db,

 }

-
-stdx::optional<secondary_index::index> indexed_table_select_statement::find_idx(database& db,
-                                                                                schema_ptr schema,
-                                                                                ::shared_ptr<restrictions::statement_restrictions> restrictions)
-{
-    auto& sim = db.find_column_family(schema).get_index_manager();
-    for (::shared_ptr<cql3::restrictions::restrictions> restriction : restrictions->index_restrictions()) {
-        for (const auto& cdef : restriction->get_column_defs()) {
-            for (auto index : sim.list_indexes()) {
-                if (index.depends_on(*cdef)) {
-                    return stdx::make_optional<secondary_index::index>(std::move(index));
-                }
-            }
-        }
-    }
-    return stdx::nullopt;
-}
-
 indexed_table_select_statement::indexed_table_select_statement(schema_ptr schema, uint32_t bound_terms,
                                                           ::shared_ptr<parameters> parameters,
                                                           ::shared_ptr<selection::selection> selection,
@@ -882,7 +872,6 @@ static void append_base_key_to_index_ck(std::vector<bytes_view>& exploded_index_
    auto paging_state_copy = ::make_shared<service::pager::paging_state>(service::pager::paging_state(*paging_state));
    paging_state_copy->set_partition_key(std::move(index_pk));
    paging_state_copy->set_clustering_key(std::move(index_ck));
-    paging_state_copy->set_remaining(query::max_rows);
    return std::move(paging_state_copy);
 }

@@ -1219,6 +1208,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(database& db, cql_
    }

    check_needs_filtering(restrictions);
+    ensure_filtering_columns_retrieval(db, selection, restrictions);

    ::shared_ptr<cql3::statements::select_statement> stmt;
    if (restrictions->uses_secondary_indexing()) {
@@ -1357,7 +1347,7 @@ select_statement::get_ordering_comparator(schema_ptr schema,
        }
        auto index = selection->index_of(*def);
        if (index < 0) {
-            index = selection->add_column_for_ordering(*def);
+            index = selection->add_column_for_post_processing(*def);
        }

        sorters.emplace_back(index, def->type);
@@ -1444,6 +1434,23 @@ void select_statement::check_needs_filtering(::shared_ptr<restrictions::statemen
    }
 }

+/**
+ * Ensures that every column needed for filtering is part of the selection.
+ * Columns that are needed for filtering on the coordinator but are not
+ * already part of the selection are added to it for post-processing.
+ * These columns are added with metadata indicating that they must not be
+ * returned to the user.
+ */
+void select_statement::ensure_filtering_columns_retrieval(database& db,
+                                        ::shared_ptr<selection::selection> selection,
+                                        ::shared_ptr<restrictions::statement_restrictions> restrictions) {
+    for (auto&& cdef : restrictions->get_column_defs_for_filtering(db)) {
+        if (!selection->has_column(*cdef)) {
+            selection->add_column_for_post_processing(*cdef);
+        }
+    }
+}
+
 bool select_statement::contains_alias(::shared_ptr<column_identifier> name) {
    return std::any_of(_select_clause.begin(), _select_clause.end(), [name] (auto raw) {
        return raw->alias && *name == *raw->alias;
--- a/cql3/statements/select_statement.hh
+++ b/cql3/statements/select_statement.hh
@@ -186,10 +186,6 @@ public:
                                   schema_ptr view_schema);

 private:
-    static stdx::optional<secondary_index::index> find_idx(database& db,
-                                                           schema_ptr schema,
-                                                           ::shared_ptr<restrictions::statement_restrictions> restrictions);
-
    virtual future<::shared_ptr<cql_transport::messages::result_message>> do_execute(service::storage_proxy& proxy,
                                                                                     service::query_state& state, const query_options& options) override;

@@ -214,6 +210,9 @@ private:
            gc_clock::time_point now,
            ::shared_ptr<const service::pager::paging_state> paging_state);

+    lw_shared_ptr<query::read_command>
+    prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging);
+
    future<shared_ptr<cql_transport::messages::result_message>>
    execute_base_query(
            service::storage_proxy& proxy,
--- a/cql3/statements/update_statement.cc
+++ b/cql3/statements/update_statement.cc
@@ -84,8 +84,11 @@ parse(const sstring& json_string, const std::vector<column_definition>& expected
    for (const auto& def : expected_receivers) {
        sstring cql_name = def.name_as_text();
        auto value_it = prepared_map.find(cql_name);
-        if (value_it == prepared_map.end() || value_it->second.isNull()) {
+        if (value_it == prepared_map.end()) {
+            continue;
+        } else if (value_it->second.isNull()) {
            json_map.emplace(std::move(cql_name), bytes_opt{});
+            prepared_map.erase(value_it);
        } else {
            json_map.emplace(std::move(cql_name), def.type->from_json_object(value_it->second, sf));
            prepared_map.erase(value_it);
@@ -255,8 +258,12 @@ void insert_prepared_json_statement::execute_operations_for_key(mutation& m, con
            throw exceptions::invalid_request_exception(sprint("Cannot set the value of counter column %s in JSON", def.name_as_text()));
        }

-        auto value = json_cache->at(def.name_as_text());
-        execute_set_value(m, prefix, params, def, value);
+        auto it = json_cache->find(def.name_as_text());
+        if (it != json_cache->end()) {
+            execute_set_value(m, prefix, params, def, it->second);
+        } else if (!_default_unset) {
+            execute_set_value(m, prefix, params, def, bytes_opt{});
+        }
    }
 }

@@ -322,12 +329,14 @@ insert_statement::prepare_internal(database& db, schema_ptr schema,
 insert_json_statement::insert_json_statement(  ::shared_ptr<cf_name> name,
                                               ::shared_ptr<attributes::raw> attrs,
                                               ::shared_ptr<term::raw> json_value,
-                                               bool if_not_exists)
+                                               bool if_not_exists,
+                                               bool default_unset)
    : raw::modification_statement{name, attrs, conditions_vector{}, if_not_exists, false}
    , _name(name)
    , _attrs(attrs)
    , _json_value(json_value)
-    , _if_not_exists(if_not_exists) { }
+    , _if_not_exists(if_not_exists)
+    , _default_unset(default_unset) { }

 ::shared_ptr<cql3::statements::modification_statement>
 insert_json_statement::prepare_internal(database& db, schema_ptr schema,
@@ -337,7 +346,7 @@ insert_json_statement::prepare_internal(database& db, schema_ptr schema,
    auto json_column_placeholder = ::make_shared<column_identifier>("", true);
    auto prepared_json_value = _json_value->prepare(db, "", ::make_shared<column_specification>("", "", json_column_placeholder, utf8_type));
    prepared_json_value->collect_marker_specification(bound_names);
-    return ::make_shared<cql3::statements::insert_prepared_json_statement>(bound_names->size(), schema, std::move(attrs), &stats.inserts, std::move(prepared_json_value));
+    return ::make_shared<cql3::statements::insert_prepared_json_statement>(bound_names->size(), schema, std::move(attrs), &stats.inserts, std::move(prepared_json_value), _default_unset);
 }

 update_statement::update_statement(            ::shared_ptr<cf_name> name,
--- a/cql3/statements/update_statement.hh
+++ b/cql3/statements/update_statement.hh
@@ -82,9 +82,10 @@ private:
 */
 class insert_prepared_json_statement : public update_statement {
    ::shared_ptr<term> _term;
+    bool _default_unset;
 public:
-    insert_prepared_json_statement(uint32_t bound_terms, schema_ptr s, std::unique_ptr<attributes> attrs, uint64_t* cql_stats_counter_ptr, ::shared_ptr<term> t)
-        : update_statement(statement_type::INSERT, bound_terms, s, std::move(attrs), cql_stats_counter_ptr), _term(t) {
+    insert_prepared_json_statement(uint32_t bound_terms, schema_ptr s, std::unique_ptr<attributes> attrs, uint64_t* cql_stats_counter_ptr, ::shared_ptr<term> t, bool default_unset)
+        : update_statement(statement_type::INSERT, bound_terms, s, std::move(attrs), cql_stats_counter_ptr), _term(t), _default_unset(default_unset) {
        _restrictions = ::make_shared<restrictions::statement_restrictions>(s, false);
    }
 private:
--- a/database.cc
+++ b/database.cc
@@ -76,6 +76,8 @@
 #include "sstables/compaction_manager.hh"
 #include "sstables/compaction_backlog_manager.hh"
 #include "sstables/progress_monitor.hh"
+#include "auth/common.hh"
+#include "tracing/trace_keyspace_helper.hh"

 #include "checked-file-impl.hh"
 #include "disk-error-handler.hh"
@@ -178,6 +180,18 @@ bool is_system_keyspace(const sstring& name) {
    return system_keyspaces.find(name) != system_keyspaces.end();
 }

+static const std::unordered_set<sstring> internal_keyspaces = {
+        db::system_distributed_keyspace::NAME,
+        db::system_keyspace::NAME,
+        db::schema_tables::NAME,
+        auth::meta::AUTH_KS,
+        tracing::trace_keyspace_helper::KEYSPACE_NAME
+};
+
+bool is_internal_keyspace(const sstring& name) {
+    return internal_keyspaces.find(name) != internal_keyspaces.end();
+}
+
 // Used for tests where the CF exists without a database object. We need to pass a valid
 // dirty_memory manager in that case.
 thread_local dirty_memory_manager default_dirty_memory_manager;
@@ -684,9 +698,11 @@ table::make_reader(schema_ptr s,
    return make_combined_reader(s, std::move(readers), fwd, fwd_mr);
 }

-sstables::shared_sstable
-table::make_streaming_sstable_for_write() {
+sstables::shared_sstable table::make_streaming_sstable_for_write(std::optional<sstring> subdir) {
    sstring dir = _config.datadir;
+    if (subdir) {
+        dir += "/" + *subdir;
+    }
    auto newtab = sstables::make_sstable(_schema,
            dir, calculate_generation_for_new_table(),
            get_highest_supported_format(),
@@ -826,7 +842,11 @@ void table::add_sstable(sstables::shared_sstable sstable, const std::vector<unsi
    new_sstables->insert(sstable);
    _sstables = std::move(new_sstables);
    update_stats_for_new_sstable(sstable->bytes_on_disk(), shards_for_the_sstable);
-    _compaction_strategy.get_backlog_tracker().add_sstable(sstable);
+    if (sstable->is_staging()) {
+        _sstables_staging.emplace(sstable->generation(), sstable);
+    } else {
+        _compaction_strategy.get_backlog_tracker().add_sstable(sstable);
+    }
 }

 future<>
@@ -1082,12 +1102,14 @@ table::start() {
 future<>
 table::stop() {
    return _async_gate.close().then([this] {
-        return when_all(_memtables->request_flush(), _streaming_memtables->request_flush()).discard_result().finally([this] {
-            return _compaction_manager.remove(this).then([this] {
-                // Nest, instead of using when_all, so we don't lose any exceptions.
-                return _streaming_flush_gate.close();
-            }).then([this] {
-                return _sstable_deletion_gate.close();
+        return when_all(await_pending_writes(), await_pending_reads()).discard_result().finally([this] {
+            return when_all(_memtables->request_flush(), _streaming_memtables->request_flush()).discard_result().finally([this] {
+                return _compaction_manager.remove(this).then([this] {
+                    // Nest, instead of using when_all, so we don't lose any exceptions.
+                    return _streaming_flush_gate.close();
+                }).then([this] {
+                    return _sstable_deletion_gate.close();
+                });
            });
        });
    });
@@ -1346,6 +1368,7 @@ table::on_compaction_completion(const std::vector<sstables::shared_sstable>& new

    // This is done in the background, so we can consider this compaction completed.
    seastar::with_gate(_sstable_deletion_gate, [this, sstables_to_remove] {
+       return with_semaphore(_sstable_deletion_sem, 1, [this, sstables_to_remove = std::move(sstables_to_remove)] {
        return sstables::delete_atomically(sstables_to_remove, *get_large_partition_handler()).then_wrapped([this, sstables_to_remove] (future<> f) {
            std::exception_ptr eptr;
            try {
@@ -1369,6 +1392,7 @@ table::on_compaction_completion(const std::vector<sstables::shared_sstable>& new
                return make_exception_future<>(eptr);
            }
            return make_ready_future<>();
+         });
        }).then([this] {
            // refresh underlying data source in row cache to prevent it from holding reference
            // to sstables files which were previously deleted.
@@ -1613,7 +1637,9 @@ std::vector<sstables::shared_sstable> table::select_sstables(const dht::partitio

 std::vector<sstables::shared_sstable> table::candidates_for_compaction() const {
    return boost::copy_range<std::vector<sstables::shared_sstable>>(*get_sstables()
-        | boost::adaptors::filtered([this] (auto& sst) { return !_sstables_need_rewrite.count(sst->generation()); }));
+            | boost::adaptors::filtered([this] (auto& sst) {
+        return !_sstables_need_rewrite.count(sst->generation()) && !_sstables_staging.count(sst->generation());
+    }));
 }

 std::vector<sstables::shared_sstable> table::sstables_need_rewrite() const {
@@ -1671,9 +1697,9 @@ future<> distributed_loader::open_sstable(distributed<database>& db, sstables::e
    // to distribute evenly the resource usage among all shards.

    return db.invoke_on(column_family::calculate_shard_from_sstable_generation(comps.generation),
-            [&db, comps = std::move(comps), func = std::move(func), pc] (database& local) {
+            [&db, comps = std::move(comps), func = std::move(func), &pc] (database& local) {

-        return with_semaphore(local.sstable_load_concurrency_sem(), 1, [&db, &local, comps = std::move(comps), func = std::move(func), pc] {
+        return with_semaphore(local.sstable_load_concurrency_sem(), 1, [&db, &local, comps = std::move(comps), func = std::move(func), &pc] {
            auto& cf = local.find_column_family(comps.ks, comps.cf);

            auto f = sstables::sstable::load_shared_components(cf.schema(), comps.sstdir, comps.generation, comps.version, comps.format, pc);
@@ -1969,6 +1995,12 @@ future<sstables::entry_descriptor> distributed_loader::probe_file(distributed<da
    }
    auto cf_sstable_open = [sstdir, comps, fname] (column_family& cf, sstables::foreign_sstable_open_info info) {
        cf.update_sstables_known_generation(comps.generation);
+        if (shared_sstable sst = cf.get_staging_sstable(comps.generation)) {
+            dblog.warn("SSTable {} is already present in staging/ directory. Moving from staging will be retried.", sst->get_filename());
+            return seastar::async([sst = std::move(sst), comps = std::move(comps)] () {
+                sst->move_to_new_dir_in_thread(comps.sstdir, comps.generation);
+            });
+        }
        {
            auto i = boost::range::find_if(*cf._sstables->all(), [gen = comps.generation] (sstables::shared_sstable sst) { return sst->generation() == gen; });
            if (i != cf._sstables->all()->end()) {
@@ -2154,9 +2186,6 @@ database::database(const db::config& cfg, database_config dbcfg)
        [this] {
            ++_stats->sstable_read_queue_overloaded;
            return std::make_exception_ptr(std::runtime_error("sstable inactive read queue overloaded"));
-        },
-        [this] {
-            return _querier_cache.evict_one();
        })
    // No timeouts or queue length limits - a failure here can kill an entire repair.
    // Trust the caller to limit concurrency.
@@ -2168,12 +2197,11 @@ database::database(const db::config& cfg, database_config dbcfg)
    , _version(empty_version)
    , _compaction_manager(make_compaction_manager(*_cfg, dbcfg))
    , _enable_incremental_backups(cfg.incremental_backups())
-    , _querier_cache(dbcfg.available_memory * 0.04)
+    , _querier_cache(_read_concurrency_sem, dbcfg.available_memory * 0.04)
    , _large_partition_handler(std::make_unique<db::cql_table_large_partition_handler>(_cfg->compaction_large_partition_warning_threshold_mb()*1024*1024))
    , _result_memory_limiter(dbcfg.available_memory / 10)
 {
    local_schema_registry().init(*this); // TODO: we're never unbound.
-    _compaction_manager->start();
    setup_metrics();

    _row_cache_tracker.set_compaction_scheduling_group(dbcfg.memory_compaction_scheduling_group);
@@ -2299,6 +2327,9 @@ database::setup_metrics() {
                       sm::description("Counts sstables that survived the clustering key filtering. "
                                       "High value indicates that bloom filter is not very efficient and still have to access a lot of sstables to get data.")),

+        sm::make_derive("dropped_view_updates", _cf_stats.dropped_view_updates,
+                       sm::description("Counts the number of view updates that have been dropped due to cluster overload. ")),
+
        sm::make_derive("total_writes", _stats->total_writes,
                       sm::description("Counts the total number of successful write operations performed by this shard.")),

@@ -2316,6 +2347,9 @@ database::setup_metrics() {
                       sm::description("Counts the total number of failed read operations. "
                                       "Add the total_reads to this value to get the total amount of reads issued on this shard.")),

+        sm::make_current_bytes("view_update_backlog", [this] { return get_view_update_backlog().current; },
+                       sm::description("Holds the current size in bytes of the pending view updates for all tables")),
+
        sm::make_derive("querier_cache_lookups", _querier_cache.get_stats().lookups,
                       sm::description("Counts querier cache lookups (paging queries)")),

@@ -2420,6 +2454,9 @@ database::setup_metrics() {
 }

 database::~database() {
+    _read_concurrency_sem.clear_inactive_reads();
+    _streaming_concurrency_sem.clear_inactive_reads();
+    _system_read_concurrency_sem.clear_inactive_reads();
 }

 void database::update_version(const utils::UUID& version) {
@@ -2450,6 +2487,8 @@ future<> distributed_loader::populate_keyspace(distributed<database>& db, sstrin
                auto sstdir = ks.column_family_directory(ksdir, cfname, uuid);
                dblog.info("Keyspace {}: Reading CF {} id={} version={}", ks_name, cfname, uuid, s->version());
                return ks.make_directory_for_column_family(cfname, uuid).then([&db, sstdir, uuid, ks_name, cfname] {
+                    return distributed_loader::populate_column_family(db, sstdir + "/staging", ks_name, cfname);
+                }).then([&db, sstdir, uuid, ks_name, cfname] {
                    return distributed_loader::populate_column_family(db, sstdir, ks_name, cfname);
                }).handle_exception([ks_name, cfname, sstdir](std::exception_ptr eptr) {
                    std::string msg =
@@ -2903,6 +2942,7 @@ keyspace::make_column_family_config(const schema& s, const db::config& db_config
    cfg.enable_metrics_reporting = db_config.enable_keyspace_column_family_metrics();
    cfg.large_partition_handler = lp_handler;
    cfg.view_update_concurrency_semaphore = _config.view_update_concurrency_semaphore;
+    cfg.view_update_concurrency_semaphore_limit = _config.view_update_concurrency_semaphore_limit;

    return cfg;
 }
@@ -2930,6 +2970,7 @@ keyspace::make_directory_for_column_family(const sstring& name, utils::UUID uuid
            io_check(recursive_touch_directory, cfdir).get();
        }
        io_check(touch_directory, cfdirs[0] + "/upload").get();
+        io_check(touch_directory, cfdirs[0] + "/staging").get();
    });
 }

@@ -3699,6 +3740,7 @@ database::make_keyspace_config(const keyspace_metadata& ksm) {
    cfg.enable_metrics_reporting = _cfg->enable_keyspace_column_family_metrics();

    cfg.view_update_concurrency_semaphore = &_view_update_concurrency_sem;
+    cfg.view_update_concurrency_semaphore_limit = max_memory_pending_view_updates();
    return cfg;
 }

@@ -3796,6 +3838,8 @@ database::stop() {
        return parallel_for_each(_column_families, [this] (auto& val_pair) {
            return val_pair.second->stop();
        });
+    }).then([this] {
+        return _view_update_concurrency_sem.wait(max_memory_pending_view_updates());
    }).then([this] {
        if (_commitlog != nullptr) {
            return _commitlog->release();
@@ -4051,6 +4095,7 @@ seal_snapshot(sstring jsondir) {

 future<> table::snapshot(sstring name) {
    return flush().then([this, name = std::move(name)]() {
+       return with_semaphore(_sstable_deletion_sem, 1, [this, name = std::move(name)]() {
        auto tables = boost::copy_range<std::vector<sstables::shared_sstable>>(*_sstables->all());
        return do_with(std::move(tables), [this, name](std::vector<sstables::shared_sstable> & tables) {
            auto jsondir = _config.datadir + "/snapshots/" + name;
@@ -4110,6 +4155,7 @@ future<> table::snapshot(sstring name) {
                });
            });
        });
+       });
    });
 }

@@ -4239,6 +4285,7 @@ future<> table::fail_streaming_mutations(utils::UUID plan_id) {
    _streaming_memtables_big.erase(it);
    return entry->flush_in_progress.close().then([this, entry] {
        for (auto&& sst : entry->sstables) {
+            sst.monitor->write_failed();
            sst.sstable->mark_for_deletion();
        }
    });
@@ -4417,6 +4464,14 @@ std::vector<view_ptr> table::affected_views(const schema_ptr& base, const mutati
    }));
 }

+static size_t memory_usage_of(const std::vector<frozen_mutation_and_schema>& ms) {
+    // Overhead of sending a view mutation, in terms of data structures used by the storage_proxy.
+    constexpr size_t base_overhead_bytes = 256;
+    return boost::accumulate(ms | boost::adaptors::transformed([] (const frozen_mutation_and_schema& m) {
+        return m.fm.representation().size();
+    }), size_t{base_overhead_bytes * ms.size()});
+}
+
 /**
 * Given some updates on the base table and the existing values for the rows affected by that update, generates the
 * mutations to be applied to the base table's views, and sends them to the paired view replicas.
@@ -4433,75 +4488,15 @@ std::vector<view_ptr> table::affected_views(const schema_ptr& base, const mutati
 future<> table::generate_and_propagate_view_updates(const schema_ptr& base,
        std::vector<view_ptr>&& views,
        mutation&& m,
-        flat_mutation_reader_opt existings,
-        db::timeout_clock::time_point timeout) const {
+        flat_mutation_reader_opt existings) const {
    auto base_token = m.token();
-    return db::view::generate_view_updates(base,
-                        std::move(views),
-                        flat_mutation_reader_from_mutations({std::move(m)}),
-                        std::move(existings)).then([this, timeout, base_token = std::move(base_token)] (auto&& updates) mutable {
-        return seastar::get_units(*_config.view_update_concurrency_semaphore, 1, timeout).then(
-                [this, base_token = std::move(base_token), updates = std::move(updates)] (auto units) mutable {
-            db::view::mutate_MV(std::move(base_token), std::move(updates), _view_stats).handle_exception([units = std::move(units)] (auto ignored) { });
-        });
-    });
-}
-
-/**
- * Given an update for the base table, calculates the set of potentially affected views,
- * generates the relevant updates, and sends them to the paired view replicas.
- */
-future<row_locker::lock_holder> table::push_view_replica_updates(const schema_ptr& s, const frozen_mutation& fm, db::timeout_clock::time_point timeout) const {
-    //FIXME: Avoid unfreezing here.
-    auto m = fm.unfreeze(s);
-    auto& base = schema();
-    m.upgrade(base);
-    auto views = affected_views(base, m);
-    if (views.empty()) {
-        return make_ready_future<row_locker::lock_holder>();
-    }
-    auto cr_ranges = db::view::calculate_affected_clustering_ranges(*base, m.decorated_key(), m.partition(), views);
-    if (cr_ranges.empty()) {
-        return generate_and_propagate_view_updates(base, std::move(views), std::move(m), { }, timeout).then([] {
-                // In this case we are not doing a read-before-write, just a
-                // write, so no lock is needed.
-                return make_ready_future<row_locker::lock_holder>();
-        });
-    }
-    // We read the whole set of regular columns in case the update now causes a base row to pass
-    // a view's filters, and a view happens to include columns that have no value in this update.
-    // Also, one of those columns can determine the lifetime of the base row, if it has a TTL.
-    auto columns = boost::copy_range<std::vector<column_id>>(
-            base->regular_columns() | boost::adaptors::transformed(std::mem_fn(&column_definition::id)));
-    query::partition_slice::option_set opts;
-    opts.set(query::partition_slice::option::send_partition_key);
-    opts.set(query::partition_slice::option::send_clustering_key);
-    opts.set(query::partition_slice::option::send_timestamp);
-    opts.set(query::partition_slice::option::send_ttl);
-    auto slice = query::partition_slice(
-            std::move(cr_ranges), { }, std::move(columns), std::move(opts), { }, cql_serialization_format::internal(), query::max_rows);
-    // Take the shard-local lock on the base-table row or partition as needed.
-    // We'll return this lock to the caller, which will release it after
-    // writing the base-table update.
-    future<row_locker::lock_holder> lockf = local_base_lock(base, m.decorated_key(), slice.default_row_ranges(), timeout);
-    return lockf.then([m = std::move(m), slice = std::move(slice), views = std::move(views), base, this, timeout] (row_locker::lock_holder lock) {
-      return do_with(
-        dht::partition_range::make_singular(m.decorated_key()),
-        std::move(slice),
-        std::move(m),
-        [base, views = std::move(views), lock = std::move(lock), this, timeout] (auto& pk, auto& slice, auto& m) mutable {
-            auto reader = this->make_reader(
-                base,
-                pk,
-                slice,
-                service::get_local_sstable_query_read_priority());
-            return this->generate_and_propagate_view_updates(base, std::move(views), std::move(m), std::move(reader), timeout).then([lock = std::move(lock)] () mutable {
-                // return the local partition/row lock we have taken so it
-                // remains locked until the caller is done modifying this
-                // partition/row and destroys the lock object.
-                return std::move(lock);
-            });
-      });
+    return db::view::generate_view_updates(
+            base,
+            std::move(views),
+            flat_mutation_reader_from_mutations({std::move(m)}),
+            std::move(existings)).then([this, base_token = std::move(base_token)] (std::vector<frozen_mutation_and_schema>&& updates) mutable {
+        auto units = seastar::consume_units(*_config.view_update_concurrency_semaphore, memory_usage_of(updates));
+        db::view::mutate_MV(std::move(base_token), std::move(updates), _view_stats, std::move(units)).handle_exception([] (auto ignored) { });
    });
 }

@@ -4606,8 +4601,17 @@ future<> table::populate_views(
            schema,
            std::move(views),
            std::move(reader),
-            { }).then([base_token = std::move(base_token), this] (auto&& updates) {
-        return db::view::mutate_MV(std::move(base_token), std::move(updates), _view_stats);
+            { }).then([base_token = std::move(base_token), this] (std::vector<frozen_mutation_and_schema>&& updates) mutable {
+        size_t update_size = memory_usage_of(updates);
+        size_t units_to_wait_for = std::min(_config.view_update_concurrency_semaphore_limit, update_size);
+        return seastar::get_units(*_config.view_update_concurrency_semaphore, units_to_wait_for).then(
+                [base_token = std::move(base_token),
+                 updates = std::move(updates),
+                 units_to_consume = update_size - units_to_wait_for,
+                 this] (db::timeout_semaphore_units&& units) mutable {
+            units.adopt(seastar::consume_units(*_config.view_update_concurrency_semaphore, units_to_consume));
+            return db::view::mutate_MV(std::move(base_token), std::move(updates), _view_stats, std::move(units));
+        });
    });
 }

--- a/database.hh
+++ b/database.hh
@@ -77,6 +77,7 @@
 #include <seastar/core/metrics_registration.hh>
 #include "tracing/trace_state.hh"
 #include "db/view/view.hh"
+#include "db/view/view_update_backlog.hh"
 #include "db/view/row_locking.hh"
 #include "lister.hh"
 #include "utils/phased_barrier.hh"
@@ -279,6 +280,9 @@ struct cf_stats {
    int64_t clustering_filter_fast_path_count = 0;
    // how many sstables survived the clustering key checks
    int64_t surviving_sstables_after_clustering_filter = 0;
+
+    // How many view updates were dropped due to overload.
+    int64_t dropped_view_updates = 0;
 };

 class cache_temperature {
@@ -298,6 +302,8 @@ public:
 class table;
 using column_family = table;

+class database_sstable_write_monitor;
+
 class table : public enable_lw_shared_from_this<table> {
 public:
    struct config {
@@ -323,6 +329,7 @@ public:
        bool enable_metrics_reporting = false;
        db::large_partition_handler* large_partition_handler;
        db::timeout_semaphore* view_update_concurrency_semaphore;
+        size_t view_update_concurrency_semaphore_limit;
    };
    struct no_commitlog {};
    struct stats {
@@ -395,7 +402,7 @@ private:
    // plan memtables and the resulting sstables are not made visible until
    // the streaming is complete.
    struct monitored_sstable {
-        std::unique_ptr<sstables::write_monitor> monitor;
+        std::unique_ptr<database_sstable_write_monitor> monitor;
        sstables::shared_sstable sstable;
    };

@@ -432,8 +439,15 @@ private:
    // but for correct compaction we need to start the compaction only after
    // reading all sstables.
    std::unordered_map<uint64_t, sstables::shared_sstable> _sstables_need_rewrite;
+    // sstables that should not be compacted (e.g. because they need to be used
+    // to generate view updates later)
+    std::unordered_map<uint64_t, sstables::shared_sstable> _sstables_staging;
    // Control background fibers waiting for sstables to be deleted
    seastar::gate _sstable_deletion_gate;
+    // This semaphore ensures that an operation like snapshot won't have its selected
+    // sstables deleted by compaction in parallel, a race condition which could
+    // easily result in failure.
+    seastar::semaphore _sstable_deletion_sem = {1};
    // There are situations in which we need to stop writing sstables. Flushers will take
    // the read lock, and the ones that wish to stop that process will take the write lock.
    rwlock _sstables_lock;
@@ -485,6 +499,11 @@ private:
    utils::phased_barrier _pending_reads_phaser;
 public:
    future<> add_sstable_and_update_cache(sstables::shared_sstable sst);
+    void move_sstable_from_staging_in_thread(sstables::shared_sstable sst);
+    sstables::shared_sstable get_staging_sstable(uint64_t generation) { // Looks up `generation` in _sstables_staging.
+        auto it = _sstables_staging.find(generation);
+        return it != _sstables_staging.end() ? it->second : nullptr; // mapped sstable, or nullptr when the generation is not in the map
+    }
 private:
    void update_stats_for_new_sstable(uint64_t disk_space_used_by_sstable, const std::vector<unsigned>& shards_for_the_sstable) noexcept;
    // Adds new sstable to the set of sstables
@@ -618,6 +637,14 @@ public:
            tracing::trace_state_ptr trace_state = nullptr,
            streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
            mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const;
+    flat_mutation_reader make_reader_excluding_sstable(schema_ptr schema,
+            sstables::shared_sstable sst,
+            const dht::partition_range& range,
+            const query::partition_slice& slice,
+            const io_priority_class& pc = default_priority_class(),
+            tracing::trace_state_ptr trace_state = nullptr,
+            streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
+            mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const;

    flat_mutation_reader make_reader(schema_ptr schema, const dht::partition_range& range = query::full_partition_range) const {
        auto& full_slice = schema->full_slice();
@@ -632,9 +659,13 @@ public:
    flat_mutation_reader make_streaming_reader(schema_ptr schema,
            const dht::partition_range_vector& ranges) const;

-    sstables::shared_sstable make_streaming_sstable_for_write();
+    sstables::shared_sstable make_streaming_sstable_for_write(std::optional<sstring> subdir = {});
+    sstables::shared_sstable make_streaming_staging_sstable() { // Shorthand for make_streaming_sstable_for_write() with the "staging" subdir.
+        return make_streaming_sstable_for_write("staging");
+    }

    mutation_source as_mutation_source() const;
+    mutation_source as_mutation_source_excluding(sstables::shared_sstable sst) const;

    void set_virtual_reader(mutation_source virtual_reader) {
        _virtual_reader = std::move(virtual_reader);
@@ -842,6 +873,8 @@ public:
    void clear_views();
    const std::vector<view_ptr>& views() const;
    future<row_locker::lock_holder> push_view_replica_updates(const schema_ptr& s, const frozen_mutation& fm, db::timeout_clock::time_point timeout) const;
+    future<row_locker::lock_holder> push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout) const;
+    future<row_locker::lock_holder> stream_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, sstables::shared_sstable excluded_sstable) const;
    void add_coordinator_read_latency(utils::estimated_histogram::duration latency);
    std::chrono::milliseconds get_coordinator_read_latency_percentile(double percentile);

@@ -859,13 +892,17 @@ public:
            dht::token base_token,
            flat_mutation_reader&&);

+    reader_concurrency_semaphore& read_concurrency_semaphore() { // Exposes the semaphore that _config.read_concurrency_semaphore points to.
+        return *_config.read_concurrency_semaphore; // dereferenced without a null check
+    }
+
 private:
+    future<row_locker::lock_holder> do_push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, mutation_source&& source) const;
    std::vector<view_ptr> affected_views(const schema_ptr& base, const mutation& update) const;
    future<> generate_and_propagate_view_updates(const schema_ptr& base,
            std::vector<view_ptr>&& views,
            mutation&& m,
-            flat_mutation_reader_opt existings,
-            db::timeout_clock::time_point timeout) const;
+            flat_mutation_reader_opt existings) const;

    mutable row_locker _row_locker;
    future<row_locker::lock_holder> local_base_lock(
@@ -1055,6 +1092,7 @@ public:
        seastar::scheduling_group streaming_scheduling_group;
        bool enable_metrics_reporting = false;
        db::timeout_semaphore* view_update_concurrency_semaphore = nullptr;
+        size_t view_update_concurrency_semaphore_limit;
    };
 private:
    std::unique_ptr<locator::abstract_replication_strategy> _replication_strategy;
@@ -1156,6 +1194,7 @@ private:
    static const size_t max_count_system_concurrent_reads{10};
    size_t max_memory_system_concurrent_reads() { return _dbcfg.available_memory * 0.02; };
    static constexpr size_t max_concurrent_sstable_loads() { return 3; }
+    size_t max_memory_pending_view_updates() const { return _dbcfg.available_memory * 0.1; } // a tenth of _dbcfg.available_memory, converted to size_t

    struct db_stats {
        uint64_t total_writes = 0;
@@ -1192,7 +1231,7 @@ private:

    semaphore _sstable_load_concurrency_sem{max_concurrent_sstable_loads()};

-    db::timeout_semaphore _view_update_concurrency_sem{100}; // Stand-in hack for #2538
+    db::timeout_semaphore _view_update_concurrency_sem{max_memory_pending_view_updates()};

    cache_tracker _row_cache_tracker;

@@ -1399,6 +1438,12 @@ public:
    std::unordered_set<sstring> get_initial_tokens();
    std::experimental::optional<gms::inet_address> get_replace_address();
    bool is_replacing();
+    reader_concurrency_semaphore& user_read_concurrency_sem() { // Hands out _read_concurrency_sem by reference.
+        return _read_concurrency_sem;
+    }
+    reader_concurrency_semaphore& streaming_read_concurrency_sem() { // Hands out _streaming_concurrency_sem by reference.
+        return _streaming_concurrency_sem;
+    }
    reader_concurrency_semaphore& system_keyspace_read_concurrency_sem() {
        return _system_read_concurrency_sem;
    }
@@ -1423,11 +1468,17 @@ public:
        return _querier_cache;
    }

+    db::view::update_backlog get_view_update_backlog() const { // Builds an update_backlog from the state of _view_update_concurrency_sem.
+        return {max_memory_pending_view_updates() - _view_update_concurrency_sem.current(), max_memory_pending_view_updates()}; // {limit minus the semaphore's current() value, limit}; NOTE(review): assumes current() never exceeds the limit - confirm
+    }
+
    friend class distributed_loader;
 };

 future<> update_schema_version_and_announce(distributed<service::storage_proxy>& proxy);

+bool is_internal_keyspace(const sstring& name);
+
 class distributed_loader {
 public:
    static void reshard(distributed<database>& db, sstring ks_name, sstring cf_name);
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -689,6 +689,8 @@ public:
        // but all previous write/flush pairs.
        return _pending_ops.run_with_ordered_post_op(rp, [this, size, off, buf = std::move(buf)]() mutable { ///////////////////////////////////////////////////
            auto view = fragmented_temporary_buffer::view(buf);
+            view.remove_suffix(buf.size_bytes() - size);
+            assert(size == view.size_bytes());
            return do_with(off, view, [&] (uint64_t& off, fragmented_temporary_buffer::view& view) {
                if (view.empty()) {
                    return make_ready_future<>();
@@ -1673,14 +1675,14 @@ const db::commitlog::config& db::commitlog::active_config() const {
 // No commit_io_check needed in the log reader since the database will fail
 // on error at startup if required
 future<std::unique_ptr<subscription<temporary_buffer<char>, db::replay_position>>>
-db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func next, position_type off, const db::extensions* exts) {
+db::commitlog::read_log_file(const sstring& filename, seastar::io_priority_class read_io_prio_class, commit_load_reader_func next, position_type off, const db::extensions* exts) {
    struct work {
    private:
-        file_input_stream_options make_file_input_stream_options() {
+        file_input_stream_options make_file_input_stream_options(seastar::io_priority_class read_io_prio_class) {
            file_input_stream_options fo;
            fo.buffer_size = db::commitlog::segment::default_size;
            fo.read_ahead = 10;
-            fo.io_priority_class = service::get_local_commitlog_priority();
+            fo.io_priority_class = read_io_prio_class;
            return fo;
        }
    public:
@@ -1699,8 +1701,8 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne
        bool header = true;
        bool failed = false;

-        work(file f, position_type o = 0)
-                : f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options())), start_off(o) {
+        work(file f, seastar::io_priority_class read_io_prio_class, position_type o = 0)
+                : f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options(read_io_prio_class))), start_off(o) {
        }
        work(work&&) = default;

@@ -1918,9 +1920,9 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne
        return fut;
    });

-    return fut.then([off, next](file f) {
+    return fut.then([off, next, read_io_prio_class] (file f) {
        f = make_checked_file(commit_error_handler, std::move(f));
-        auto w = make_lw_shared<work>(std::move(f), off);
+        auto w = make_lw_shared<work>(std::move(f), read_io_prio_class, off);
        auto ret = w->s.listen(next);

        w->s.started().then(std::bind(&work::read_file, w.get())).then([w] {
--- a/db/commitlog/commitlog.hh
+++ b/db/commitlog/commitlog.hh
@@ -355,7 +355,7 @@ public:
    };

    static future<std::unique_ptr<subscription<temporary_buffer<char>, replay_position>>> read_log_file(
-            const sstring&, commit_load_reader_func, position_type = 0, const db::extensions* = nullptr);
+            const sstring&, seastar::io_priority_class read_io_prio_class, commit_load_reader_func, position_type = 0, const db::extensions* = nullptr);
 private:
    commitlog(config);

--- a/db/commitlog/commitlog_entry.hh
+++ b/db/commitlog/commitlog_entry.hh
@@ -34,7 +34,8 @@ public:
    commitlog_entry(stdx::optional<column_mapping> mapping, frozen_mutation&& mutation)
        : _mapping(std::move(mapping)), _mutation(std::move(mutation)) { }
    const stdx::optional<column_mapping>& mapping() const { return _mapping; }
-    const frozen_mutation& mutation() const { return _mutation; }
+    const frozen_mutation& mutation() const & { return _mutation; } // lvalue entry: read-only reference to the stored mutation
+    frozen_mutation&& mutation() && { return std::move(_mutation); } // rvalue entry: lets the caller move the stored mutation out
 };

 class commitlog_entry_writer {
@@ -80,5 +81,6 @@ public:
    commitlog_entry_reader(const temporary_buffer<char>& buffer);

    const stdx::optional<column_mapping>& get_column_mapping() const { return _ce.mapping(); }
-    const frozen_mutation& mutation() const { return _ce.mutation(); }
+    const frozen_mutation& mutation() const & { return _ce.mutation(); } // lvalue reader: read-only reference to the mutation held by _ce
+    frozen_mutation&& mutation() && { return std::move(_ce).mutation(); } // rvalue reader: forwards to _ce's rvalue overload so the mutation can be moved out
 };
--- a/db/commitlog/commitlog_replayer.cc
+++ b/db/commitlog/commitlog_replayer.cc
@@ -58,6 +58,7 @@
 #include "converting_mutation_partition_applier.hh"
 #include "schema_registry.hh"
 #include "commitlog_entry.hh"
+#include "service/priority_manager.hh"

 static logging::logger rlogger("commitlog_replayer");

@@ -163,7 +164,7 @@ future<> db::commitlog_replayer::impl::init() {
                // Get all truncation records for the CF and initialize max rps if
                // present. Cannot do this on demand, as there may be no sstables to
                // mark the CF as "needed".
-                return db::system_keyspace::get_truncated_position(uuid).then([&map, &uuid](std::vector<db::replay_position> tpps) {
+                return db::system_keyspace::get_truncated_position(uuid).then([&map, uuid](std::vector<db::replay_position> tpps) {
                    for (auto& p : tpps) {
                        rlogger.trace("CF {} truncated at {}", uuid, p);
                        auto& pp = map[p.shard_id()][uuid];
@@ -223,7 +224,7 @@ db::commitlog_replayer::impl::recover(sstring file, const sstring& fname_prefix)
    auto s = make_lw_shared<stats>();
    auto& exts = _qp.local().db().local().get_config().extensions();

-    return db::commitlog::read_log_file(file,
+    return db::commitlog::read_log_file(file, service::get_local_commitlog_priority(),
            std::bind(&impl::process, this, s.get(), std::placeholders::_1,
                    std::placeholders::_2), p, &exts).then([](auto s) {
        auto f = s->done();
--- a/db/config.cc
+++ b/db/config.cc
@@ -102,6 +102,8 @@ db::config::config()
 db::config::~config()
 {}

+const sstring db::config::default_tls_priority("SECURE128:-VERS-TLS1.0");
+
 namespace utils {

 template<>
--- a/db/config.hh
+++ b/db/config.hh
@@ -155,6 +155,9 @@ public:
    val(hints_directory, sstring, "/var/lib/scylla/hints", Used,   \
            "The directory where hints files are stored if hinted handoff is enabled."   \
    )                                           \
+    val(view_hints_directory, sstring, "/var/lib/scylla/view_hints", Used,   \
+            "The directory where materialized-view updates are stored while a view replica is unreachable."   \
+    )                                           \
    val(saved_caches_directory, sstring, "/var/lib/scylla/saved_caches", Unused, \
            "The directory location where table key and row caches are stored."  \
    )                                                   \
@@ -453,7 +456,7 @@ public:
            "The maximum number of tombstones a query can scan before aborting."  \
    )   \
    /* Network timeout settings */  \
-    val(range_request_timeout_in_ms, uint32_t, 10000, Unused,     \
+    val(range_request_timeout_in_ms, uint32_t, 10000, Used,     \
            "The time in milliseconds that the coordinator waits for sequential or index scans to complete."  \
    )   \
    val(read_request_timeout_in_ms, uint32_t, 5000, Used,     \
@@ -472,7 +475,7 @@ public:
            "The time in milliseconds that the coordinator waits for write operations to complete.\n"  \
            "Related information: About hinted handoff writes"  \
    )   \
-    val(request_timeout_in_ms, uint32_t, 10000, Unused,     \
+    val(request_timeout_in_ms, uint32_t, 10000, Used,     \
            "The default timeout for other, miscellaneous operations.\n"  \
            "Related information: About hinted handoff writes"  \
    )   \
@@ -578,7 +581,7 @@ public:
    val(dynamic_snitch_update_interval_in_ms, uint32_t, 100, Unused,     \
            "The time interval for how often the snitch calculates node scores. Because score calculation is CPU intensive, be careful when reducing this interval."  \
    )   \
-    val(hinted_handoff_enabled, sstring, "false", Used,     \
+    val(hinted_handoff_enabled, sstring, "true", Used,     \
            "Enable or disable hinted handoff. To enable per data center, add data center list. For example: hinted_handoff_enabled: DC1,DC2. A hint indicates that the write needs to be replayed to an unavailable node. " \
            "Related information: About hinted handoff writes"  \
    )   \
@@ -621,7 +624,7 @@ public:
    val(thrift_framed_transport_size_in_mb, uint32_t, 15, Unused,     \
            "Frame size (maximum field length) for Thrift. The frame is the row or part of the row the application is inserting."  \
    )   \
-    val(thrift_max_message_length_in_mb, uint32_t, 16, Unused,     \
+    val(thrift_max_message_length_in_mb, uint32_t, 16, Used,     \
            "The maximum length of a Thrift message in megabytes, including all fields and internal Thrift overhead (1 byte of overhead for each frame). Message length is usually used in conjunction with batches. A frame length greater than or equal to 24 accommodates a batch with four inserts, each of which is 24 bytes. The required message length is greater than or equal to 24+24+24+24+4 (number of frames)."  \
    )   \
    /* Security properties */   \
@@ -739,7 +742,7 @@ public:
        " Performance is affected to some extent as a result. Useful to help debugging problems that may arise at another layers.") \
    val(cpu_scheduler, bool, true, Used, "Enable cpu scheduling") \
    val(view_building, bool, true, Used, "Enable view building; should only be set to false when the node is experience issues due to view building") \
-    val(enable_sstables_mc_format, bool, false, Used, "Enable SSTables 'mc' format to be used as the default file format; FOR TESTING PURPOSES ONLY - TO BE REMOVED BEFORE RELEASE") \
+    val(enable_sstables_mc_format, bool, false, Used, "Enable SSTables 'mc' format to be used as the default file format") \
    /* done! */

 #define _make_value_member(name, type, deflt, status, desc, ...)    \
@@ -753,6 +756,8 @@ public:
    add_options(boost::program_options::options_description_easy_init&);

    const db::extensions& extensions() const;
+
+    static const sstring default_tls_priority;
 private:
    template<typename T>
    struct log_legacy_value : public named_value<T, value_status::Used> {
--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -35,6 +35,7 @@
 #include "disk-error-handler.hh"
 #include "lister.hh"
 #include "db/timeout_clock.hh"
+#include "service/priority_manager.hh"

 using namespace std::literals::chrono_literals;

@@ -78,6 +79,9 @@ void manager::register_metrics(const sstring& group_name) {

        sm::make_derive("sent", _stats.sent,
                        sm::description("Number of sent hints.")),
+
+        sm::make_derive("discarded", _stats.discarded,
+                        sm::description("Number of hints that were discarded during sending (too old, schema changed, etc.).")),
    });
 }

@@ -95,6 +99,7 @@ future<> manager::start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr
        return compute_hints_dir_device_id();
    }).then([this] {
        _strorage_service_anchor->register_subscriber(this);
+        set_started();
    });
 }

@@ -105,7 +110,7 @@ future<> manager::stop() {
        _strorage_service_anchor->unregister_subscriber(this);
    }

-    _stopping = true;
+    set_stopping();

    return _draining_eps_gate.close().finally([this] {
        return parallel_for_each(_ep_managers, [] (auto& pair) {
@@ -277,7 +282,7 @@ inline bool manager::have_ep_manager(ep_key_type ep) const noexcept {
 }

 bool manager::store_hint(ep_key_type ep, schema_ptr s, lw_shared_ptr<const frozen_mutation> fm, tracing::trace_state_ptr tr_state) noexcept {
-    if (_stopping || !can_hint_for(ep)) {
+    if (stopping() || !started() || !can_hint_for(ep)) {
        manager_logger.trace("Can't store a hint to {}", ep);
        ++_stats.dropped;
        return false;
@@ -380,7 +385,7 @@ future<timespec> manager::end_point_hints_manager::sender::get_last_file_modific
    });
 }

-future<> manager::end_point_hints_manager::sender::do_send_one_mutation(mutation m, const std::vector<gms::inet_address>& natural_endpoints) noexcept {
+future<> manager::end_point_hints_manager::sender::do_send_one_mutation(frozen_mutation_and_schema m, const std::vector<gms::inet_address>& natural_endpoints) noexcept {
    return futurize_apply([this, m = std::move(m), &natural_endpoints] () mutable -> future<> {
        // The fact that we send with CL::ALL in both cases below ensures that new hints are not going
        // to be generated as a result of hints sending.
@@ -392,7 +397,8 @@ future<> manager::end_point_hints_manager::sender::do_send_one_mutation(mutation
            // FIXME: using 1h as infinite timeout. If a node is down, we should get an
            // unavailable exception.
            auto timeout = db::timeout_clock::now() + 1h;
-            return _proxy.mutate({std::move(m)}, consistency_level::ALL, timeout, nullptr);
+            //FIXME: Add required frozen_mutation overloads
+            return _proxy.mutate({m.fm.unfreeze(m.s)}, consistency_level::ALL, timeout, nullptr);
        }
    });
 }
@@ -418,21 +424,19 @@ bool manager::end_point_hints_manager::sender::can_send() noexcept {
    }
 }

-mutation manager::end_point_hints_manager::sender::get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf) {
+frozen_mutation_and_schema manager::end_point_hints_manager::sender::get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf) {
    hint_entry_reader hr(buf);
    auto& fm = hr.mutation();
    auto& cm = get_column_mapping(std::move(ctx_ptr), fm, hr);
-    auto& cf = _db.find_column_family(fm.column_family_id());
+    auto schema = _db.find_schema(fm.column_family_id());

-    if (cf.schema()->version() != fm.schema_version()) {
-        mutation m(cf.schema(), fm.decorated_key(*cf.schema()));
-        converting_mutation_partition_applier v(cm, *cf.schema(), m.partition());
+    if (schema->version() != fm.schema_version()) {
+        mutation m(schema, fm.decorated_key(*schema));
+        converting_mutation_partition_applier v(cm, *schema, m.partition());
        fm.partition().accept(cm, v);
-
-        return std::move(m);
-    } else {
-        return fm.unfreeze(cf.schema());
+        return {freeze(m), std::move(schema)};
    }
+    return {std::move(hr).mutation(), std::move(schema)};
 }

 const column_mapping& manager::end_point_hints_manager::sender::get_column_mapping(lw_shared_ptr<send_one_file_ctx> ctx_ptr, const frozen_mutation& fm, const hint_entry_reader& hr) {
@@ -502,7 +506,7 @@ bool manager::check_dc_for(ep_key_type ep) const noexcept {
 }

 void manager::drain_for(gms::inet_address endpoint) {
-    if (_stopping) {
+    if (stopping()) {
        return;
    }

@@ -543,6 +547,7 @@ manager::end_point_hints_manager::sender::sender(end_point_hints_manager& parent
    , _resource_manager(_shard_manager._resource_manager)
    , _proxy(local_storage_proxy)
    , _db(local_db)
+    , _hints_cpu_sched_group(_db.get_streaming_scheduling_group())
    , _gossiper(local_gossiper)
    , _file_update_mutex(_ep_manager.file_update_mutex())
 {}
@@ -555,6 +560,7 @@ manager::end_point_hints_manager::sender::sender(const sender& other, end_point_
    , _resource_manager(_shard_manager._resource_manager)
    , _proxy(other._proxy)
    , _db(other._db)
+    , _hints_cpu_sched_group(other._hints_cpu_sched_group)
    , _gossiper(other._gossiper)
    , _file_update_mutex(_ep_manager.file_update_mutex())
 {}
@@ -610,7 +616,10 @@ manager::end_point_hints_manager::sender::clock::duration manager::end_point_hin
 }

 void manager::end_point_hints_manager::sender::start() {
-    _stopped = seastar::async([this] {
+    seastar::thread_attributes attr;
+
+    attr.sched_group = _hints_cpu_sched_group;
+    _stopped = seastar::async(std::move(attr), [this] {
        manager_logger.trace("ep_manager({})::sender: started", end_point_key());
        while (!stopping()) {
            try {
@@ -630,10 +639,11 @@ void manager::end_point_hints_manager::sender::start() {
    });
 }

-future<> manager::end_point_hints_manager::sender::send_one_mutation(mutation m) {
-    keyspace& ks = _db.find_keyspace(m.schema()->ks_name());
+future<> manager::end_point_hints_manager::sender::send_one_mutation(frozen_mutation_and_schema m) {
+    keyspace& ks = _db.find_keyspace(m.s->ks_name());
    auto& rs = ks.get_replication_strategy();
-    std::vector<gms::inet_address> natural_endpoints = rs.get_natural_endpoints(m.token());
+    auto token = dht::global_partitioner().get_token(*m.s, m.fm.key(*m.s));
+    std::vector<gms::inet_address> natural_endpoints = rs.get_natural_endpoints(std::move(token));

    return do_send_one_mutation(std::move(m), natural_endpoints);
 }
@@ -651,8 +661,8 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
                    return make_ready_future<>();
                }

-                mutation m = this->get_mutation(ctx_ptr, buf);
-                gc_clock::duration gc_grace_sec = m.schema()->gc_grace_seconds();
+                auto m = this->get_mutation(ctx_ptr, buf);
+                gc_clock::duration gc_grace_sec = m.s->gc_grace_seconds();

                // The hint is too old - drop it.
                //
@@ -673,10 +683,13 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
            // ignore these errors and move on - probably this hint is too old and the KS/CF has been deleted...
            } catch (no_such_column_family& e) {
                manager_logger.debug("send_hints(): no_such_column_family: {}", e.what());
+                ++this->shard_stats().discarded;
            } catch (no_such_keyspace& e) {
                manager_logger.debug("send_hints(): no_such_keyspace: {}", e.what());
+                ++this->shard_stats().discarded;
            } catch (no_column_mapping& e) {
-                manager_logger.debug("send_hints(): {}: {}", fname, e.what());
+                manager_logger.debug("send_hints(): {} at {}: {}", fname, rp, e.what());
+                ++this->shard_stats().discarded;
            }
            return make_ready_future<>();
        }).finally([units = std::move(units), ctx_ptr] {});
@@ -690,10 +703,10 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
 bool manager::end_point_hints_manager::sender::send_one_file(const sstring& fname) {
    timespec last_mod = get_last_file_modification(fname).get0();
    gc_clock::duration secs_since_file_mod = std::chrono::seconds(last_mod.tv_sec);
-    lw_shared_ptr<send_one_file_ctx> ctx_ptr = make_lw_shared<send_one_file_ctx>();
+    lw_shared_ptr<send_one_file_ctx> ctx_ptr = make_lw_shared<send_one_file_ctx>(_last_schema_ver_to_column_mapping);

    try {
-        auto s = commitlog::read_log_file(fname, [this, secs_since_file_mod, &fname, ctx_ptr] (temporary_buffer<char> buf, db::replay_position rp) mutable {
+        auto s = commitlog::read_log_file(fname, service::get_local_streaming_read_priority(), [this, secs_since_file_mod, &fname, ctx_ptr] (temporary_buffer<char> buf, db::replay_position rp) mutable {
            // Check that we can still send the next hint. Don't try to send it if the destination host
            // is DOWN or if we have already failed to send some of the previous hints.
            if (!draining() && ctx_ptr->state.contains(send_state::segment_replay_failed)) {
@@ -747,6 +760,7 @@ bool manager::end_point_hints_manager::sender::send_one_file(const sstring& fnam

    // clear the replay position - we are going to send the next segment...
    _last_not_complete_rp = replay_position();
+    _last_schema_ver_to_column_mapping.clear();
    manager_logger.trace("send_one_file(): segment {} was sent in full and deleted", fname);
    return true;
 }
@@ -759,7 +773,7 @@ void manager::end_point_hints_manager::sender::send_hints_maybe() noexcept {
    int replayed_segments_count = 0;

    try {
-        while (have_segments()) {
+        while (replay_allowed() && have_segments()) {
            if (!send_one_file(*_segments_to_replay.begin())) {
                break;
            }
@@ -784,14 +798,24 @@ void manager::end_point_hints_manager::sender::send_hints_maybe() noexcept {
    manager_logger.trace("send_hints(): we handled {} segments", replayed_segments_count);
 }

+template<typename Func> // Func: callable as f(lister::path, directory_entry, <number parsed from the entry name>) returning future<>
+static future<> scan_for_hints_dirs(const sstring& hints_directory, Func&& f) { // runs f on each sub-directory of hints_directory whose name std::stoi can parse
+    return lister::scan_dir(hints_directory, { directory_entry_type::directory }, [f = std::forward<Func>(f)] (lister::path dir, directory_entry de) {
+        try {
+            return f(std::move(dir), directory_entry(de), std::stoi(de.name.c_str())); // pass a copy of de: argument evaluation order is unspecified, so moving de here could empty de.name before std::stoi (or the log below) reads it
+        } catch (std::invalid_argument& ex) {
+            manager_logger.debug("Ignore invalid directory {}", de.name); // typically std::stoi rejecting a non-numeric name; the entry is skipped
+            return make_ready_future<>();
+        }
+    });
+}
+
 // runs in seastar::async context
 manager::hints_segments_map manager::get_current_hints_segments(const sstring& hints_directory) {
    hints_segments_map current_hints_segments;

    // shards level
-    lister::scan_dir(hints_directory, { directory_entry_type::directory }, [&current_hints_segments] (lister::path dir, directory_entry de) {
-        unsigned shard_id = std::stoi(de.name.c_str());
-
+    scan_for_hints_dirs(hints_directory, [&current_hints_segments] (lister::path dir, directory_entry de, unsigned shard_id) {
        manager_logger.trace("shard_id = {}", shard_id);
        // IPs level
        return lister::scan_dir(dir / de.name.c_str(), { directory_entry_type::directory }, [&current_hints_segments, shard_id] (lister::path dir, directory_entry de) {
@@ -908,9 +932,7 @@ void manager::rebalance_segments_for(
 // runs in seastar::async context
 void manager::remove_irrelevant_shards_directories(const sstring& hints_directory) {
    // shards level
-    lister::scan_dir(hints_directory, { directory_entry_type::directory }, [] (lister::path dir, directory_entry de) {
-        unsigned shard_id = std::stoi(de.name.c_str());
-
+    scan_for_hints_dirs(hints_directory, [] (lister::path dir, directory_entry de, unsigned shard_id) {
        if (shard_id >= smp::count) {
            // IPs level
            return lister::scan_dir(dir / de.name.c_str(), { directory_entry_type::directory, directory_entry_type::regular }, lister::show_hidden::yes, [] (lister::path dir, directory_entry de) {
@@ -936,5 +958,15 @@ future<> manager::rebalance(sstring hints_directory) {
    });
 }

+// Stores both figures, then calls allow_hints() if backlog < max_backlog, else forbid_hints_for_eps_with_pending_hints().
+void manager::update_backlog(size_t backlog, size_t max_backlog) {
+    _max_backlog_size = max_backlog;
+    _backlog_size = backlog;
+    if (backlog >= max_backlog) {
+        return forbid_hints_for_eps_with_pending_hints();
+    }
+    allow_hints();
+}
+
 }
 }
--- a/db/hints/manager.hh
+++ b/db/hints/manager.hh
@@ -59,6 +59,7 @@ private:
        uint64_t errors = 0;
        uint64_t dropped = 0;
        uint64_t sent = 0;
+        uint64_t discarded = 0;
    };

    // map: shard -> segments
@@ -69,6 +70,8 @@ private:
    class drain_tag {};
    using drain = seastar::bool_class<drain_tag>;

+    friend class space_watchdog;
+
 public:
    class end_point_hints_manager {
    public:
@@ -100,7 +103,10 @@ public:
                send_state::restart_segment>>;

            struct send_one_file_ctx {
-                std::unordered_map<table_schema_version, column_mapping> schema_ver_to_column_mapping;
+                send_one_file_ctx(std::unordered_map<table_schema_version, column_mapping>& last_schema_ver_to_column_mapping)
+                    : schema_ver_to_column_mapping(last_schema_ver_to_column_mapping)
+                {}
+                std::unordered_map<table_schema_version, column_mapping>& schema_ver_to_column_mapping;
                seastar::gate file_send_gate;
                std::unordered_set<db::replay_position> rps_set; // number of elements in this set is never going to be greater than the maximum send queue length
                send_state_set state;
@@ -109,6 +115,7 @@ public:
        private:
            std::list<sstring> _segments_to_replay;
            replay_position _last_not_complete_rp;
+            std::unordered_map<table_schema_version, column_mapping> _last_schema_ver_to_column_mapping;
            state_set _state;
            future<> _stopped;
            clock::time_point _next_flush_tp;
@@ -119,6 +126,7 @@ public:
            resource_manager& _resource_manager;
            service::storage_proxy& _proxy;
            database& _db;
+            seastar::scheduling_group _hints_cpu_sched_group;
            gms::gossiper& _gossiper;
            seastar::shared_mutex& _file_update_mutex;

@@ -179,6 +187,10 @@ public:
                return _state.contains(state::stopping);
            }

+            bool replay_allowed() const noexcept {
+                return _ep_manager.replay_allowed();
+            }
+
            /// \brief Try to send one hint read from the file.
            ///  - Limit the maximum memory size of hints "in the air" and the maximum total number of hints "in the air".
            ///  - Discard the hints that are older than the grace seconds value of the corresponding table.
@@ -210,7 +222,7 @@ public:
            /// \param ctx_ptr pointer to the send context
            /// \param buf hints file entry
            /// \return The mutation object representing the original mutation stored in the hints file.
-            mutation get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf);
+            frozen_mutation_and_schema get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf);

            /// \brief Get a reference to the column_mapping object for a given frozen mutation.
            /// \param ctx_ptr pointer to the send context
@@ -227,13 +239,13 @@ public:
            /// \param m mutation to send
            /// \param natural_endpoints current replicas for the given mutation
            /// \return future that resolves when the operation is complete
-            future<> do_send_one_mutation(mutation m, const std::vector<gms::inet_address>& natural_endpoints) noexcept;
+            future<> do_send_one_mutation(frozen_mutation_and_schema m, const std::vector<gms::inet_address>& natural_endpoints) noexcept;

            /// \brief Send one mutation out.
            ///
            /// \param m mutation to send
            /// \return future that resolves when the mutation sending processing is complete.
-            future<> send_one_mutation(mutation m);
+            future<> send_one_mutation(frozen_mutation_and_schema m);

            /// \brief Get the last modification time stamp for a given file.
            /// \param fname File name
@@ -328,6 +340,10 @@ public:
            return _hints_in_progress;
        }

+        bool replay_allowed() const noexcept {
+            return _shard_manager.replay_allowed();
+        }
+
        bool can_hint() const noexcept {
            return _state.contains(state::can_hint);
        }
@@ -393,6 +409,17 @@ public:
        }
    };

+    enum class state {
+        started,                // hinting is currently allowed (start() call is complete)
+        replay_allowed,         // replaying (hints sending) is allowed
+        stopping                // hinting is not allowed - stopping is in progress (stop() method has been called)
+    };
+
+    using state_set = enum_set<super_enum<state,
+        state::started,
+        state::replay_allowed,
+        state::stopping>>;
+
 private:
    using ep_key_type = typename end_point_hints_manager::key_type;
    using ep_managers_map_type = std::unordered_map<ep_key_type, end_point_hints_manager>;
@@ -403,6 +430,7 @@ public:
    static const std::chrono::seconds hint_file_write_timeout;

 private:
+    state_set _state;
    const boost::filesystem::path _hints_dir;
    dev_t _hints_dir_device_id = 0;

@@ -414,7 +442,7 @@ private:
    locator::snitch_ptr& _local_snitch_ptr;
    int64_t _max_hint_window_us = 0;
    database& _local_db;
-    bool _stopping = false;
+
    seastar::gate _draining_eps_gate; // gate used to control the progress of ep_managers stopping not in the context of manager::stop() call

    resource_manager& _resource_manager;
@@ -424,9 +452,14 @@ private:
    seastar::metrics::metric_groups _metrics;
    std::unordered_set<ep_key_type> _eps_with_pending_hints;

+    size_t _max_backlog_size = 1;
+    size_t _backlog_size = 0;
+
 public:
    manager(sstring hints_directory, std::vector<sstring> hinted_dcs, int64_t max_hint_window_ms, resource_manager&res_manager, distributed<database>& db);
    virtual ~manager();
+    manager(manager&&) = delete;
+    manager& operator=(manager&&) = delete;
    void register_metrics(const sstring& group_name);
    future<> start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
    future<> stop();
@@ -503,6 +536,18 @@ public:
    void forbid_hints();
    void forbid_hints_for_eps_with_pending_hints();

+    size_t max_backlog_size() const {
+        return _max_backlog_size;
+    }
+
+    size_t backlog_size() const {
+        return _backlog_size;
+    }
+
+    void allow_replaying() noexcept {
+        _state.set(state::replay_allowed);
+    }
+
    /// \brief Rebalance hints segments among all present shards.
    ///
    /// The difference between the number of segments on every two shard will be not greater than 1 after the
@@ -616,6 +661,28 @@ private:
    /// \param endpoint node that left the cluster
    void drain_for(gms::inet_address endpoint);

+    void update_backlog(size_t backlog, size_t max_backlog);
+
+    bool stopping() const noexcept {
+        return _state.contains(state::stopping);
+    }
+
+    void set_stopping() noexcept {
+        _state.set(state::stopping);
+    }
+
+    bool started() const noexcept {
+        return _state.contains(state::started);
+    }
+
+    void set_started() noexcept {
+        _state.set(state::started);
+    }
+
+    bool replay_allowed() const noexcept {
+        return _state.contains(state::replay_allowed);
+    }
+
 public:
    ep_managers_map_type::iterator find_ep_manager(ep_key_type ep_key) noexcept {
        return _ep_managers.find(ep_key);
--- a/db/hints/resource_manager.cc
+++ b/db/hints/resource_manager.cc
@@ -27,6 +27,7 @@
 #include "lister.hh"
 #include "disk-error-handler.hh"
 #include "seastarx.hh"
+#include <seastar/core/sleep.hh>

 namespace db {
 namespace hints {
@@ -65,19 +66,28 @@ const std::chrono::seconds space_watchdog::_watchdog_period = std::chrono::secon
 space_watchdog::space_watchdog(shard_managers_set& managers, per_device_limits_map& per_device_limits_map)
    : _shard_managers(managers)
    , _per_device_limits_map(per_device_limits_map)
-    , _timer([this] { on_timer(); })
 {}

 void space_watchdog::start() {
-    _timer.arm(timer_clock_type::now());
+    _started = seastar::async([this] {
+        while (!_as.abort_requested()) {
+            try {
+                on_timer();
+            } catch (...) {
+                resource_manager_logger.trace("space_watchdog: unexpected exception - stop all hints generators");
+                // Stop all hint generators if space_watchdog callback failed
+                for (manager& shard_manager : _shard_managers) {
+                    shard_manager.forbid_hints();
+                }
+            }
+            seastar::sleep_abortable(_watchdog_period, _as).get();
+        }
+    }).handle_exception_type([] (const seastar::sleep_aborted& ignored) { });
 }

 future<> space_watchdog::stop() noexcept {
-    try {
-        return _gate.close().finally([this] { _timer.cancel(); });
-    } catch (...) {
-        return make_exception_future<>(std::current_exception());
-    }
+    _as.request_abort();
+    return std::move(_started);
 }

 future<> space_watchdog::scan_one_ep_dir(boost::filesystem::path path, manager& shard_manager, ep_key_type ep_key) {
@@ -94,83 +104,62 @@ future<> space_watchdog::scan_one_ep_dir(boost::filesystem::path path, manager&
    });
 }

+// Called from the context of a seastar::thread.
 void space_watchdog::on_timer() {
-    with_gate(_gate, [this] {
-        return futurize_apply([this] {
-            _total_size = 0;
+    // The hints directories are organized as follows:
+    // <hints root>
+    //    |- <shard1 ID>
+    //    |  |- <EP1 address>
+    //    |     |- <hints file1>
+    //    |     |- <hints file2>
+    //    |     |- ...
+    //    |  |- <EP2 address>
+    //    |     |- ...
+    //    |  |-...
+    //    |- <shard2 ID>
+    //    |  |- ...
+    //    ...
+    //    |- <shardN ID>
+    //    |  |- ...
+    //

-            return do_for_each(_shard_managers, [this] (manager& shard_manager) {
-                shard_manager.clear_eps_with_pending_hints();
-
-                // The hints directories are organized as follows:
-                // <hints root>
-                //    |- <shard1 ID>
-                //    |  |- <EP1 address>
-                //    |     |- <hints file1>
-                //    |     |- <hints file2>
-                //    |     |- ...
-                //    |  |- <EP2 address>
-                //    |     |- ...
-                //    |  |-...
-                //    |- <shard2 ID>
-                //    |  |- ...
-                //    ...
-                //    |- <shardN ID>
-                //    |  |- ...
+    for (auto& per_device_limits : _per_device_limits_map | boost::adaptors::map_values) {
+        _total_size = 0;
+        for (manager& shard_manager : per_device_limits.managers) {
+            shard_manager.clear_eps_with_pending_hints();
+            lister::scan_dir(shard_manager.hints_dir(), {directory_entry_type::directory}, [this, &shard_manager] (lister::path dir, directory_entry de) {
+                _files_count = 0;
+                // Let's scan per-end-point directories and enumerate hints files...
                //
-                return lister::scan_dir(shard_manager.hints_dir(), {directory_entry_type::directory}, [this, &shard_manager] (lister::path dir, directory_entry de) {
-                    _files_count = 0;
-                    // Let's scan per-end-point directories and enumerate hints files...
-                    //
-                    // Let's check if there is a corresponding end point manager (may not exist if the corresponding DC is
-                    // not hintable).
-                    // If exists - let's take a file update lock so that files are not changed under our feet. Otherwise, simply
-                    // continue to enumeration - there is no one to change them.
-                    auto it = shard_manager.find_ep_manager(de.name);
-                    if (it != shard_manager.ep_managers_end()) {
-                        return with_lock(it->second.file_update_mutex(), [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)]() mutable {
-                             return scan_one_ep_dir(dir / ep_name.c_str(), shard_manager, ep_key_type(ep_name));
-                        });
-                    } else {
-                        return scan_one_ep_dir(dir / de.name.c_str(), shard_manager, ep_key_type(de.name));
-                    }
-                });
-            }).then([this] {
-                return do_for_each(_per_device_limits_map, [this](per_device_limits_map::value_type& per_device_limits_entry) {
-                    space_watchdog::per_device_limits& per_device_limits = per_device_limits_entry.second;
-
-                    size_t adjusted_quota = 0;
-                    size_t delta = boost::accumulate(per_device_limits.managers, 0, [] (size_t sum, manager& shard_manager) {
-                        return sum + shard_manager.ep_managers_size() * resource_manager::hint_segment_size_in_mb * 1024 * 1024;
+                // Let's check if there is a corresponding end point manager (may not exist if the corresponding DC is
+                // not hintable).
+                // If exists - let's take a file update lock so that files are not changed under our feet. Otherwise, simply
+                // continue to enumeration - there is no one to change them.
+                auto it = shard_manager.find_ep_manager(de.name);
+                if (it != shard_manager.ep_managers_end()) {
+                    return with_lock(it->second.file_update_mutex(), [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)]() mutable {
+                        return scan_one_ep_dir(dir / ep_name.c_str(), shard_manager, ep_key_type(ep_name));
                    });
-                    if (per_device_limits.max_shard_disk_space_size > delta) {
-                        adjusted_quota = per_device_limits.max_shard_disk_space_size - delta;
-                    }
+                } else {
+                    return scan_one_ep_dir(dir / de.name.c_str(), shard_manager, ep_key_type(de.name));
+                }
+            }).get();
+        }

-                    bool can_hint = _total_size < adjusted_quota;
-                    resource_manager_logger.trace("space_watchdog: total_size ({}) {} max_shard_disk_space_size ({})", _total_size, can_hint ? "<" : ">=", adjusted_quota);
-
-                    if (!can_hint) {
-                        for (manager& shard_manager : per_device_limits.managers) {
-                            shard_manager.forbid_hints_for_eps_with_pending_hints();
-                        }
-                    } else {
-                        for (manager& shard_manager : per_device_limits.managers) {
-                            shard_manager.allow_hints();
-                        }
-    }
-                });
-            });
-        }).handle_exception([this] (auto eptr) {
-            resource_manager_logger.trace("space_watchdog: unexpected exception - stop all hints generators");
-            // Stop all hint generators if space_watchdog callback failed
-            for (manager& shard_manager : _shard_managers) {
-                shard_manager.forbid_hints();
-            }
-        }).finally([this] {
-            _timer.arm(_watchdog_period);
+        // Adjust the quota to take into account the space we guarantee to every end point manager (summed in size_t: an int seed would truncate the byte count)
+        size_t adjusted_quota = 0;
+        size_t delta = boost::accumulate(per_device_limits.managers, size_t{0}, [] (size_t sum, manager& shard_manager) {
+            return sum + shard_manager.ep_managers_size() * resource_manager::hint_segment_size_in_mb * 1024 * 1024;
        });
-    });
+        if (per_device_limits.max_shard_disk_space_size > delta) {
+            adjusted_quota = per_device_limits.max_shard_disk_space_size - delta;
+        }
+
+        resource_manager_logger.trace("space_watchdog: consuming {}/{} bytes", _total_size, adjusted_quota);
+        for (manager& shard_manager : per_device_limits.managers) {
+            shard_manager.update_backlog(_total_size, adjusted_quota);
+        }
+    }
 }

 future<> resource_manager::start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr) {
@@ -183,6 +172,10 @@ future<> resource_manager::start(shared_ptr<service::storage_proxy> proxy_ptr, s
    });
 }

+void resource_manager::allow_replaying() noexcept {
+    for (manager& m : _shard_managers) { m.allow_replaying(); } // forward the call to every manager in _shard_managers
+}
+
 future<> resource_manager::stop() noexcept {
    return parallel_for_each(_shard_managers, [](manager& m) {
        return m.stop();
@@ -201,14 +194,18 @@ future<> resource_manager::prepare_per_device_limits() {
        auto it = _per_device_limits_map.find(device_id);
        if (it == _per_device_limits_map.end()) {
            return is_mountpoint(shard_manager.hints_dir().parent_path()).then([this, device_id, &shard_manager](bool is_mountpoint) {
-                // By default, give each group of managers 10% of the available disk space. Give each shard an equal share of the available space.
-                size_t max_size = boost::filesystem::space(shard_manager.hints_dir().c_str()).capacity / (10 * smp::count);
-                // If hints directory is a mountpoint, we assume it's on dedicated (i.e. not shared with data/commitlog/etc) storage.
-                // Then, reserve 90% of all space instead of 10% above.
-                if (is_mountpoint) {
-                    max_size *= 9;
+                auto [it, inserted] = _per_device_limits_map.emplace(device_id, space_watchdog::per_device_limits{});
+                // Since we possibly deferred, we need to recheck the _per_device_limits_map.
+                if (inserted) {
+                    // By default, give each group of managers 10% of the available disk space. Give each shard an equal share of the available space.
+                    it->second.max_shard_disk_space_size = boost::filesystem::space(shard_manager.hints_dir().c_str()).capacity / (10 * smp::count);
+                    // If hints directory is a mountpoint, we assume it's on dedicated (i.e. not shared with data/commitlog/etc) storage.
+                    // Then, reserve 90% of all space instead of 10% above.
+                    if (is_mountpoint) {
+                        it->second.max_shard_disk_space_size *= 9;
+                    }
                }
-                _per_device_limits_map.emplace(device_id, space_watchdog::per_device_limits{{std::ref(shard_manager)}, max_size});
+                it->second.managers.emplace_back(std::ref(shard_manager));
            });
        } else {
            it->second.managers.emplace_back(std::ref(shard_manager));
--- a/db/hints/resource_manager.hh
+++ b/db/hints/resource_manager.hh
@@ -22,6 +22,7 @@
 #pragma once

 #include <cstdint>
+#include <seastar/core/abort_source.hh>
 #include <seastar/core/semaphore.hh>
 #include <seastar/core/gate.hh>
 #include <seastar/core/memory.hh>
@@ -78,8 +79,8 @@ private:
    shard_managers_set& _shard_managers;
    per_device_limits_map& _per_device_limits_map;

-    seastar::gate _gate;
-    seastar::timer<timer_clock_type> _timer;
+    future<> _started = make_ready_future<>();
+    seastar::abort_source _as;
    int _files_count = 0;

 public:
@@ -137,6 +138,9 @@ public:
        , _space_watchdog(_shard_managers, _per_device_limits_map)
    {}

+    resource_manager(resource_manager&&) = delete;
+    resource_manager& operator=(resource_manager&&) = delete;
+
    future<semaphore_units<semaphore_default_exception_factory>> get_send_units_for(size_t buf_size);

    bool too_many_hints_in_progress() const {
@@ -156,6 +160,7 @@ public:
    }

    future<> start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
+    void allow_replaying() noexcept;
    future<> stop() noexcept;
    void register_manager(manager& m);
    future<> prepare_per_device_limits();
--- a/db/system_distributed_keyspace.cc
+++ b/db/system_distributed_keyspace.cc
@@ -87,7 +87,7 @@ future<> system_distributed_keyspace::start() {
        return do_with(all_tables(), [this] (std::vector<schema_ptr>& tables) {
            return do_for_each(tables, [this] (schema_ptr table) {
                return ignore_existing([this, table = std::move(table)] {
-                    return _mm.announce_new_column_family(std::move(table), false);
+                    return _mm.announce_new_column_family(std::move(table), api::min_timestamp, false);
                });
            });
        });
--- a/db/timeout_clock.hh
+++ b/db/timeout_clock.hh
@@ -28,5 +28,6 @@
 namespace db {
 using timeout_clock = seastar::lowres_clock;
 using timeout_semaphore = seastar::basic_semaphore<seastar::default_timeout_exception_factory, timeout_clock>;
+using timeout_semaphore_units = seastar::semaphore_units<seastar::default_timeout_exception_factory, timeout_clock>;
 static constexpr timeout_clock::time_point no_timeout = timeout_clock::time_point::max();
 }
--- a/db/view/node_view_update_backlog.hh
+++ b/db/view/node_view_update_backlog.hh
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2018 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "db/view/view_update_backlog.hh"
+
+#include <seastar/core/cacheline.hh>
+#include <seastar/core/lowres_clock.hh>
+
+#include <atomic>
+#include <chrono>
+#include <new>
+
+namespace db::view {
+
+/**
+ * An atomic view update backlog representation, safe to update from multiple shards.
+ * It is legal for a stale current max value to be returned.
+ */
+class node_update_backlog {
+    using clock = seastar::lowres_clock;
+    struct per_shard_backlog {
+        // Multiply by 2 to defeat the prefetcher
+        alignas(seastar::cache_line_size * 2) std::atomic<update_backlog> backlog = update_backlog::no_backlog();
+
+        update_backlog load() const {
+            return backlog.load(std::memory_order_relaxed);
+        }
+    };
+    std::vector<per_shard_backlog> _backlogs;
+    std::chrono::milliseconds _interval;
+    std::atomic<clock::time_point> _last_update;
+    std::atomic<update_backlog> _max;
+
+public:
+    explicit node_update_backlog(size_t shards, std::chrono::milliseconds interval)
+            : _backlogs(shards)
+            , _interval(interval)
+            , _last_update(clock::now() - _interval)
+            , _max(update_backlog::no_backlog()) {
+    }
+
+    update_backlog add_fetch(unsigned shard, update_backlog backlog);
+
+    // Exposed for testing only.
+    update_backlog load() const {
+        return _max.load(std::memory_order_relaxed);
+    }
+};
+
+}
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -58,6 +58,7 @@
 #include "cql3/util.hh"
 #include "db/view/view.hh"
 #include "db/view/view_builder.hh"
+#include "frozen_mutation.hh"
 #include "gms/inet_address.hh"
 #include "keys.hh"
 #include "locator/network_topology_strategy.hh"
@@ -226,10 +227,11 @@ public:
            , _updates(8, partition_key::hashing(*_view), partition_key::equality(*_view)) {
    }

-    void move_to(std::vector<mutation>& mutations) && {
+    void move_to(std::vector<frozen_mutation_and_schema>& mutations) && {
        auto& partitioner = dht::global_partitioner();
        std::transform(_updates.begin(), _updates.end(), std::back_inserter(mutations), [&, this] (auto&& m) {
-            return mutation(_view, partitioner.decorate_key(*_view, std::move(m.first)), std::move(m.second));
+            auto mut = mutation(_view, partitioner.decorate_key(*_view, std::move(m.first)), std::move(m.second));
+            return frozen_mutation_and_schema{freeze(mut), _view}; // copy, not move: _view is read again for every remaining entry
        });
    }

@@ -627,7 +629,7 @@ public:
            , _now(gc_clock::now()) {
    }

-    future<std::vector<mutation>> build();
+    future<std::vector<frozen_mutation_and_schema>> build();

 private:
    void generate_update(clustering_row&& update, stdx::optional<clustering_row>&& existing);
@@ -664,7 +666,7 @@ private:
    }
 };

-future<std::vector<mutation>> view_update_builder::build() {
+future<std::vector<frozen_mutation_and_schema>> view_update_builder::build() {
    return advance_all().then([this] (auto&& ignored) {
        assert(_update && _update->is_partition_start());
        _key = std::move(std::move(_update)->as_partition_start().key().key());
@@ -679,7 +681,7 @@ future<std::vector<mutation>> view_update_builder::build() {
            });
        });
    }).then([this] {
-        std::vector<mutation> mutations;
+        std::vector<frozen_mutation_and_schema> mutations;
        for (auto&& update : _view_updates) {
            std::move(update).move_to(mutations);
        }
@@ -787,7 +789,7 @@ future<stop_iteration> view_update_builder::on_results() {
    return stop();
 }

-future<std::vector<mutation>> generate_view_updates(
+future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
        const schema_ptr& base,
        std::vector<view_ptr>&& views_to_update,
        flat_mutation_reader&& updates,
@@ -924,16 +926,35 @@ get_view_natural_endpoint(const sstring& keyspace_name,
 // to a modification of a single base partition, and apply them to the
 // appropriate paired replicas. This is done asynchronously - we do not wait
 // for the writes to complete.
-// FIXME: I dropped a lot of parameters the Cassandra version had,
-// we may need them back: writeCommitLog, baseComplete, queryStartNanoTime.
-future<> mutate_MV(const dht::token& base_token, std::vector<mutation> mutations, db::view::stats& stats)
+future<> mutate_MV(
+        const dht::token& base_token,
+        std::vector<frozen_mutation_and_schema> view_updates,
+        db::view::stats& stats,
+        db::timeout_semaphore_units pending_view_updates)
 {
    auto fs = std::make_unique<std::vector<future<>>>();
-    for (auto& mut : mutations) {
-        auto view_token = mut.token();
-        auto keyspace_name = mut.schema()->ks_name();
+    fs->reserve(view_updates.size());
+    auto& partitioner = dht::global_partitioner();
+    for (frozen_mutation_and_schema& mut : view_updates) {
+        auto view_token = partitioner.get_token(*mut.s, mut.fm.key(*mut.s));
+        auto& keyspace_name = mut.s->ks_name();
        auto paired_endpoint = get_view_natural_endpoint(keyspace_name, base_token, view_token);
        auto pending_endpoints = service::get_local_storage_service().get_token_metadata().pending_endpoints_for(view_token, keyspace_name);
+        auto maybe_account_failure = [&stats, units = pending_view_updates.split(mut.fm.representation().size())] (
+                future<>&& f,
+                gms::inet_address target,
+                bool is_local,
+                size_t remotes) {
+            if (f.failed()) {
+                stats.view_updates_failed_local += is_local;
+                stats.view_updates_failed_remote += remotes;
+                auto ep = f.get_exception();
+                vlogger.error("Error applying view update to {}: {}", target, ep);
+                return make_exception_future<>(std::move(ep));
+            } else {
+                return make_ready_future<>();
+            }
+        };
        if (paired_endpoint) {
            // When paired endpoint is the local node, we can just apply
            // the mutation locally, unless there are pending endpoints, in
@@ -951,10 +972,16 @@ future<> mutate_MV(const dht::token& base_token, std::vector<mutation> mutations
                // do not wait for it to complete.
                // Note also that mutate_locally(mut) copies mut (in
                // frozen form) so don't need to increase its lifetime.
-                fs->push_back(service::get_local_storage_proxy().mutate_locally(mut).handle_exception([&stats] (auto ep) {
-                    vlogger.error("Error applying local view update: {}", ep);
-                    stats.view_updates_failed_local++;
-                    return make_exception_future<>(std::move(ep));
+                // send_to_endpoint() below updates statistics on pending
+                // writes but mutate_locally() doesn't, so we need to do that here.
+                ++stats.writes;
+                auto mut_ptr = std::make_unique<frozen_mutation>(std::move(mut.fm));
+                fs->push_back(service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr).then_wrapped(
+                        [&stats,
+                         maybe_account_failure = std::move(maybe_account_failure),
+                         mut_ptr = std::move(mut_ptr)] (future<>&& f) {
+                    --stats.writes;
+                    return maybe_account_failure(std::move(f), utils::fb_utilities::get_broadcast_address(), true, 0);
                }));
            } else {
                vlogger.debug("Sending view update to endpoint {}, with pending endpoints = {}", *paired_endpoint, pending_endpoints);
@@ -965,14 +992,17 @@ future<> mutate_MV(const dht::token& base_token, std::vector<mutation> mutations
                // to send the update there. Currently, we do this from *each* of
                // the base replicas, but this is probably excessive - see
                // See https://issues.apache.org/jira/browse/CASSANDRA-14262/
-                fs->push_back(service::get_local_storage_proxy().send_to_endpoint(std::move(mut), *paired_endpoint, std::move(pending_endpoints), db::write_type::VIEW, stats)
-                        .handle_exception([paired_endpoint, is_endpoint_local, updates_pushed_remote, &stats] (auto ep) {
-                            stats.view_updates_failed_local += is_endpoint_local;
-                            stats.view_updates_failed_remote += updates_pushed_remote;
-                            vlogger.error("Error applying view update to {}: {}", *paired_endpoint, ep);
-                            return make_exception_future<>(std::move(ep));
-                        })
-                );
+                fs->push_back(service::get_local_storage_proxy().send_to_endpoint(
+                        std::move(mut),
+                        *paired_endpoint,
+                        std::move(pending_endpoints),
+                        db::write_type::VIEW, stats).then_wrapped(
+                                [paired_endpoint,
+                                 is_endpoint_local,
+                                 updates_pushed_remote,
+                                 maybe_account_failure = std::move(maybe_account_failure)] (future<>&& f) mutable {
+                    return maybe_account_failure(std::move(f), std::move(*paired_endpoint), is_endpoint_local, updates_pushed_remote);
+                }));
            }
        } else if (!pending_endpoints.empty()) {
            // If there is no paired endpoint, it means there's a range movement going on (decommission or move),
@@ -992,10 +1022,11 @@ future<> mutate_MV(const dht::token& base_token, std::vector<mutation> mutations
                    std::move(mut),
                    target,
                    std::move(pending_endpoints),
-                    db::write_type::VIEW).handle_exception([target, updates_pushed_remote, &stats] (auto ep) {
-                stats.view_updates_failed_remote += updates_pushed_remote;
-                vlogger.error("Error applying view update to {}: {}", target, ep);
-                return make_exception_future<>(std::move(ep));
+                    db::write_type::VIEW).then_wrapped(
+                            [target,
+                             updates_pushed_remote,
+                             maybe_account_failure = std::move(maybe_account_failure)] (future<>&& f) {
+                return maybe_account_failure(std::move(f), std::move(target), false, updates_pushed_remote);
            }));
        }
    }
@@ -1226,6 +1257,20 @@ future<> view_builder::calculate_shard_build_step(
        }
    }

+    // All shards need to arrive at the same decisions on whether or not to
+    // restart a view build at some common token (reshard), and which token
+    // to restart at. So we need to wait until all shards have read the view
+    // build statuses before they can all proceed to make the (same) decision.
+    // If we don't synchronize here, a fast shard may make a decision, start
+    // building and finish a build step - before the slowest shard even read
+    // the view build information.
+    container().invoke_on(0, [] (view_builder& builder) {
+        if (++builder._shards_finished_read == smp::count) {
+            builder._shards_finished_read_promise.set_value();
+        }
+        return builder._shards_finished_read_promise.get_shared_future();
+    }).get();
+
    std::unordered_set<utils::UUID> loaded_views;
    if (view_build_status_per_shard.size() != smp::count) {
        reshard(std::move(view_build_status_per_shard), loaded_views);
@@ -1419,7 +1464,16 @@ private:
    built_views _built_views;
    std::vector<view_ptr> _views_to_build;
    std::deque<mutation_fragment> _fragments;
-
+    // The compact_for_query<> that feeds this consumer is already configured
+    // to feed us up to view_builder::batchsize (128) rows and not an entire
+    // partition. Still, if rows contain large blobs, saving 128 of them in
+    // _fragments may be too much. So we want to track _fragments' memory
+    // usage, and flush the _fragments if it has grown too large.
+    // Additionally, limiting _fragments' size also solves issue #4213:
+    // A single view mutation can be as large as the size of the base rows
+    // used to build it, and we cannot allow its serialized size to grow
+    // beyond our limit on mutation size (by default 32 MB).
+    size_t _fragments_memory_usage = 0;
 public:
    consumer(view_builder& builder, build_step& step)
            : _builder(builder)
@@ -1482,7 +1536,15 @@ public:
            return stop_iteration::yes;
        }

+        _fragments_memory_usage += cr.memory_usage(*_step.base->schema());
        _fragments.push_back(std::move(cr));
+        if (_fragments_memory_usage > 1024*1024) {
+            // Although we have not yet completed the batch of base rows that
+            // compact_for_query<> planned for us (view_builder::batchsize),
+            // we've still collected enough rows to reach sizeable memory use,
+            // so let's flush these rows now.
+            flush_fragments();
+        }
        return stop_iteration::no;
    }

@@ -1490,7 +1552,7 @@ public:
        return stop_iteration::no;
    }

-    stop_iteration consume_end_of_partition() {
+    void flush_fragments() {
        _builder._as.check();
        if (!_fragments.empty()) {
            _fragments.push_front(partition_start(_step.current_key, tombstone()));
@@ -1499,7 +1561,12 @@ public:
                    _step.current_token(),
                    make_flat_mutation_reader_from_fragments(_step.base->schema(), std::move(_fragments))).get();
            _fragments.clear();
+            _fragments_memory_usage = 0;
        }
+    }
+
+    stop_iteration consume_end_of_partition() {
+        flush_fragments();
        return stop_iteration(_step.build_status.empty());
    }

@@ -1591,12 +1658,29 @@ future<> view_builder::maybe_mark_view_as_built(view_ptr view, dht::token next_t
    });
 }

-future<> view_builder::wait_until_built(const sstring& ks_name, const sstring& view_name, lowres_clock::time_point timeout) {
-    return container().invoke_on(0, [ks_name, view_name, timeout] (view_builder& builder) {
+future<> view_builder::wait_until_built(const sstring& ks_name, const sstring& view_name) {
+    return container().invoke_on(0, [ks_name, view_name] (view_builder& builder) {
        auto v = std::pair(std::move(ks_name), std::move(view_name));
-        return builder._build_notifiers[std::move(v)].get_shared_future(timeout);
+        return builder._build_notifiers[std::move(v)].get_shared_future();
    });
 }

+update_backlog node_update_backlog::add_fetch(unsigned shard, update_backlog backlog) {
+    _backlogs[shard].backlog.store(backlog, std::memory_order_relaxed);
+    auto now = clock::now();
+    if (now >= _last_update.load(std::memory_order_relaxed) + _interval) {
+        _last_update.store(now, std::memory_order_relaxed);
+        auto new_max = boost::accumulate(
+                _backlogs,
+                update_backlog::no_backlog(),
+                [] (const update_backlog& lhs, const per_shard_backlog& rhs) {
+                    return std::max(lhs, rhs.load());
+                });
+        _max.store(new_max, std::memory_order_relaxed);
+        return new_max;
+    }
+    return std::max(backlog, _max.load(std::memory_order_relaxed));
+}
+
 } // namespace view
 } // namespace db
--- a/db/view/view.hh
+++ b/db/view/view.hh
@@ -30,6 +30,10 @@
 #include "flat_mutation_reader.hh"
 #include "stdx.hh"

+#include <seastar/core/semaphore.hh>
+
+class frozen_mutation_and_schema;
+
 namespace db {

 namespace view {
@@ -90,7 +94,7 @@ bool matches_view_filter(const schema& base, const view_info& view, const partit

 bool clustering_prefix_matches(const schema& base, const partition_key& key, const clustering_key_prefix& ck);

-future<std::vector<mutation>> generate_view_updates(
+future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
        const schema_ptr& base,
        std::vector<view_ptr>&& views_to_update,
        flat_mutation_reader&& updates,
@@ -102,7 +106,11 @@ query::clustering_row_ranges calculate_affected_clustering_ranges(
        const mutation_partition& mp,
        const std::vector<view_ptr>& views);

-future<> mutate_MV(const dht::token& base_token, std::vector<mutation> mutations, db::view::stats& stats);
+future<> mutate_MV(
+        const dht::token& base_token,
+        std::vector<frozen_mutation_and_schema> view_updates,
+        db::view::stats& stats,
+        db::timeout_semaphore_units pending_view_updates);

 /**
 * create_virtual_column() adds a "virtual column" to a schema builder.
--- a/db/view/view_builder.hh
+++ b/db/view/view_builder.hh
@@ -151,6 +151,10 @@ class view_builder final : public service::migration_listener::only_view_notific
    future<> _started = make_ready_future<>();
    // Used to coordinate between shards the conclusion of the build process for a particular view.
    std::unordered_set<utils::UUID> _built_views;
+    // Counter and promise (both on shard 0 only!) that let us wait until all
+    // shards have read the view build statuses.
+    unsigned _shards_finished_read = 0;
+    seastar::shared_promise<> _shards_finished_read_promise;
    // Used for testing.
    std::unordered_map<std::pair<sstring, sstring>, seastar::shared_promise<>, utils::tuple_hash> _build_notifiers;

@@ -178,7 +182,7 @@ public:
    virtual void on_drop_view(const sstring& ks_name, const sstring& view_name) override;

    // For tests
-    future<> wait_until_built(const sstring& ks_name, const sstring& view_name, lowres_clock::time_point timeout);
+    future<> wait_until_built(const sstring& ks_name, const sstring& view_name);

 private:
    build_step& get_or_create_build_step(utils::UUID);
--- a/db/view/view_update_backlog.hh
+++ b/db/view/view_update_backlog.hh
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2018 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <limits>
+
+namespace db::view {
+
+/**
+ * The view update backlog represents the pending view data that a base replica
+ * maintains. It is the maximum of the memory backlog - how much memory pending
+ * view updates are consuming out of their allocated quota - and the disk
+ * backlog - how much view hints are consuming. The size of a backlog is relative
+ * to its maximum size.
+ */
+struct update_backlog {
+    size_t current;
+    size_t max;
+
+    float relative_size() const {
+        return float(current) / float(max);
+    }
+
+    friend bool operator==(const update_backlog& lhs, const update_backlog& rhs) {
+        return lhs.relative_size() == rhs.relative_size();
+    }
+
+    friend bool operator<(const update_backlog& lhs, const update_backlog& rhs) {
+        return lhs.relative_size() < rhs.relative_size();
+    }
+
+    friend bool operator!=(const update_backlog& lhs, const update_backlog& rhs) {
+        return !(lhs == rhs);
+    }
+
+    friend bool operator<=(const update_backlog& lhs, const update_backlog& rhs) {
+        return !(rhs < lhs);
+    }
+
+    friend bool operator>(const update_backlog& lhs, const update_backlog& rhs) {
+        return rhs < lhs;
+    }
+
+    friend bool operator>=(const update_backlog& lhs, const update_backlog& rhs) {
+        return !(lhs < rhs);
+    }
+
+    static update_backlog no_backlog() {
+        return update_backlog{0, std::numeric_limits<size_t>::max()};
+    }
+};
+
+}
--- a/db/view/view_update_from_staging_generator.cc
+++ b/db/view/view_update_from_staging_generator.cc
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2018 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "view_update_from_staging_generator.hh"
+
+namespace db::view {
+
+future<> view_update_from_staging_generator::start() {
+    _started = seastar::async([this]() mutable {
+        while (!_as.abort_requested()) {
+            if (_sstables_with_tables.empty()) {
+                _pending_sstables.wait().get();
+            }
+            while (!_sstables_with_tables.empty()) {
+                auto& entry = _sstables_with_tables.front();
+                schema_ptr s = entry.t->schema();
+                flat_mutation_reader staging_sstable_reader = entry.sst->read_rows_flat(s);
+                auto result = staging_sstable_reader.consume_in_thread(view_updating_consumer(s, _proxy, entry.sst, _as), db::no_timeout);
+                if (result == stop_iteration::yes) {
+                    break;
+                }
+                entry.t->move_sstable_from_staging_in_thread(entry.sst);
+                _registration_sem.signal();
+                _sstables_with_tables.pop_front();
+            }
+        }
+    });
+    return make_ready_future<>();
+}
+
+future<> view_update_from_staging_generator::stop() {
+    _as.request_abort();
+    _pending_sstables.signal();
+    return std::move(_started).then([this] {
+        _registration_sem.broken();
+    });
+}
+
+future<> view_update_from_staging_generator::register_staging_sstable(sstables::shared_sstable sst, lw_shared_ptr<table> table) {
+    if (_as.abort_requested()) {
+        return make_ready_future<>();
+    }
+    _sstables_with_tables.emplace_back(std::move(sst), std::move(table));
+    _pending_sstables.signal();
+    return _registration_sem.wait(1);
+}
+
+}
--- a/db/view/view_update_from_staging_generator.hh
+++ b/db/view/view_update_from_staging_generator.hh
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2018 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "database.hh"
+#include "sstables/sstables.hh"
+#include "db/view/view_updating_consumer.hh"
+
+#include <seastar/core/abort_source.hh>
+#include <seastar/core/condition-variable.hh>
+#include <seastar/core/semaphore.hh>
+
+namespace db::view {
+
+class view_update_from_staging_generator {
+    static constexpr size_t registration_queue_size = 5;
+    database& _db;
+    service::storage_proxy& _proxy;
+    seastar::abort_source _as;
+    future<> _started = make_ready_future<>();
+    seastar::condition_variable _pending_sstables;
+    semaphore _registration_sem{registration_queue_size};
+    struct sstable_with_table {
+        sstables::shared_sstable sst;
+        lw_shared_ptr<table> t;
+        sstable_with_table(sstables::shared_sstable sst, lw_shared_ptr<table> t) : sst(sst), t(t) { }
+    };
+    std::deque<sstable_with_table> _sstables_with_tables;
+public:
+    view_update_from_staging_generator(database& db, service::storage_proxy& proxy) : _db(db), _proxy(proxy) { }
+
+    future<> start();
+    future<> stop();
+    future<> register_staging_sstable(sstables::shared_sstable sst, lw_shared_ptr<table> table);
+};
+
+}
--- a/db/view/view_updating_consumer.hh
+++ b/db/view/view_updating_consumer.hh
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2018 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "service/storage_proxy.hh"
+#include "dht/i_partitioner.hh"
+#include "schema.hh"
+#include "mutation_fragment.hh"
+#include "sstables/shared_sstable.hh"
+
+namespace db::view {
+
+/*
+ * A consumer that pushes materialized view updates for each consumed mutation.
+ * It is expected to be run in seastar::async threaded context through consume_in_thread()
+ */
+class view_updating_consumer {
+    schema_ptr _schema;
+    lw_shared_ptr<table> _table;
+    sstables::shared_sstable _excluded_sstable;
+    const seastar::abort_source& _as;
+    std::optional<mutation> _m;
+public:
+    view_updating_consumer(schema_ptr schema, service::storage_proxy& proxy, sstables::shared_sstable excluded_sstable, const seastar::abort_source& as)
+            : _schema(std::move(schema))
+            , _table(proxy.get_db().local().find_column_family(_schema->id()).shared_from_this())
+            , _excluded_sstable(excluded_sstable)
+            , _as(as)
+            , _m()
+    { }
+
+    void consume_new_partition(const dht::decorated_key& dk) {
+        _m = mutation(_schema, dk, mutation_partition(_schema));
+    }
+
+    void consume(tombstone t) {
+        _m->partition().apply(std::move(t));
+    }
+
+    stop_iteration consume(static_row&& sr) {
+        if (_as.abort_requested()) {
+            return stop_iteration::yes;
+        }
+        _m->partition().apply(*_schema, std::move(sr));
+        return stop_iteration::no;
+    }
+
+    stop_iteration consume(clustering_row&& cr) {
+        if (_as.abort_requested()) {
+            return stop_iteration::yes;
+        }
+        _m->partition().apply(*_schema, std::move(cr));
+        return stop_iteration::no;
+    }
+
+    stop_iteration consume(range_tombstone&& rt) {
+        if (_as.abort_requested()) {
+            return stop_iteration::yes;
+        }
+        _m->partition().apply(*_schema, std::move(rt));
+        return stop_iteration::no;
+    }
+
+    // Expected to be run in seastar::async threaded context (consume_in_thread())
+    stop_iteration consume_end_of_partition();
+
+    stop_iteration consume_end_of_stream() {
+        return stop_iteration(_as.abort_requested());
+    }
+};
+
+}
+
--- a/dht/boot_strapper.cc
+++ b/dht/boot_strapper.cc
@@ -49,22 +49,24 @@ namespace dht {
 future<> boot_strapper::bootstrap() {
    blogger.debug("Beginning bootstrap process: sorted_tokens={}", _token_metadata.sorted_tokens());

-    auto streamer = make_lw_shared<range_streamer>(_db, _token_metadata, _tokens, _address, "Bootstrap");
+    auto streamer = make_lw_shared<range_streamer>(_db, _token_metadata, _tokens, _address, "Bootstrap", streaming::stream_reason::bootstrap);
    streamer->add_source_filter(std::make_unique<range_streamer::failure_detector_source_filter>(gms::get_local_failure_detector()));
-    for (const auto& keyspace_name : _db.local().get_non_system_keyspaces()) {
+    auto keyspaces = make_lw_shared<std::vector<sstring>>(_db.local().get_non_system_keyspaces());
+    return do_for_each(*keyspaces, [this, keyspaces, streamer] (sstring& keyspace_name) {
        auto& ks = _db.local().find_keyspace(keyspace_name);
        auto& strategy = ks.get_replication_strategy();
        dht::token_range_vector ranges = strategy.get_pending_address_ranges(_token_metadata, _tokens, _address);
        blogger.debug("Will stream keyspace={}, ranges={}", keyspace_name, ranges);
-        streamer->add_ranges(keyspace_name, ranges);
-    }
-
-    return streamer->stream_async().then([streamer] () {
-        service::get_local_storage_service().finish_bootstrapping();
-    }).handle_exception([streamer] (std::exception_ptr eptr) {
-        blogger.warn("Error during bootstrap: {}", eptr);
-        return make_exception_future<>(std::move(eptr));
+        return streamer->add_ranges(keyspace_name, ranges);
+    }).then([this, streamer] {
+        return streamer->stream_async().then([streamer] () {
+            service::get_local_storage_service().finish_bootstrapping();
+        }).handle_exception([streamer] (std::exception_ptr eptr) {
+            blogger.warn("Error during bootstrap: {}", eptr);
+            return make_exception_future<>(std::move(eptr));
+        });
    });
+
 }

 std::unordered_set<token> boot_strapper::get_bootstrap_tokens(token_metadata metadata, database& db) {
--- a/dht/range_streamer.cc
+++ b/dht/range_streamer.cc
@@ -114,6 +114,9 @@ range_streamer::get_all_ranges_with_sources_for(const sstring& keyspace_name, dh
    for (auto& desired_range : desired_ranges) {
        auto found = false;
        for (auto& x : range_addresses) {
+            if (need_preempt()) {
+                seastar::thread::yield();
+            }
            const range<token>& src_range = x.first;
            if (src_range.contains(desired_range, dht::tri_compare)) {
                std::vector<inet_address>& addresses = x.second;
@@ -157,6 +160,9 @@ range_streamer::get_all_ranges_with_strict_sources_for(const sstring& keyspace_n
    for (auto& desired_range : desired_ranges) {
        for (auto& x : range_addresses) {
            const range<token>& src_range = x.first;
+            if (need_preempt()) {
+                seastar::thread::yield();
+            }
            if (src_range.contains(desired_range, dht::tri_compare)) {
                std::vector<inet_address> old_endpoints(x.second.begin(), x.second.end());
                auto it = pending_range_addresses.find(desired_range);
@@ -226,7 +232,8 @@ void range_streamer::add_rx_ranges(const sstring& keyspace_name, std::unordered_
 }

 // TODO: This is the legacy range_streamer interface, it is add_rx_ranges which adds rx ranges.
-void range_streamer::add_ranges(const sstring& keyspace_name, dht::token_range_vector ranges) {
+future<> range_streamer::add_ranges(const sstring& keyspace_name, dht::token_range_vector ranges) {
+  return seastar::async([this, keyspace_name, ranges= std::move(ranges)] () mutable {
    if (_nr_tx_added) {
        throw std::runtime_error("Mixed sending and receiving is not supported");
    }
@@ -249,6 +256,7 @@ void range_streamer::add_ranges(const sstring& keyspace_name, dht::token_range_v
        }
    }
    _to_stream.emplace(keyspace_name, std::move(range_fetch_map));
+  });
 }

 future<> range_streamer::stream_async() {
@@ -294,7 +302,7 @@ future<> range_streamer::do_stream_async() {
                size_t nr_ranges_per_stream_plan = nr_ranges_total / 10;
                dht::token_range_vector ranges_to_stream;
                auto do_streaming = [&] {
-                    auto sp = stream_plan(sprint("%s-%s-index-%d", description, keyspace, sp_index++));
+                    auto sp = stream_plan(sprint("%s-%s-index-%d", description, keyspace, sp_index++), _reason);
                    logger.info("{} with {} for keyspace={}, {} out of {} ranges: ranges = {}",
                            description, source, keyspace, nr_ranges_streamed, nr_ranges_total, ranges_to_stream.size());
                    if (_nr_rx_added) {
--- a/dht/range_streamer.hh
+++ b/dht/range_streamer.hh
@@ -42,6 +42,7 @@
 #include "locator/snitch_base.hh"
 #include "streaming/stream_plan.hh"
 #include "streaming/stream_state.hh"
+#include "streaming/stream_reason.hh"
 #include "gms/inet_address.hh"
 #include "gms/i_failure_detector.hh"
 #include "range.hh"
@@ -101,24 +102,25 @@ public:
        }
    };

-    range_streamer(distributed<database>& db, token_metadata& tm, std::unordered_set<token> tokens, inet_address address, sstring description)
+    range_streamer(distributed<database>& db, token_metadata& tm, std::unordered_set<token> tokens, inet_address address, sstring description, streaming::stream_reason reason)
        : _db(db)
        , _metadata(tm)
        , _tokens(std::move(tokens))
        , _address(address)
        , _description(std::move(description))
+        , _reason(reason)
        , _stream_plan(_description) {
    }

-    range_streamer(distributed<database>& db, token_metadata& tm, inet_address address, sstring description)
-        : range_streamer(db, tm, std::unordered_set<token>(), address, description) {
+    range_streamer(distributed<database>& db, token_metadata& tm, inet_address address, sstring description, streaming::stream_reason reason)
+        : range_streamer(db, tm, std::unordered_set<token>(), address, description, reason) {
    }

    void add_source_filter(std::unique_ptr<i_source_filter> filter) {
        _source_filters.emplace(std::move(filter));
    }

-    void add_ranges(const sstring& keyspace_name, dht::token_range_vector ranges);
+    future<> add_ranges(const sstring& keyspace_name, dht::token_range_vector ranges);
    void add_tx_ranges(const sstring& keyspace_name, std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint);
    void add_rx_ranges(const sstring& keyspace_name, std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint);
 private:
@@ -166,6 +168,7 @@ private:
    std::unordered_set<token> _tokens;
    inet_address _address;
    sstring _description;
+    streaming::stream_reason _reason;
    std::unordered_multimap<sstring, std::unordered_map<inet_address, dht::token_range_vector>> _to_stream;
    std::unordered_set<std::unique_ptr<i_source_filter>> _source_filters;
    stream_plan _stream_plan;
--- a/dist/ami/build_ami.sh
+++ b/dist/ami/build_ami.sh
@@ -78,7 +78,7 @@ if [ $LOCALRPM -eq 1 ]; then
    fi
    if [ ! -f dist/ami/files/scylla-jmx.noarch.rpm ]; then
        cd build
-        git clone --depth 1 https://github.com/scylladb/scylla-jmx.git
+        git clone -b branch-3.0 --depth 1 https://github.com/scylladb/scylla-jmx.git
        cd scylla-jmx
        dist/redhat/build_rpm.sh --target epel-7-x86_64
        cd ../..
--- a/dist/common/scripts/node_exporter_install
+++ b/dist/common/scripts/node_exporter_install
@@ -25,7 +25,7 @@ import tempfile
 import tarfile
 from scylla_util import *

-VERSION='0.14.0'
+VERSION='0.17.0'
 INSTALL_DIR='/usr/lib/scylla/Prometheus/node_exporter'

 if __name__ == '__main__':
--- a/dist/common/scripts/scylla_prepare
+++ b/dist/common/scripts/scylla_prepare
@@ -62,10 +62,9 @@ if __name__ == '__main__':
            run('hugeadm --create-mounts')
        fi
    else:
-        set_nic = cfg.get('SET_NIC')
+        set_nic_and_disks = get_set_nic_and_disks_config_value(cfg)
        ifname = cfg.get('IFNAME')
-        if set_nic  == 'yes':
+        if set_nic_and_disks == 'yes':
            create_perftune_conf(ifname)
-            run('/usr/lib/scylla/posix_net_conf.sh {IFNAME} --options-file /etc/scylla.d/perftune.yaml'.format(IFNAME=ifname))
+            run("{} --options-file /etc/scylla.d/perftune.yaml".format(perftune_base_command()))

-    run('/usr/lib/scylla/scylla-blocktune')
--- a/dist/common/scripts/scylla_setup
+++ b/dist/common/scripts/scylla_setup
@@ -122,8 +122,8 @@ if __name__ == '__main__':
                        help='specify NTP domain')
    parser.add_argument('--ami', action='store_true', default=False,
                        help='setup AMI instance')
-    parser.add_argument('--setup-nic', action='store_true', default=False,
-                        help='optimize NIC queue')
+    parser.add_argument('--setup-nic-and-disks', action='store_true', default=False,
+                        help='optimize NIC and disks')
    parser.add_argument('--developer-mode', action='store_true', default=False,
                        help='enable developer mode')
    parser.add_argument('--no-ec2-check', action='store_true', default=False,
@@ -173,7 +173,7 @@ if __name__ == '__main__':

    disks = args.disks
    nic = args.nic
-    set_nic = args.setup_nic
+    set_nic_and_disks = args.setup_nic_and_disks
    ec2_check = not args.no_ec2_check
    kernel_check = not args.no_kernel_check
    verify_package = not args.no_verify_package
@@ -336,11 +336,11 @@ if __name__ == '__main__':
    if interactive:
        sysconfig_setup = interactive_ask_service('Do you want to setup a system-wide customized configuration for Scylla?', 'Yes - setup the sysconfig file. No - skips this step.', 'yes')
    if sysconfig_setup:
-        nic = interactive_choose_nic()
        if interactive:
-            set_nic = interactive_ask_service('Do you want to enable Network Interface Card (NIC) optimization?', 'Yes - optimize the NIC queue settings. Selecting Yes greatly improves performance. No - skip this step.', 'yes')
+            nic = interactive_choose_nic()
+            set_nic_and_disks = interactive_ask_service('Do you want to enable Network Interface Card (NIC) and disk(s) optimization?', 'Yes - optimize the NIC queue and disks settings. Selecting Yes greatly improves performance. No - skip this step.', 'yes')
    if sysconfig_setup:
-        setup_args = '--setup-nic' if set_nic else ''
+        setup_args = '--setup-nic-and-disks' if set_nic_and_disks else ''
        run_setup_script('NIC queue', '/usr/lib/scylla/scylla_sysconfig_setup --nic {nic} {setup_args}'.format(nic=nic, setup_args=setup_args))

    if interactive:
--- a/dist/common/scripts/scylla_sysconfig_setup
+++ b/dist/common/scripts/scylla_sysconfig_setup
@@ -40,7 +40,7 @@ if __name__ == '__main__':
        cfg = sysconfig_parser('/etc/sysconfig/scylla-server')
    else:
        cfg = sysconfig_parser('/etc/default/scylla-server')
-    set_nic = str2bool(cfg.get('SET_NIC'))
+    set_nic_and_disks = str2bool(get_set_nic_and_disks_config_value(cfg))
    ami = str2bool(cfg.get('AMI'))

    parser = argparse.ArgumentParser(description='Setting parameters on Scylla sysconfig file.')
@@ -58,8 +58,8 @@ if __name__ == '__main__':
                        help='scylla home directory')
    parser.add_argument('--confdir',
                        help='scylla config directory')
-    parser.add_argument('--setup-nic', action='store_true', default=set_nic,
-                        help='setup NIC\'s interrupts, RPS, XPS')
+    parser.add_argument('--setup-nic-and-disks', action='store_true', default=set_nic_and_disks,
+                        help='setup NIC\'s and disks\' interrupts, RPS, XPS, nomerges and I/O scheduler')
    parser.add_argument('--ami', action='store_true', default=ami,
                        help='AMI instance mode')
    args = parser.parse_args()
@@ -71,8 +71,8 @@ if __name__ == '__main__':
    ifname = args.nic if args.nic else cfg.get('IFNAME')
    network_mode = args.mode if args.mode else cfg.get('NETWORK_MODE')

-    if args.setup_nic:
-        rps_cpus = out('/usr/lib/scylla/posix_net_conf.sh --cpu-mask {}'.format(ifname))
+    if args.setup_nic_and_disks:
+        rps_cpus = out('{} --tune net --nic {} --get-cpu-mask'.format(perftune_base_command(), ifname))
        if len(rps_cpus) > 0:
            cpuset = hex2list(rps_cpus)
            run('/usr/lib/scylla/scylla_cpuset_setup --cpuset {}'.format(cpuset))
@@ -104,8 +104,13 @@ if __name__ == '__main__':
        cfg.set('SCYLLA_HOME', args.homedir)
    if args.confdir:
        cfg.set('SCYLLA_CONF', args.confdir)
-    if str2bool(cfg.get('SET_NIC')) != args.setup_nic:
-        cfg.set('SET_NIC', bool2str(args.setup_nic))
+
+    if str2bool(get_set_nic_and_disks_config_value(cfg)) != args.setup_nic_and_disks:
+        if cfg.has_option('SET_NIC'):
+            cfg.set('SET_NIC', bool2str(args.setup_nic_and_disks))
+        else:
+            cfg.set('SET_NIC_AND_DISKS', bool2str(args.setup_nic_and_disks))
+
    if str2bool(cfg.get('AMI')) != args.ami:
        cfg.set('AMI', bool2str(args.ami))
    cfg.commit()
--- a/dist/common/scripts/scylla_util.py
+++ b/dist/common/scripts/scylla_util.py
@@ -28,6 +28,7 @@ import time
 import urllib.error
 import urllib.parse
 import urllib.request
+import yaml


 def curl(url, byte=False):
@@ -384,6 +385,37 @@ def get_mode_cpuset(nic, mode):
    except subprocess.CalledProcessError:
        return '-1'

+def get_scylla_dirs():
+    """
+    Returns a list of scylla directories configured in /etc/scylla/scylla.yaml.
+    Verifies that mandatory parameters are set and raises Exception if not.
+    """
+    scylla_yaml_name = '/etc/scylla/scylla.yaml'
+    # safe_load is enough for a plain config file and cannot construct objects
+    with open(scylla_yaml_name) as f:
+        y = yaml.safe_load(f) or {}  # an empty file parses to None
+
+    # Drop empty list entries (parsed as None) before validating
+    data_dirs = [d for d in (y.get('data_file_directories') or []) if d is not None]
+
+    # Check that mandatory fields are set
+    if not " ".join(data_dirs).strip():
+        raise Exception("{}: at least one directory has to be set in 'data_file_directories'".format(scylla_yaml_name))
+    if not y.get('commitlog_directory'):
+        raise Exception("{}: 'commitlog_directory' has to be set".format(scylla_yaml_name))
+
+    dirs = data_dirs + [y['commitlog_directory']]
+
+    # Optional directories are only added when configured
+    for key in ('hints_directory', 'view_hints_directory'):
+        if y.get(key):
+            dirs.append(y[key])
+
+    return dirs
+
+def perftune_base_command():  # perftune.py invocation with disk tuning for every scylla directory
+    dir_args = " ".join(["--dir {}".format(path) for path in get_scylla_dirs()])
+    return '/usr/lib/scylla/perftune.py {}'.format("--tune disks " + dir_args)

 def get_cur_cpuset():
    cfg = sysconfig_parser('/etc/scylla.d/cpuset.conf')
@@ -419,6 +451,25 @@ def create_perftune_conf(nic='eth0'):
 def is_valid_nic(nic):
    return os.path.exists('/sys/class/net/{}'.format(nic))

+# Remove this when we do not support SET_NIC configuration value anymore
+def get_set_nic_and_disks_config_value(cfg):
+    """
+    Get the SET_NIC_AND_DISKS configuration value.
+    Return the SET_NIC configuration value if SET_NIC_AND_DISKS is not found (old releases case).
+    :param cfg: sysconfig_parser object
+    :return configuration value
+    :except If the configuration value is not found
+    """
+
+    # Sanity check
+    if cfg.has_option('SET_NIC_AND_DISKS') and cfg.has_option('SET_NIC'):
+        raise Exception("Only one of 'SET_NIC_AND_DISKS' and 'SET_NIC' is allowed to be present")
+
+    # Look the key up explicitly instead of catching every exception
+    if cfg.has_option('SET_NIC_AND_DISKS'):
+        return cfg.get('SET_NIC_AND_DISKS')
+    # For backwards compatibility
+    return cfg.get('SET_NIC')

 class SystemdException(Exception):
    pass
@@ -483,8 +534,11 @@ class sysconfig_parser:
    def get(self, key):
        return self._cfg.get('global', key).strip('"')

+    def has_option(self, key):
+        return self._cfg.has_option('global', key)
+
    def set(self, key, val):
-        if not self._cfg.has_option('global', key):
+        if not self.has_option(key):
            return self.__add(key, val)
        self._data = re.sub('^{}=[^\n]*$'.format(key), '{}="{}"'.format(key, self.__escape(val)), self._data, flags=re.MULTILINE)
        self.__load()
--- a/dist/common/sysconfig/scylla-server
+++ b/dist/common/sysconfig/scylla-server
@@ -10,8 +10,8 @@ BRIDGE=virbr0
 # ethernet device name
 IFNAME=eth0

-# setup NIC's interrupts, RPS, XPS (posix)
-SET_NIC=no
+# setup NIC's and disks' interrupts, RPS, XPS, nomerges and I/O scheduler (posix)
+SET_NIC_AND_DISKS=no

 # ethernet device driver (dpdk)
 ETHDRV=
--- a/dist/common/sysctl.d/99-scylla-aio.conf
+++ b/dist/common/sysctl.d/99-scylla-aio.conf
@@ -0,0 +1,2 @@
+# Raise max AIO events
+fs.aio-max-nr = 1048576
--- a/dist/common/systemd/node-exporter.service
+++ b/dist/common/systemd/node-exporter.service
@@ -5,7 +5,7 @@ Description=Node Exporter
 Type=simple
 User=scylla
 Group=scylla
-ExecStart=/usr/bin/node_exporter -collectors.enabled interrupts,conntrack,diskstats,entropy,filefd,filesystem,loadavg,mdadm,meminfo,netdev,netstat,sockstat,stat,textfile,time,uname,vmstat
+ExecStart=/usr/bin/node_exporter  --collector.interrupts

 [Install]
 WantedBy=multi-user.target
--- a/dist/common/systemd/scylla-housekeeping-restart.service.mustache
+++ b/dist/common/systemd/scylla-housekeeping-restart.service.mustache
@@ -6,7 +6,12 @@ After=network.target
 Type=simple
 User=scylla
 Group=scylla
+{{#debian}}
+ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files '/etc/apt/sources.list.d/scylla*.list' version --mode r
+{{/debian}}
+{{#redhat}}
 ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files '/etc/yum.repos.d/scylla*.repo' version --mode r
+{{/redhat}}

 [Install]
 WantedBy=multi-user.target
--- a/dist/debian/debian/scylla-kernel-conf.install
+++ b/dist/debian/debian/scylla-kernel-conf.install
@@ -1 +1,2 @@
 dist/common/sysctl.d/99-scylla-sched.conf /etc/sysctl.d
+dist/common/sysctl.d/99-scylla-aio.conf /etc/sysctl.d
--- a/dist/debian/debian/scylla-kernel-conf.postinst
+++ b/dist/debian/debian/scylla-kernel-conf.postinst
@@ -9,6 +9,7 @@ if [[ $KVER =~ 3\.13\.0\-([0-9]+)-generic ]]; then
 else
    # expect failures in virtualized environments
    sysctl -p/etc/sysctl.d/99-scylla-sched.conf || :
+    sysctl -p/etc/sysctl.d/99-scylla-aio.conf || :
 fi

 #DEBHELPER#
--- a/dist/debian/debian/scylla-server.dirs
+++ b/dist/debian/debian/scylla-server.dirs
@@ -4,5 +4,6 @@ var/lib/scylla
 var/lib/scylla/data
 var/lib/scylla/commitlog
 var/lib/scylla/hints
+var/lib/scylla/view_hints
 var/lib/scylla/coredump
 var/lib/scylla-housekeeping
--- a/dist/debian/rules.mustache
+++ b/dist/debian/rules.mustache
@@ -4,7 +4,7 @@ export PYBUILD_DISABLE=1
 jobs := $(shell echo $$DEB_BUILD_OPTIONS | sed -r "s/.*parallel=([0-9]+).*/-j\1/")

 override_dh_auto_configure:
-	./configure.py --with=scylla --with=iotune --enable-dpdk --mode=release --static-thrift --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7 --cflags="-I/opt/scylladb/include -L/opt/scylladb/lib/x86-linux-gnu/" --ldflags="-Wl,-rpath=/opt/scylladb/lib"
+	./configure.py --with=scylla --with=iotune --enable-dpdk --mode=release --static-thrift --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7 --c-compiler=/opt/scylladb/bin/gcc-7 --cflags="-I/opt/scylladb/include -L/opt/scylladb/lib/x86-linux-gnu/" --ldflags="-Wl,-rpath=/opt/scylladb/lib"

 override_dh_auto_build:
 	PATH="/opt/scylladb/bin:$$PATH" ninja $(jobs)
--- a/dist/debian/scylla-server.install.mustache
+++ b/dist/debian/scylla-server.install.mustache
@@ -1,7 +1,6 @@
 dist/common/limits.d/scylla.conf etc/security/limits.d
 dist/common/scylla.d/*.conf etc/scylla.d
 seastar/dpdk/usertools/dpdk-devbind.py usr/lib/scylla
-seastar/scripts/posix_net_conf.sh usr/lib/scylla
 seastar/scripts/perftune.py usr/lib/scylla
 dist/common/scripts/* usr/lib/scylla
 scylla-housekeeping usr/lib/scylla
--- a/dist/docker/redhat/Dockerfile
+++ b/dist/docker/redhat/Dockerfile
@@ -26,7 +26,7 @@ ADD commandlineparser.py /commandlineparser.py
 ADD docker-entrypoint.py /docker-entrypoint.py

 # Install Scylla:
-RUN curl http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo -o /etc/yum.repos.d/scylla.repo && \
+RUN curl http://downloads.scylladb.com/rpm/centos/scylla-3.0.repo -o /etc/yum.repos.d/scylla.repo && \
    yum -y install epel-release && \
    yum -y clean expire-cache && \
    yum -y update && \
--- a/dist/docker/redhat/etc/sysconfig/scylla-server
+++ b/dist/docker/redhat/etc/sysconfig/scylla-server
@@ -10,8 +10,8 @@ BRIDGE=virbr0
 # ethernet device name
 IFNAME=eth0

-# setup NIC's interrupts, RPS, XPS (posix)
-SET_NIC=no
+# setup NIC's and disks' interrupts, RPS, XPS, nomerges and I/O scheduler (posix)
+SET_NIC_AND_DISKS=no

 # ethernet device driver (dpdk)
 ETHDRV=
--- a/dist/offline_installer/redhat/build_offline_installer.sh
+++ b/dist/offline_installer/redhat/build_offline_installer.sh
@@ -91,7 +91,27 @@ mkdir -p build/offline_installer
 cp dist/offline_installer/redhat/header build/offline_installer
 sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve scylla
 # XXX: resolve option doesn't fetch some dependencies, need to manually fetch them
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve sudo.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve ntp.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libedit.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve ntpdate.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve net-tools.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve kernel
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve grubby.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve linux-firmware
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve initscripts.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve iproute.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve iptables.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libnfnetlink.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libnetfilter_conntrack.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libmnl.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve sysvinit-tools.x86_64
 sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve yajl.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve mdadm.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libreport-filesystem.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve xfsprogs.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve PyYAML.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libyaml.x86_64
 sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libjpeg-turbo.x86_64
 sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libaio.x86_64
 sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve snappy.x86_64
--- a/dist/redhat/build_rpm.sh
+++ b/dist/redhat/build_rpm.sh
@@ -108,11 +108,11 @@ fix_ownership() {
 if [ $JOBS -gt 0 ]; then
    RPM_JOBS_OPTS=(--define="_smp_mflags -j$JOBS")
 fi
-sudo mock --buildsrpm --root=$TARGET --resultdir=`pwd`/build/srpms --spec=build/scylla.spec --sources=build/$PRODUCT-$VERSION.tar $SRPM_OPTS "${RPM_JOBS_OPTS[@]}"
+sudo mock --rootdir=`pwd`/build/mock --buildsrpm --root=$TARGET --resultdir=`pwd`/build/srpms --spec=build/scylla.spec --sources=build/$PRODUCT-$VERSION.tar $SRPM_OPTS "${RPM_JOBS_OPTS[@]}"
 fix_ownership build/srpms
 if [[ "$TARGET" =~ ^epel-7- ]]; then
    TARGET=scylla-$TARGET
    RPM_OPTS="$RPM_OPTS --configdir=dist/redhat/mock"
 fi
-sudo mock --rebuild --root=$TARGET --resultdir=`pwd`/build/rpms $RPM_OPTS "${RPM_JOBS_OPTS[@]}" build/srpms/$PRODUCT-$VERSION*.src.rpm
+sudo mock --rootdir=`pwd`/build/mock --rebuild --root=$TARGET --resultdir=`pwd`/build/rpms $RPM_OPTS "${RPM_JOBS_OPTS[@]}" build/srpms/$PRODUCT-$VERSION*.src.rpm
 fix_ownership build/rpms
--- a/dist/redhat/scylla.spec.mustache
+++ b/dist/redhat/scylla.spec.mustache
@@ -56,7 +56,7 @@ License:        AGPLv3
 URL:            http://www.scylladb.com/
 BuildRequires:  libaio-devel libstdc++-devel cryptopp-devel hwloc-devel numactl-devel libpciaccess-devel libxml2-devel zlib-devel thrift-devel yaml-cpp-devel lz4-devel snappy-devel jsoncpp-devel systemd-devel xz-devel pcre-devel elfutils-libelf-devel bzip2-devel keyutils-libs-devel xfsprogs-devel make gnutls-devel systemd-devel lksctp-tools-devel protobuf-devel protobuf-compiler systemtap-sdt-devel ninja-build cmake python ragel grep kernel-headers
 %{?fedora:BuildRequires: boost-devel antlr3-tool antlr3-C++-devel python3 gcc-c++ libasan libubsan python3-pyparsing dnf-yum python2-pystache}
-%{?rhel:BuildRequires: scylla-libstdc++73-static scylla-boost163-devel scylla-boost163-static scylla-antlr35-tool scylla-antlr35-C++-devel python34 scylla-gcc73-c++, scylla-python34-pyparsing20 yaml-cpp-static pystache python-setuptools}
+%{?rhel:BuildRequires: scylla-libstdc++73-static scylla-libatomic73-static scylla-boost163-devel scylla-boost163-static scylla-antlr35-tool scylla-antlr35-C++-devel python34 scylla-gcc73-c++, scylla-python34-pyparsing20 yaml-cpp-static pystache python-setuptools}
 Requires:       {{product}}-conf systemd-libs hwloc PyYAML python-urwid pciutils pyparsing python-requests curl util-linux python-setuptools pciutils python3-pyudev mdadm xfsprogs
 %{?rhel:Requires: python34 python34-PyYAML kernel >= 3.10.0-514}
 %{?fedora:Requires: python3 python3-PyYAML}
@@ -97,7 +97,7 @@ cflags="--cflags=${defines[*]}"
 %endif
 %if 0%{?rhel}
 . /etc/profile.d/scylla.sh
-python3.4 ./configure.py %{?configure_opt} --with=scylla --with=iotune --mode=release "$cflags" --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7.3 --python python3.4 --ldflag=-Wl,-rpath=/opt/scylladb/lib64
+python3.4 ./configure.py %{?configure_opt} --with=scylla --with=iotune --mode=release "$cflags" --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7.3 --c-compiler=/opt/scylladb/bin/gcc-7.3 --python python3.4 --ldflag=-Wl,-rpath=/opt/scylladb/lib64
 %endif
 ninja-build %{?_smp_mflags} build/release/scylla build/release/iotune

@@ -193,7 +193,6 @@ rm -rf $RPM_BUILD_ROOT
 %{_prefix}/lib/scylla/scylla_cpuscaling_setup
 %{_prefix}/lib/scylla/scylla_fstrim
 %{_prefix}/lib/scylla/scylla_fstrim_setup
-%{_prefix}/lib/scylla/posix_net_conf.sh
 %{_prefix}/lib/scylla/perftune.py
 %{_prefix}/lib/scylla/dpdk-devbind.py
 %{_prefix}/lib/scylla/hex2list.py
@@ -209,6 +208,7 @@ rm -rf $RPM_BUILD_ROOT
 %attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/data
 %attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/commitlog
 %attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/hints
+%attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/view_hints
 %attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/coredump
 %attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla-housekeeping
 %ghost /etc/systemd/system/scylla-server.service.d/
@@ -283,6 +283,7 @@ if Scylla is the main application on your server and you wish to optimize its la
 # We cannot use the sysctl_apply rpm macro because it is not present in 7.0
 # following is a "manual" expansion
 /usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
+/usr/lib/systemd/systemd-sysctl 99-scylla-aio.conf >/dev/null 2>&1 || :

 %files kernel-conf
 %defattr(-,root,root)
--- a/docs/docker-hub.md
+++ b/docs/docker-hub.md
@@ -66,7 +66,7 @@ You can use Docker volumes to improve performance of Scylla.
 Create a Scylla data directory ``/var/lib/scylla`` on the host, which is used by Scylla container to store all data:

 ```console
-$ sudo mkdir -p /var/lib/scylla/data /var/lib/scylla/commitlog /var/lib/scylla/hints
+$ sudo mkdir -p /var/lib/scylla/data /var/lib/scylla/commitlog /var/lib/scylla/hints /var/lib/scylla/view_hints
 ```

 Launch Scylla using Docker's ``--volume`` command line option to mount the created host directory as a data volume in the container and disable Scylla's developer mode to run I/O tuning before starting up the Scylla node.
--- a/encoding_stats.hh
+++ b/encoding_stats.hh
@@ -41,12 +41,11 @@ struct encoding_stats {
    //        int DELETION_TIME_EPOCH = (int)(c.getTimeInMillis() / 1000); // local deletion times are in seconds
    // Encoding stats are used for delta-encoding, so we want some default values
    // that are just good enough so we take some recent date in the past
-    static constexpr uint32_t deletion_time_epoch = 1442880000;
+    static constexpr int32_t deletion_time_epoch = 1442880000;
    static constexpr api::timestamp_type timestamp_epoch = api::timestamp_type(deletion_time_epoch) * 1000 * 1000;
-    static constexpr uint32_t ttl_epoch = 0;
+    static constexpr int32_t ttl_epoch = 0;

    api::timestamp_type min_timestamp = timestamp_epoch;
-    uint32_t min_local_deletion_time = deletion_time_epoch;
-    uint32_t min_ttl = ttl_epoch;
+    int32_t min_local_deletion_time = deletion_time_epoch;
+    int32_t min_ttl = ttl_epoch;
 };
-
--- a/frozen_mutation.hh
+++ b/frozen_mutation.hh
@@ -78,6 +78,11 @@ public:

 frozen_mutation freeze(const mutation& m);

+struct frozen_mutation_and_schema {
+    frozen_mutation fm;
+    schema_ptr s;
+};
+
 // Can receive streamed_mutation in reversed order.
 class streamed_mutation_freezer {
    const schema& _schema;
--- a/gms/application_state.cc
+++ b/gms/application_state.cc
@@ -63,6 +63,8 @@ static const std::map<application_state, sstring> application_state_names = {
    {application_state::SUPPORTED_FEATURES,     "SUPPORTED_FEATURES"},
    {application_state::CACHE_HITRATES,         "CACHE_HITRATES"},
    {application_state::SCHEMA_TABLES_VERSION,  "SCHEMA_TABLES_VERSION"},
+    {application_state::RPC_READY,              "RPC_READY"},
+    {application_state::VIEW_BACKLOG,           "VIEW_BACKLOG"},
 };

 std::ostream& operator<<(std::ostream& os, const application_state& m) {
--- a/gms/application_state.hh
+++ b/gms/application_state.hh
@@ -60,9 +60,9 @@ enum class application_state {
    SUPPORTED_FEATURES,
    CACHE_HITRATES,
    SCHEMA_TABLES_VERSION,
+    RPC_READY,
+    VIEW_BACKLOG,
    // pad to allow adding new states to existing cluster
-    X4,
-    X5,
    X6,
    X7,
    X8,
--- a/gms/endpoint_state.cc
+++ b/gms/endpoint_state.cc
@@ -61,4 +61,16 @@ std::ostream& operator<<(std::ostream& os, const endpoint_state& x) {
    return os;
 }

+bool endpoint_state::is_cql_ready() const {
+    const auto* rpc_ready = get_application_state_ptr(application_state::RPC_READY);
+    if (rpc_ready == nullptr) {
+        return false; // RPC_READY absent: reported as not ready
+    }
+    try {
+        return boost::lexical_cast<int>(rpc_ready->value) != 0;
+    } catch (...) {
+        return false; // value is not an integer: reported as not ready
+    }
+}
+
 }
--- a/gms/endpoint_state.hh
+++ b/gms/endpoint_state.hh
@@ -129,26 +129,8 @@ public:
        update_is_normal();
    }

-    void apply_application_state(application_state key, versioned_value&& value) {
-        auto&& e = _application_state[key];
-        if (e.version < value.version) {
-            e = std::move(value);
-        }
-        update_is_normal();
-    }
-
-    void apply_application_state(application_state key, const versioned_value& value) {
-        auto&& e = _application_state[key];
-        if (e.version < value.version) {
-            e = value;
-        }
-        update_is_normal();
-    }
-
-    void apply_application_state(const endpoint_state& es) {
-        for (auto&& e : es._application_state) {
-            apply_application_state(e.first, e.second);
-        }
+    void add_application_state(const endpoint_state& es) {
+        _application_state = es._application_state;
        update_is_normal();
    }

@@ -208,6 +190,8 @@ public:
        _is_normal = get_status() == sstring(versioned_value::STATUS_NORMAL);
    }

+    bool is_cql_ready() const;
+
    friend std::ostream& operator<<(std::ostream& os, const endpoint_state& x);
 };

--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -930,7 +930,7 @@ void gossiper::make_random_gossip_digest(utils::chunked_vector<gossip_digest>& g
 future<> gossiper::replicate(inet_address ep, const endpoint_state& es) {
    return container().invoke_on_all([ep, es, orig = engine().cpu_id(), self = shared_from_this()] (gossiper& g) {
        if (engine().cpu_id() != orig) {
-            g.endpoint_state_map[ep].apply_application_state(es);
+            g.endpoint_state_map[ep].add_application_state(es);
        }
    });
 }
@@ -939,7 +939,7 @@ future<> gossiper::replicate(inet_address ep, const std::map<application_state,
    return container().invoke_on_all([ep, &src, &changed, orig = engine().cpu_id(), self = shared_from_this()] (gossiper& g) {
        if (engine().cpu_id() != orig) {
            for (auto&& key : changed) {
-                g.endpoint_state_map[ep].apply_application_state(key, src.at(key));
+                g.endpoint_state_map[ep].add_application_state(key, src.at(key));
            }
        }
    });
@@ -948,7 +948,7 @@ future<> gossiper::replicate(inet_address ep, const std::map<application_state,
 future<> gossiper::replicate(inet_address ep, application_state key, const versioned_value& value) {
    return container().invoke_on_all([ep, key, &value, orig = engine().cpu_id(), self = shared_from_this()] (gossiper& g) {
        if (engine().cpu_id() != orig) {
-            g.endpoint_state_map[ep].apply_application_state(key, value);
+            g.endpoint_state_map[ep].add_application_state(key, value);
        }
    });
 }
@@ -1175,11 +1175,13 @@ stdx::optional<endpoint_state> gossiper::get_endpoint_state_for_endpoint(inet_ad
    }
 }

-void gossiper::reset_endpoint_state_map() {
-    endpoint_state_map.clear();
+future<> gossiper::reset_endpoint_state_map() {
    _unreachable_endpoints.clear();
    _live_endpoints.clear();
    _live_endpoints_just_added.clear();
+    return container().invoke_on_all([] (gossiper& g) {
+        g.endpoint_state_map.clear();
+    });
 }

 std::unordered_map<inet_address, endpoint_state>& gms::gossiper::get_endpoint_states() {
@@ -1191,6 +1193,25 @@ bool gossiper::uses_host_id(inet_address endpoint) {
            get_application_state_ptr(endpoint, application_state::NET_VERSION);
 }

+bool gossiper::is_cql_ready(const inet_address& endpoint) const {
+    // Note:
+    // - A new scylla node always sends application_state::RPC_READY = false
+    // when it boots and application_state::RPC_READY = true once its cql
+    // server is up.
+    // - An old scylla node that does not support application_state::RPC_READY
+    // never sends it. We return true only when the endpoint has no
+    // endpoint_state at all; NOTE(review): with an endpoint_state but no
+    // RPC_READY, endpoint_state::is_cql_ready() returns false - confirm intent.
+    auto* eps = get_endpoint_state_for_endpoint_ptr(endpoint);
+    if (!eps) {
+        logger.debug("Node {} does not have RPC_READY application_state, return is_cql_ready=true", endpoint);
+        return true;
+    }
+    auto ready = eps->is_cql_ready();
+    logger.debug("Node {}: is_cql_ready={}",  endpoint, ready);
+    return ready;
+}
+
 utils::UUID gossiper::get_host_id(inet_address endpoint) {
    if (!uses_host_id(endpoint)) {
        throw std::runtime_error(sprint("Host %s does not use new-style tokens!", endpoint));
@@ -1298,6 +1319,14 @@ void gossiper::mark_alive(inet_address addr, endpoint_state& local_state) {
 // Runs inside seastar::async context
 void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
    logger.trace("marking as alive {}", addr);
+
+    // Do not mark a node with status shutdown as UP.
+    auto status = get_gossip_status(local_state);
+    if (status == sstring(versioned_value::SHUTDOWN)) {
+        logger.warn("Skip marking node {} with status = {} as UP", addr, status);
+        return;
+    }
+
    local_state.mark_alive();
    local_state.update_timestamp(); // prevents do_status_check from racing us and evicting if it was down > A_VERY_LONG_TIME

@@ -1319,7 +1348,7 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
    }

    if (!_in_shadow_round) {
-        logger.info("InetAddress {} is now UP, status = {}", addr, get_gossip_status(local_state));
+        logger.info("InetAddress {} is now UP, status = {}", addr, status);
    }

    _subscribers.for_each([addr, local_state] (auto& subscriber) {
@@ -1662,6 +1691,7 @@ void gossiper::maybe_initialize_local_state(int generation_nbr) {
    }
 }

+// Runs inside seastar::async context
 void gossiper::add_saved_endpoint(inet_address ep) {
    if (ep == get_broadcast_address()) {
        logger.debug("Attempt to add self as saved endpoint");
@@ -1687,6 +1717,7 @@ void gossiper::add_saved_endpoint(inet_address ep) {
    }
    ep_state.mark_dead();
    endpoint_state_map[ep] = ep_state;
+    replicate(ep, ep_state).get();
    _unreachable_endpoints[ep] = now();
    logger.trace("Adding saved endpoint {} {}", ep, ep_state.get_heart_beat_state().get_generation());
 }
@@ -1924,6 +1955,7 @@ void gossiper::mark_as_shutdown(const inet_address& endpoint) {
        auto& ep_state = *es;
        ep_state.add_application_state(application_state::STATUS, storage_service_value_factory().shutdown(true));
        ep_state.get_heart_beat_state().force_highest_possible_version_unsafe();
+        replicate(endpoint, ep_state).get();
        mark_dead(endpoint, ep_state);
        get_local_failure_detector().force_conviction(endpoint);
    }
--- a/gms/gossiper.hh
+++ b/gms/gossiper.hh
@@ -417,7 +417,7 @@ public:
    stdx::optional<endpoint_state> get_endpoint_state_for_endpoint(inet_address ep) const;

    // removes ALL endpoint states; should only be called after shadow gossip
-    void reset_endpoint_state_map();
+    future<> reset_endpoint_state_map();

    std::unordered_map<inet_address, endpoint_state>& get_endpoint_states();

@@ -548,6 +548,7 @@ public:
    bool is_seed(const inet_address& endpoint) const;
    bool is_shutdown(const inet_address& endpoint) const;
    bool is_normal(const inet_address& endpoint) const;
+    bool is_cql_ready(const inet_address& endpoint) const;
    bool is_silent_shutdown_state(const endpoint_state& ep_state) const;
    void mark_as_shutdown(const inet_address& endpoint);
    void force_newer_generation();
--- a/gms/versioned_value.hh
+++ b/gms/versioned_value.hh
@@ -246,6 +246,9 @@ public:
            return versioned_value(hitrates);
        }

+        versioned_value cql_ready(bool value) {
+            return versioned_value(to_sstring(value ? 1 : 0)); // flag is carried as an integer string
+        }
    };
 }; // class versioned_value

--- a/idl/streaming.idl.hh
+++ b/idl/streaming.idl.hh
@@ -42,4 +42,13 @@ class prepare_message {
    uint32_t dst_cpu_id;
 };

+enum class stream_reason : uint8_t {
+    unspecified,
+    bootstrap,
+    decommission,
+    removenode,
+    rebuild,
+    repair,
+};
+
 }
--- a/idl/view.idl.hh
+++ b/idl/view.idl.hh
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2018 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+namespace db {
+namespace view {
+class update_backlog {
+    size_t current;
+    size_t max;
+};
+}
+}
--- a/init.cc
+++ b/init.cc
@@ -100,6 +100,8 @@ void init_ms_fd_gossiper(sstring listen_address_in
            creds->set_x509_trust_file(ms_trust_store, x509_crt_format::PEM).get();
        }

+        creds->set_priority_string(db::config::default_tls_priority);
+
        if (!ms_tls_prio.empty()) {
            creds->set_priority_string(ms_tls_prio);
        }
--- a/install.sh
+++ b/install.sh
@@ -93,7 +93,6 @@ install -m644 build/*.service -Dt "$rprefix"/lib/systemd/system
 install -m644 dist/common/systemd/*.service -Dt "$rprefix"/lib/systemd/system
 install -m644 dist/common/systemd/*.timer -Dt "$rprefix"/lib/systemd/system
 install -m755 dist/common/scripts/* -Dt "$rprefix"/lib/scylla/
-install -m755 seastar/scripts/posix_net_conf.sh "$rprefix"/lib/scylla/
 install -m755 seastar/scripts/perftune.py -Dt "$rprefix"/lib/scylla/
 install -m755 seastar/dpdk/usertools/dpdk-devbind.py -Dt "$rprefix"/lib/scylla/
 install -m755 build/release/scylla -Dt "$rprefix/bin"
@@ -116,6 +115,7 @@ install -m755 -d "$root"/var/lib/scylla/
 install -m755 -d "$root"/var/lib/scylla/data
 install -m755 -d "$root"/var/lib/scylla/commitlog
 install -m755 -d "$root"/var/lib/scylla/hints
+install -m755 -d "$root"/var/lib/scylla/view_hints
 install -m755 -d "$root"/var/lib/scylla/coredump
 install -m755 -d "$root"/var/lib/scylla-housekeeping
 install -m755 -d "$rprefix"/lib/scylla/swagger-ui
--- a/1
+++ b/1
--- a/Show More
+++ b/Show More