Compare commits

...

145 Commits

Author SHA1 Message Date
Shlomi Livne
e265c91616 release: prepare for 2.0.0
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-09-29 21:12:18 +03:00
Tomasz Grabiec
8a9f8970e4 Update seastar submodule
Refs #2770.

* seastar d763623...b85b0fa (1):
  > scollectd: increment the metadata iterator with the values
2017-09-28 15:32:26 +02:00
Tomasz Grabiec
1e3c777a10 Update seastar submodule
* seastar c853473...d763623 (1):
  > rpc: make sure that _write_buf stream is always properly closed
2017-09-28 15:03:12 +02:00
Tomasz Grabiec
43d785a177 migration_manager: Make sure schema pulls eventually happen when schema_tables_v3 is enabled
We don't pull schema during a rolling upgrade, that is, until the
schema_tables_v3 feature is enabled on all nodes.

Because features are enabled from the gossiper timer, there is a race
between feature enablement and the processing of endpoint states, which
may trigger a schema pull. It can happen that we first try to pull, but
only enable the feature later. In that case the schema pull will not
happen until the next schema change.

The fix is to ensure that pulls abandoned because the feature was not
yet enabled are retried once it is enabled.

Fixes sporadic failure in dtest:

  repair_additional_test.py:RepairAdditionalTest.repair_schema_test
Message-Id: <1506428715-8182-2-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit b704710954)
2017-09-27 12:06:45 +01:00
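The retry-on-enable mechanism described above can be sketched as follows (the types and names here are hypothetical illustrations, not Scylla's actual gossiper API): a pull attempted while the feature is off registers itself to run again when the feature flips on.

```cpp
#include <functional>
#include <vector>

// Toy feature flag: callbacks queued while disabled run on enable().
struct feature {
    bool enabled = false;
    std::vector<std::function<void()>> when_enabled;

    void enable() {
        enabled = true;
        for (auto& f : when_enabled) f();  // fire abandoned-pull retries
        when_enabled.clear();
    }
};

int pulls_done = 0;

void maybe_pull(feature& f) {
    if (f.enabled) {
        ++pulls_done;  // feature on: pull proceeds
        return;
    }
    // Abandoned: schedule a retry for when the feature becomes enabled,
    // instead of silently dropping the pull until the next schema change.
    f.when_enabled.push_back([&f] { maybe_pull(f); });
}
```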
Tomasz Grabiec
b53d3d225d gossiper: Allow waiting for feature to be enabled
Message-Id: <1506428715-8182-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 7a58fb5767)
2017-09-27 12:06:37 +01:00
Paweł Dziepak
f9864686d2 Merge "Fix cache reader skipping rows in some cases" from Tomasz
"Fixes the problem of concurrent populations of clustering row ranges
leading to some readers skipping over some of the rows.
Spotted during code review.

Fixes #2834."

* tag 'tgrabiec/fix-cache-reader-skipping-rows-v2' of github.com:scylladb/seastar-dev:
  tests: mvcc: Add test for partition_snapshot_row_cursor
  tests: row_cache: Add test for concurrent population
  tests: row_cache: Make populate_range() accept partition_range
  tests: Add simple_schema::make_ckey_range()
  cache_streamed_mutation: Add missing _next_row.maybe_refresh() call
  mvcc: partition_snapshot_row_cursor: Fix cursor skipping over rows added after its position
  mvcc: partition_snapshot_row_cursor: Rename up_to_date() to iterators_valid()
  mvcc: Keep track of all iterators in partition_snapshot_row_cursor
  mvcc: Make partition_snapshot_row_cursor printable

(cherry picked from commit af1976bc30)

[tgrabiec: resolved conflicts]
2017-09-26 19:18:29 +02:00
Tomasz Grabiec
454b90980a streamed_mutation: Allow setting buffer capacity
Needed in tests to limit the amount of prefetching done by readers, so
that it's easier to test the interleaving of various events.

(cherry picked from commit cb16b038ef)
2017-09-26 19:18:29 +02:00
Tomasz Grabiec
13c66b7145 Update seastar submodule
Fixes #2738.

* seastar e380a07...c853473 (1):
  > httpd: handle exception when shutting down
2017-09-26 18:36:50 +02:00
Asias He
df04418fa4 gossip: Print SCHEMA_TABLES_VERSION correctly
Found this when debugging gossip with debug print. The application state
SCHEMA_TABLES_VERSION was printed as UNKNOWN.
Message-Id: <d7616920d2e6516b5470a758bcf9c88f3d857381.1506391495.git.asias@scylladb.com>

(cherry picked from commit 98e9049820)
2017-09-26 08:39:30 +02:00
Tomasz Grabiec
6e2858a47d storage_service: Register features before joining
Since commit 8378fe190, we disable schema sync in a mixed cluster.
The detection is done using gossiper features. We need to make sure
the features are registered, and thus can be enabled, before the
bootstrapping of a non-seed node happens. Otherwise the bootstrap will
hang waiting on a schema sync which will never happen.
Message-Id: <1505893837-27876-2-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 8e46d15f91)
2017-09-25 09:40:22 +01:00
Tomasz Grabiec
0f79503cf1 storage_service: Extract register_features()
Message-Id: <1505893837-27876-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit b92dcb0284)
2017-09-25 09:40:22 +01:00
Tomasz Grabiec
056f6df859 Update seastar submodule
* seastar 06790c0...e380a07 (1):
  > configure: disable exception scalability hack on debug build
2017-09-25 10:13:01 +02:00
Avi Kivity
7833129ab4 Merge "row_cache: Call fast_forward_to() outside allocating section" from Tomasz
"On bad_alloc the section is retried. If the exception happened inside
fast_forward_to() on the underlying reader, that call will be
retried. However, the reader should not be used after an exception is
thrown, since it is in an unspecified state. Also, calling
fast_forward_to() with the cache region locked increases the chances of
it failing to allocate.

We shouldn't call fast_forward_to() with the cache region locked.

Fixes #2791."

* 'tgrabiec/dont-ffwd-in-alloc-section' of github.com:scylladb/seastar-dev:
  cache_streamed_mutation: De-futurize cursor movement
  cache_streamed_mutation: Call fast_forward_to() outside allocating section
  cache_streamed_mutation: Switch from flags to explicit state machine

(cherry picked from commit 5b0cb28af9)

[tgrabiec: resolved minor conflicts]
2017-09-20 10:23:08 +02:00
Asias He
c3c5ec1d4a gossip: Fix indentation in apply_state_locally
Message-Id: <2bdefa8d982ad8da7452b41e894f41d865b83b0b.1505356245.git.asias@scylladb.com>
(cherry picked from commit 5ff0b113c9)
2017-09-19 22:59:46 +08:00
Asias He
7839cebc6c gossip: Use boost::copy_range in apply_state_locally
boost::copy_range is better because the vector is allocated with the
correct size instead of growing when the inserter is called.

[avi: also crashes less]

Message-Id: <b19ca92d56ad070fca1e848daa67c00c024e3a4d.1505291199.git.asias@scylladb.com>
(cherry picked from commit c84dcabb8f)
2017-09-19 22:59:46 +08:00
Pekka Enberg
e428d06f40 Merge "gossip: optimize apply_state_locally for large cluster" from Asias
"This series tries to improve the bootstrap of a node in a large cluster by
improving how gossip applies the gossip node state. In #2404, the joining node
failed to bootstrap because it did not see the seed node when
storage_service::bootstrap ran. After this series, we apply the whole gossip
state contained in a gossip ack/ack2 message before applying the next one,
and we apply the state of the seed node earlier than that of non-seed nodes so
we can have the seed node's state faster. We also add some randomness to the
order of applying gossip node state to prevent some nodes' state from always
being applied earlier than the others.

This series improves apply_state_locally for large cluster:

 - Tune the order of applying endpoint_state
 - Serialize apply_state_locally
 - Avoid copying of the gossip state map

Fixes #2404"

* tag 'asias/gossip_issue_2404_v2' of github.com:scylladb/seastar-dev:
  gossip: Avoid copying with apply_state_locally
  gossip: Serialize apply_state_locally
  gossip: Tune the order of applying endpoint_state in apply_state_locally
  gossip: Introduce is_seed helper
  gossip: Pass const endpoint_state& in notify_failure_detector
  gossip: Pass reference in notify_failure_detector

(cherry picked from commit d2632ddf1d)
2017-09-19 22:59:45 +08:00
Asias He
6834ba16a3 gossip: Do not wait for echo message in mark_alive
gossiper::apply_state_locally() calls handle_major_state_change() for
each endpoint, in a seastar thread, which calls mark_alive() for new
nodes, which calls ms().send_gossip_echo(id).get(). So it synchronously
waits for each node to respond before it moves on to the next entry. As
a result it may take a while before the whole state is processed.

Apache (tm) Cassandra (tm) sends echoes in the background.

In a large cluster, we see that by the time the joining node starts
streaming, it hasn't managed to apply all the endpoint_state for peer
nodes, so the joining node does not know some of the nodes yet, which
results in the joining node ignoring some of the existing nodes when
streaming.

Fixes #2787
Fixes #2797

Message-Id: <3760da2bef1a83f1b6a27702a67ca4170e74b92c.1505719669.git.asias@scylladb.com>
(cherry picked from commit 8f8273969d)
2017-09-19 17:12:26 +03:00
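The change described above, not blocking on each echo reply before handling the next endpoint, can be sketched in plain C++ (the function and endpoint representation are illustrative assumptions, not Scylla code):

```cpp
#include <future>
#include <vector>

// Sketch: apply each endpoint's state immediately and launch the echo
// asynchronously, instead of calling .get() on every echo before moving on.
int apply_states(const std::vector<int>& endpoints) {
    std::vector<std::future<bool>> echoes;
    int applied = 0;
    for (int ep : endpoints) {
        // Fire the echo in the background; the "mark alive" completion
        // happens when the reply arrives, off the state-application path.
        echoes.push_back(std::async(std::launch::async, [ep] { return ep >= 0; }));
        ++applied;  // state application is no longer blocked on the echo
    }
    for (auto& f : echoes) {
        f.get();  // replies are reaped later, not one-per-endpoint inline
    }
    return applied;
}
```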
Shlomi Livne
0b49cfcf12 release: prepare for 2.0.rc5
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-09-17 17:05:53 +03:00
Duarte Nunes
5e8c9a369e Merge 'Fix schema version mismatch during rolling upgrade from 1.7' from Tomasz
"When there are at least 2 nodes upgraded to 2.0, and the two exchanged schema
for some reason, reads or writes which involve both 1.7 and 2.0 nodes may
start to fail with the following error logged:

    storage_proxy - Exception when communicating with 127.0.0.3: Failed to load schema version 58fc9b89-74ab-37ca-8640-8b38a1204f8d

The situation should heal after the whole cluster is upgraded.

Table schema versions are calculated by 2.0 nodes differently than by 1.7
nodes due to a change in the schema tables format. The mismatch is meant to be
avoided by having 2.0 nodes calculate the old digest on schema migration
during upgrade, and use that version until the next time the table is altered.
It is thus not allowed to alter tables during the rolling upgrade.

Two 2.0 nodes may exchange schema if they detect through gossip that their
schema versions don't match. They may not match temporarily during boot, until
the upgraded node completes the bootstrap and propagates its new schema
through gossip. One source of such a temporary mismatch is the construction of
new tracing tables, which didn't exist on 1.7. Such a schema pull results in a
schema merge, which causes all tables to be altered and their schema versions
to be recalculated. The new schema will not match the one used by 1.7 nodes,
causing reads and writes to fail, because schema requests won't work during a
rolling upgrade from 1.7 to 2.0.

The main fix employed here is to hold schema pulls, even among 2.0 nodes,
until rolling upgrade is complete."

Fixes #2802.

* 'tgrabiec/fix-schema-mismatch' of github.com:scylladb/seastar-dev:
  tests: schema_change_test: Add test_merging_does_not_alter_tables_which_didnt_change test case
  tests: cql_test_env: Enable all features in tests
  schema_tables: Make make_scylla_tables_mutation() visible
  migration_manager: Disable pulls during rolling upgrade from 1.7
  storage_service: Introduce SCHEMA_TABLES_V3 feature
  schema_tables: Don't alter tables which differ only in version
  schema_mutations: Use mutation_opt instead of stdx::optional<mutation>

(cherry picked from commit 8378fe190a)
2017-09-15 12:07:56 +02:00
Avi Kivity
8567762339 Merge "Refuse to load non-Scylla counter sstables" from Paweł
"These patches make Scylla refuse to load counter sstables that may
contain unsupported counter shards. They are recognised by the lack of
the Scylla component.

Fixes #2766."

* tag 'reject-non-scylla-counter-sstables/v1' of https://github.com/pdziepak/scylla:
  db: reject non-Scylla counter sstables in flush_upload_dir
  db: disallow loading non-Scylla counter sstables
  sstable: add has_scylla_component()

(cherry picked from commit fe019ad84d)
2017-09-11 13:29:32 +03:00
Avi Kivity
f698496ab2 Merge "Fix Scylla upgrades when counters are used" from Paweł
"Scylla 1.7.4 and older use incorrect ordering of counter shards, this
was fixed in 0d87f3dd7d ("utils::UUID:
operator< should behave as comparison of hex strings/bytes"). However,
that patch was not backported to the 1.7 branch until very recently. This
means that versions 1.7.4 and older emit counter shards in an incorrect
order and expect them to be so. This is particularly bad when dealing
with imported correct sstables, in which case some shards may become
duplicated.

The solution implemented in this patch is to allow any order of counter
shards and automatically merge all duplicates. The code is written in
such a way that the correct ordering is expected in the fast path, in
order not to excessively punish unaffected deployments.

A new feature flag, CORRECT_COUNTER_ORDER, is introduced to allow a
seamless upgrade from 1.7.4 to later Scylla versions. If that feature is
not available, Scylla still writes sstables and sends on-wire counters
using the old ordering so that they can be correctly understood by 1.7.4;
once the flag becomes available, Scylla switches to the correct order.

Fixes #2752."

* tag 'fix-upgrade-with-counters/v2' of https://github.com/pdziepak/scylla:
  tests/counter: verify counter_id ordering
  counter: check that utils::UUID uses int64_t
  mutation_partition_serializer: use old counter ordering if necessary
  mutation_partition_view: do not expect counter shards to be sorted
  sstables: write counter shards in the order expected by the cluster
  tests/sstables: add storage_service_for_tests to counter write test
  tests/sstables: add test for reading wrong-order counter cells
  sstables: do not expect counter shards to be sorted
  storage_service: introduce CORRECT_COUNTER_ORDER feature
  tests/counter: test 1.7.4 compatible shard ordering
  counters: add helper for retrieving shards in 1.7.4 order
  tests/counter: add tests for 1.7.4 counter shard order
  counters: add counter id comparator compatible with Scylla 1.7.4
  tests/counter: verify order of counter shards
  tests/counter: add test for sorting and deduplicating shards
  counters: add function for sorting and deduplicating counter cells
  counters: add counter_id::operator>

(cherry picked from commit 31706ba989)
2017-09-05 14:25:36 +03:00
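The "sorting and deduplicating counter cells" step listed above can be sketched like this (the shard struct and the keep-highest-clock merge rule are illustrative assumptions, not Scylla's actual counter representation):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Hypothetical counter shard: shards with the same id are duplicates.
struct shard {
    uint64_t id;
    int64_t clock;
    int64_t value;
};

// Sort shards by id (highest clock first within an id), then drop
// duplicates so only the newest shard per id survives.
std::vector<shard> sort_and_dedup(std::vector<shard> shards) {
    std::sort(shards.begin(), shards.end(), [](const shard& a, const shard& b) {
        return a.id != b.id ? a.id < b.id : a.clock > b.clock;
    });
    auto last = std::unique(shards.begin(), shards.end(),
        [](const shard& a, const shard& b) { return a.id == b.id; });
    shards.erase(last, shards.end());
    return shards;
}
```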
Shlomi Livne
6e6de348ea release: prepare for 2.0.rc4
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-09-03 14:58:01 +03:00
Avi Kivity
117db58531 Update AMI submodule
* dist/ami/files/scylla-ami b41e5eb...5ffa449 (3):
  > amzn-main.repo: stick to Amazon Linux 2017.03 kernel (4.9.x)
  > Prevent dependency error on 'yum update'
  > scylla_create_devices: don't raise error when no disks found

Fixes #2751.

Still tracking master branch.
2017-08-31 15:15:42 +03:00
Vlad Zolotarov
086f8b7af2 service::storage_service: initialize auth and tracing after we joined the ring
Initialize the system_auth and system_traces keyspaces and their tables after
the node joins the token ring, because as part of system_auth initialization
SELECT and possibly INSERT CQL statements are going to be issued.

This patch effectively reverts the d3b8b67 patch and restores the
initialization order to what it was before that patch.

Fixes #2273

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1500417217-16677-1-git-send-email-vladz@scylladb.com>
(cherry picked from commit e98adb13d5)
2017-08-30 09:33:05 +02:00
Avi Kivity
bee9fbe3fc Merge "Fix sstable reader not working for empty set of clustering ranges" from Tomasz
"Fixes #2734."

* 'tgrabiec/make-sstable-reader-work-with-empty-range-set' of github.com:scylladb/seastar-dev:
  tests: Introduce clustering_ranges_walker_test
  tests: simple_schema: Add missing include
  sstables: reader: Make clustering_ranges_walker work with empty range set
  clustering_ranges_walker: Make adjacency more accurate

(cherry picked from commit 5224ab9c92)
2017-08-29 15:54:58 +02:00
Tomer Sandler
b307a36f1e node_health_check: Various updates
- Removed text from the Report's "PURPOSE" section, which referred to the "MANUAL CHECK LIST" (no longer needed).
- Removed the curl command (no longer using the api_address); using scylla --version instead
- Added the -v flag to the iptables command, for more verbosity
- Added support for OEL (Oracle Enterprise Linux) - minor fix
- Some text changes - minor
- OEL support indentation fix + collecting all files under /etc/scylla
- Added line separation under the cp output message

Signed-off-by: Tomer Sandler <tomer@scylladb.com>
Message-Id: <20170828131429.4212-1-tomer@scylladb.com>
(cherry picked from commit f1eb6a8de3)
2017-08-29 15:17:16 +03:00
Tomer Sandler
527e12c432 node_health_check: added line separation under cp output message
Signed-off-by: Tomer Sandler <tomer@scylladb.com>
Message-Id: <20170828124307.2564-1-tomer@scylladb.com>
(cherry picked from commit 83f249c15d)
2017-08-29 15:17:07 +03:00
Shlomi Livne
c57cc55aa6 release: prepare for 2.0.rc3
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-08-28 14:57:54 +03:00
Avi Kivity
5d3c015d27 Merge "Fixes for skipping in sstable reader" from Tomasz
Ref #2733.

* 'tgrabiec/fix-fast-forwarding' of github.com:scylladb/seastar-dev:
  tests: mutation_source_test: Add more tests for fast forwarding across partitions
  sstables: Fix abort in mutation reader for certain skip pattern
  sstables: Fix reader returning partition past the query range in some cases
  sstables: Introduce data_consume_context::eof()

(cherry picked from commit 4e67bc9573)
2017-08-28 12:48:50 +03:00
Avi Kivity
428831b16a Merge "consider the pre-existing cpuset.conf when configuring networking mode" from Vlad
"Preserve the networking configuration mode during the upgrade by generating the /etc/scylla.d/perftune.yaml
file and using it."

Fixes #2725.

* 'dist_respect_cpuset_conf-v3' of https://github.com/vladzcloudius/scylla:
  scylla_prepare: respect the cpuset.conf when configuring the networking
  scylla_cpuset_setup: rm perftune.yaml
  scylla_cpuset_setup: add a missing "include" of scylla_lib.sh

(cherry picked from commit 40aeb00151)
2017-08-24 18:59:37 +03:00
Paweł Dziepak
918339cf2e mvcc: allow invoking maybe_merge_versions() inside allocating section
Message-Id: <20170823083544.4225-1-pdziepak@scylladb.com>
(cherry picked from commit 1006a946e8)
2017-08-24 14:31:00 +02:00
Paweł Dziepak
6c846632e4 abstract_read_executor: make make_requests() exception safe
Message-Id: <20170821162934.25386-5-pdziepak@scylladb.com>
(cherry picked from commit 9d82a1ebfd)
2017-08-24 14:29:32 +02:00
Paweł Dziepak
af7b7f1eff shared_index_lists: restore indentation
Message-Id: <20170821162934.25386-4-pdziepak@scylladb.com>
(cherry picked from commit 31afc2f242)
2017-08-24 14:28:49 +02:00
Paweł Dziepak
701128f8a1 sstables: make shared_index_lists::get_or_load exception safe
Message-Id: <20170821162934.25386-3-pdziepak@scylladb.com>
(cherry picked from commit 93eaa95378)
2017-08-24 14:28:49 +02:00
Avi Kivity
c03118fbe9 Update seastar submodule
* seastar 2993cae...06790c0 (3):
  > scripts: posix_net_conf.sh: allow passing a perftune.py configuration file as a parameter
  > scripts: perftune.py: add the possibility to pass the parameters in a configuration file and print the YAML file with the current configuration
  > scripts: perftune.py: actually use the number of Rx queues when comparing to the number of CPU threads
2017-08-24 11:37:38 +03:00
Piotr Jastrzebski
a98e3aec45 Make streamed_mutation more exception safe
Make sure that push_mutation_fragment leaves
_buffer_size with a correct value if an exception
is thrown from emplace_back.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Message-Id: <83398412aa78332d88d91336b79140aecc988602.1503474403.git.piotr@scylladb.com>
(cherry picked from commit 477068d2c3)
2017-08-23 19:10:49 +03:00
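The exception-safety pattern described above, keeping the size accounting consistent when the container insertion throws, looks roughly like this (simplified hypothetical types, not the actual streamed_mutation code):

```cpp
#include <cstddef>
#include <vector>

// Toy buffer: the memory counter is updated only after the insertion
// succeeds, so a throwing push leaves the accounting untouched.
struct buffer {
    std::vector<int> frags;
    std::size_t buffer_size = 0;

    void push(int frag, std::size_t mem) {
        frags.push_back(frag);  // may throw; buffer_size untouched if it does
        buffer_size += mem;     // only reached on success
    }
};
```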
Avi Kivity
29baf7966c Merge "repair: Do not allow repair until node is in NORMAL status" from Asias
Fixes #2723.

* tag 'asias/repair_issue_2723_v1' of github.com:cloudius-systems/seastar-dev:
  repair: Do not allow repair until node is in NORMAL status
  gossip: Add is_normal helper

(cherry picked from commit 2f41ed8493)
2017-08-23 09:45:34 +03:00
Amnon Heiman
ba63f74d7e Add configuration to disable per keyspace and column family metrics
The number of keyspace and column family metrics reported is
proportional to the number of shards times the number of keyspace/column
families.

This can cause a performance issue both on the reporting system and on
the collecting system.

This patch adds a configuration flag (set to false by default) to enable
or disable those metrics.

Fixes #2701

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <20170821113843.1036-1-amnon@scylladb.com>
(cherry picked from commit abbd78367c)
2017-08-22 19:20:50 +03:00
Alexys Jacob
1733f092ef dist: Fix Gentoo Linux scylla-jmx and scylla-tools packages detection
These two admin related packages will be packaged under the "app-admin"
category and not the "dev-db" one.

This fixes the detection path of the packages for scylla_setup.

Signed-off-by: Alexys Jacob <ultrabug@gentoo.org>
Message-Id: <20170817094756.21550-1-ultrabug@gentoo.org>
(cherry picked from commit e5ff8efea3)
2017-08-17 15:44:02 +03:00
Paweł Dziepak
179ff956ee sstables: initialise index metrics on all shards
Fixes #2702.

Message-Id: <20170816085454.21554-1-pdziepak@scylladb.com>
(cherry picked from commit 784dcbf1ca)
2017-08-16 15:44:58 +03:00
Avi Kivity
61c2e8c7e2 Update seastar submodule
* seastar d67c344...2993cae (1):
  > fstream: do not ignore unresolved future

Fixes #2697.
2017-08-16 15:11:10 +03:00
Shlomi Livne
8370e1bc2c release: prepare for 2.0.rc2
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-08-15 15:52:35 +03:00
Pekka Enberg
3a244a4734 docker: Switch to Scylla 2.0 RPM repository 2017-08-15 13:27:41 +03:00
Avi Kivity
3261c927d2 Update seastar submodule
* seastar 2383d60...d67c344 (1):
  > Merge "Fix crash in rpc due to access to already destroyed server socket" from Gleb

Fixes #2690
2017-08-14 16:24:05 +03:00
Avi Kivity
4577a89982 Update seastar submodule
* seastar cfe280c...2383d60 (1):
  > tls: Only recurse once in shutdown code

Fixes #2691
2017-08-14 15:10:27 +03:00
Avi Kivity
6ea306f898 Update seastar submodule
* seastar b9f4568...cfe280c (1):
  > scripts: perftune.py: change the network module mode auto selection heuristic
2017-08-14 10:30:42 +03:00
Avi Kivity
2afcc684b4 Update seastar submodule
* seastar 867b7c7...b9f4568 (4):
  > http: removed unneeded lambda captures
  > Merge "Prometheus to use output stream" from Amnon
  > http_test: Fix an http output stream test
  > Merge "Add output stream to http message reply" from Amnon

Fixes #2475
2017-08-10 12:05:14 +03:00
Avi Kivity
bdb8c861c7 Fork seastar submodule for 2.0 2017-08-10 12:00:31 +03:00
Takuya ASADA
ea933b4306 dist/debian: append postfix '~DISTRIBUTION' to scylla package version
We are moving to aptly to release .deb packages, which requires debian repository
structure changes.
After the change, we will share the 'pool' directory between distributions.
However, our .deb package name for a specific release is exactly the same
across distributions, so we have file name conflicts.
To avoid the problem, we need to append the distribution name to the package version.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1502312935-22348-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 8e115d69a9)
2017-08-10 10:54:17 +03:00
Raphael S. Carvalho
19391cff14 sstables: close index file when sstable writer fails
The index file's output stream uses write-behind, but it is not closed
when an sstable write fails, and that may lead to a crash.
The same thing happened before for the data file (for which it is
obviously easier to reproduce) and was fixed by 0977f4fdf8.

Fixes #2673.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170807171146.10243-1-raphaelsc@scylladb.com>
(cherry picked from commit dddbd34b52)
2017-08-08 09:53:36 +03:00
Glauber Costa
87d9a4f1f1 add active streaming reads metric
In commit f38e4ff3f, we separated streaming reads from normal reads
for the purpose of determining the maximum number of reads in flight.
However, that leaves us totally unaware of how many reads are
happening on behalf of streaming, and that can be important information
when debugging issues.

This patch adds this metric so we don't fly blind.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <1501909973-32519-1-git-send-email-glauber@scylladb.com>
(cherry picked from commit 4a911879a3)
2017-08-05 11:07:18 +03:00
Pekka Enberg
c0f894ccef docker: Disable stall detector
Fixes #2162

Message-Id: <1501759957-4380-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit 90872ffa1f)
2017-08-03 14:53:03 +03:00
Takuya ASADA
2c892488ef dist/debian: check scylla user/group existence before adding them
To prevent the install from failing on environments which already have the
scylla user/group, an existence check is needed.

Fixes #2389

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1495023805-14905-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 91ade1a660)
2017-08-03 13:01:30 +03:00
Avi Kivity
911608e9c4 database: prevent streaming reads from blocking normal reads
Streaming reads and normal reads share a semaphore, so if a bunch of
streaming reads use all available slots, no normal reads can proceed.

Fix by assigning streaming reads their own semaphore; they will compete
with normal reads once issued, and the I/O scheduler will determine the
winner.

Fixes #2663.
Message-Id: <20170802153107.939-1-avi@scylladb.com>

(cherry picked from commit f38e4ff3f9)
2017-08-03 12:27:48 +03:00
Avi Kivity
d8ab07de37 database: remove streaming read queue length limit
If we fail a streaming read due to queue overload, we will fail the entire repair.
Remove the limit for streaming, and trust the caller (repair) to have bounded
concurrency.

Fixes #2659.
Message-Id: <20170802143448.28311-1-avi@scylladb.com>

(cherry picked from commit 911536960a)
2017-08-03 12:27:46 +03:00
Duarte Nunes
15eefbc434 tests/sstable_mutation_test: Don't use moved-from object
Fix a bug introduced in dbbb9e93d and exposed by gcc6 by not using a
moved-from object. Twice.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170802161033.4213-1-duarte@scylladb.com>
(cherry picked from commit 4c9206ba2f)
2017-08-03 09:46:18 +03:00
Vlad Zolotarov
4bb6ba6d58 utils::loading_cache: cancel the timer after closing the gate
The timer is armed inside the section guarded by the _timer_reads_gate;
therefore it has to be canceled after the gate is closed.

Otherwise we may end up with an armed timer after the stop() method has
returned a ready future.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1501603059-32515-1-git-send-email-vladz@scylladb.com>
(cherry picked from commit 4b28ea216d)
2017-08-01 17:23:53 +01:00
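A minimal sketch of the ordering this commit enforces (toy gate/timer types, not Seastar's actual gate and timer): close the gate first so the timer callback can no longer run and re-arm, then cancel the timer.

```cpp
// Toy stand-ins for the gate/timer pair. In the real code the timer
// callback enters the gate, so closing the gate first guarantees no
// callback can re-arm the timer after cancel().
struct timer {
    bool armed = false;
    void cancel() { armed = false; }
};

struct gate {
    bool closed = false;
    void close() { closed = true; }
};

void stop(gate& g, timer& t) {
    g.close();   // no new timer callbacks can enter past this point
    t.cancel();  // now safe: the timer cannot be re-armed afterwards
}
```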
Avi Kivity
6dd4a9a5a2 Merge "Ensure correct EOC for PI block cell names" from Duarte
"This series ensures that we always write correct cell names to promoted
index cell blocks, taking into account the eoc of range tombstones.

Fixes #2333"

* 'pi-cell-name/v1' of github.com:duarten/scylla:
  tests/sstable_mutation_test: Test promoted index blocks are monotonic
  sstables: Consider eoc when flushing pi block
  sstables: Extract out converting bound_kind to eoc

(cherry picked from commit db7329b1cb)
2017-08-01 18:09:54 +03:00
Gleb Natapov
222e85d502 cql transport: run accept loop in the foreground
It was meant to be run in the foreground, since it is waited upon during
stop(), but as it stands, from the stop() perspective it completes
after the first connection is accepted.

Fixes #2652

Message-Id: <20170801125558.GS20001@scylladb.com>
(cherry picked from commit 1da4d5c5ee)
2017-08-01 17:06:08 +03:00
Takuya ASADA
8d4a30e852 dist/ami: follow scylla-tools package name change on RedHat variants
Since scylla-tools generates two .rpm packages, we need to copy them to our AMI.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20170722090002.9850-1-syuu@scylladb.com>
(cherry picked from commit a998b7b3eb)
2017-07-31 18:57:29 +03:00
Avi Kivity
4710ee229d Merge "Reduce the effect of the latency metrics" from Amnon
"This series reduces that effect in two ways:
1. Remove the latency counters from the system keyspaces
2. Reduce the histogram size by limiting the maximum number of buckets and
   stop the last bucket."

Fixes #2650.

* 'amnon/remove_cf_latency_v2' of github.com:cloudius-systems/seastar-dev:
  database: remove latency from the system table
  estimated histogram: return a smaller histogram

(cherry picked from commit 3fe6731436)
2017-07-31 16:01:05 +03:00
Vlad Zolotarov
93cb78f21d utils::loading_cache: add stop() method
loading_cache arms a timer that may issue asynchronous operations
(queries) that end up writing to its internal fields.

We have to ensure that these operations are over before we can destroy
the loading_cache object.

Fixes #2624

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1501208345-3687-1-git-send-email-vladz@scylladb.com>
2017-07-31 15:55:46 +03:00
Avi Kivity
af8151c4b7 Update scylla-ami submodule
* dist/ami/files/scylla-ami 2bd1481...b41e5eb (1):
  > Fix incorrect scylla-server sysconfig file edit for i3 memflush controller
2017-07-31 09:41:56 +03:00
Takuya ASADA
846d9da9c2 dist/debian: refuse upgrade if current scylla < 1.7.3 && commitlog remains
Commitlog replay fails when upgrading from <1.7.3 to 2.0, so we need to refuse
to upgrade the package if the current scylla is < 1.7.3 and a commitlog remains.

Note: We have the problem on the scylla-server package, but to prevent the
scylla-conf package from upgrading, %pretrans should be defined on scylla-conf.

Fixes #2551

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1501187555-4629-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 714540cd4c)
2017-07-31 09:18:58 +03:00
Paweł Dziepak
aaa59d3437 streamed_mutation: do not call fill_buffer() ahead of time
consume_mutation_fragments_until() allows consuming mutation fragments
until a specified condition happens. This patch reorganises its
implementation so that we avoid situations where fill_buffer() is called
with the stop condition being true.
Message-Id: <20170727122218.7703-1-pdziepak@scylladb.com>

(cherry picked from commit f02bef7917)
2017-07-27 17:48:26 +02:00
Tomasz Grabiec
66dd817582 mutation_partition: Always mark static row as continuous when no static columns
To avoid unnecessary cache misses after static columns are added.

Message-Id: <1500650057-26036-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 136d205855)
2017-07-27 14:59:33 +02:00
Tomasz Grabiec
5857a4756d Merge "Some fixes for performance regressions in perf_fast_forward" from Paweł
These patches contain some minor fixes for the performance regressions reported
by perf_fast_forward after the partial cache was merged. The solution is still
far from perfect; one case still has a 30% degradation, but there is some
improvement, so there is no reason to hold these changes back.

Refs #2582.

Some numbers:
before - before cache changes were merged
(555621b537)

cache - at the commit that introduced the partial cache
(9b21a9bfb6)

after - recent master + this series
(based on e988121dbb)

Differences are shown relative to "before".

Testing effectiveness of caching of large partition, single-key slicing reads:
Large partitions, range [0, 500000], populating cache
  before      cache      after
 1636840    1013688    1234606
              -38%        -25%

Large partitions, range [0, 500000], reading from cache
  before      cache      after
 2012615    3076812    3035423
               +53%       +51%

Testing scanning small partitions with skips.
reading small partitions (skip 0)
 before      cache      after
 227060     165261     200639
              -27%       -11%

skipping small partitions (skip 1)
 before      cache      after
  29813      27312      38210
               -8%       +28%

Testing slicing small partitions:
slicing small partitions (offset 0, read 4096)
 before      cache      after
 195282     149695     180497
              -23%        -8%

* https://github.com/pdziepak/scylla.git perf_fast_forward-regression/v3:
  sstables: make sure that fill_buffer() actually fills buffer
  mutation_merger: improve handling of non-deferring fill_buffer()s
  partition_snapshot_row_cursor: avoid apply() in single-version cases
  sstables: introduce decorated_key_view
  ring_position_comparator: accept sstables::decorated_key_view
  sstable: keep a pre-computed token in summary_entry
  sstables: cache token in index entries
  index_reader: advance_and_check_if_present() use index_comparator
  ring_position_comparator: drop unused overloads
  cache_streamed_mutation: avoid moving clustering_row
  streamed_mutation: introduce consume_mutation_fragments_until()
  cache_streamed_mutation: use consumer based read_context reader
  rows_entry: make position() inlineable
  mutation_fragment: make destructor always_inline
  keys: introduce compound_wrapper::from_exploded_view()
  sstables: avoid copying key components
  compound_compat: explode: reserve some elements in a vector
  cache: short-circut static row logic if there are no static columns
  cache: use equality comparators instead of tri_compare
  sstables: avoid indirect calls to abstract_type::is_multi_cell()

(cherry picked from commit e9fc0b0491)
2017-07-27 13:58:23 +02:00
Takuya ASADA
f199047601 dist/redhat: limit metapackage dependencies to specific version of scylla packages
When we install the scylla metapackage with a version (e.g. scylla-1.7.1),
it always installs the newest scylla-server/-jmx/-tools from the repo
instead of the specified version of those packages.

To make the metapackage install matching versions, limit its dependencies to
the current package version.

Fixes #2642

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20170726193321.7399-1-syuu@scylladb.com>
(cherry picked from commit 91a75f141b)
2017-07-27 14:21:55 +03:00
Tomasz Grabiec
a8dcbb6bd0 row_cache: Fix potential timeout or deadlock due to sstable read concurrency limit
database::make_sstable_reader() creates a reader which will need to
obtain a semaphore permit when invoked. Therefore, each read may
create at most one such reader in order to be guaranteed to make
progress. If the reader tries to create another reader, that may
deadlock (or, for non-system tables, time out) if a large enough number
of such readers try to do the same thing at the same time.

Avoid the problem by dropping previous reader before creating a new
one.
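The invariant above can be sketched with a toy permit pool (names and types here are illustrative, not Scylla's actual reader/semaphore API):

```cpp
#include <cassert>
#include <memory>

// Toy permit pool standing in for the sstable read semaphore.
struct permit_pool {
    int available;
    bool acquire() {
        if (available == 0) {
            return false;  // in the real system: wait, and possibly deadlock/timeout
        }
        --available;
        return true;
    }
    void release() { ++available; }
};

// A reader holds exactly one permit for its lifetime.
struct reader {
    permit_pool* pool;
    explicit reader(permit_pool* p) : pool(p) {}
    ~reader() { pool->release(); }
};

std::unique_ptr<reader> make_reader(permit_pool& pool) {
    if (!pool.acquire()) {
        return nullptr;  // all permits held by other reads
    }
    return std::make_unique<reader>(&pool);
}

// The fix: drop the previous reader (returning its permit) before
// creating the new one, so a read never needs two permits at once.
std::unique_ptr<reader> recreate(std::unique_ptr<reader> old, permit_pool& pool) {
    old.reset();
    return make_reader(pool);
}
```

If every read instead held its old reader while creating a new one, each read would need two permits at once, and enough concurrent reads could exhaust the pool with nobody able to make progress.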

Refs #2644.

Message-Id: <1501152454-4866-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 22948238b6)
2017-07-27 13:58:40 +03:00
Duarte Nunes
bfd99d4e74 db/schema_tables: Drop dropped columns when dropping tables
Fixes #2633

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170726150228.2593-2-duarte@scylladb.com>
(cherry picked from commit 50ad0003c6)
2017-07-26 18:48:59 +02:00
Duarte Nunes
d40df89271 db/schema_tables: Store column_name in text form
As does Cassandra.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170726150228.2593-1-duarte@scylladb.com>
(cherry picked from commit 3425403126)
2017-07-26 18:48:58 +02:00
Duarte Nunes
3da54ffff0 schema_builder: Replace type when re-dropping column
Fixes #2634

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170725183933.5311-1-duarte@scylladb.com>
(cherry picked from commit e988121dbb)
2017-07-26 16:26:59 +02:00
Duarte Nunes
804793e291 tests/schema_change_test: Add test case for add+drop notification
Reproduces #2616

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170725170622.4380-2-duarte@scylladb.com>
(cherry picked from commit 472f32fb06)
2017-07-26 16:26:59 +02:00
Duarte Nunes
83ea9b6fc0 db/schema_tables: Consider differing dropped columns
If a node is notified of a schema change in which the schema's dropped
columns have changed, that node will miss the changes to the dropped
columns. A scenario where this can happen is one where a column c is
dropped, then added back with a different type, and then dropped again, with
a node n having seen the first drop and being notified of the
subsequent add and drop.

Fixes #2616

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170725170622.4380-1-duarte@scylladb.com>
(cherry picked from commit 33e18a1779)
2017-07-26 16:26:59 +02:00
Asias He
b45855fc1c gossip: Fix nr_live_nodes calculation
We need to consider the _live_endpoints size. The nr_live_nodes should
not be larger than the _live_endpoints size; otherwise the loop that
collects live nodes can run forever.

It is a regression introduced in commit 437899909d
(gossip: Talk to more live nodes in each gossip round).
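A minimal sketch of the clamp (helper name and types are illustrative, not the gossiper's actual code):

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <set>
#include <string>
#include <vector>

// Pick up to `wanted` live nodes. Clamping to the size of the live
// endpoint set is what guarantees the collection loop terminates:
// without it, asking for more distinct nodes than are alive would keep
// looking for a node that does not exist.
std::vector<std::string> pick_live_nodes(const std::set<std::string>& live_endpoints,
                                         std::size_t wanted) {
    std::size_t nr_live_nodes = std::min(wanted, live_endpoints.size());
    std::vector<std::string> picked;
    for (const auto& ep : live_endpoints) {
        if (picked.size() == nr_live_nodes) {
            break;
        }
        picked.push_back(ep);
    }
    return picked;
}
```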

Fixes #2637

Message-Id: <863ec3890647038ae1dfcffc73dde0163e29db20.1501026478.git.asias@scylladb.com>
(cherry picked from commit 515a744303)
2017-07-26 16:48:51 +03:00
Duarte Nunes
3900babff2 schema: Remove unnecessary print
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170725174000.71061-1-duarte@scylladb.com>
(cherry picked from commit 9c831b4e97)
2017-07-26 16:07:41 +03:00
Tomasz Grabiec
7c805187a9 Merge fixes related to row cache from Raphael
* git@github.com:raphaelsc/scylla.git row_cache_fixes:
  db: atomically synchronize cache with changes to the snapshot
  db: refresh row cache's underlying data source after compaction

(cherry picked from commit 18be42f71a)
2017-07-25 15:37:40 +02:00
Paweł Dziepak
345a91d55d tests/row_cache: test queries with no clustering ranges
Reproducer for #2604.
Message-Id: <20170725131220.17467-3-pdziepak@scylladb.com>

(cherry picked from commit 79a1ad7a37)
2017-07-25 15:37:32 +02:00
Paweł Dziepak
fda8b35cda tests: do not overload the meaning of empty clustering range
Empty clustering key range is perfectly valid and signifies that the
reader is not interested in anything but the static row. Let's not
make it mean anything else.
Message-Id: <20170725131220.17467-2-pdziepak@scylladb.com>

(cherry picked from commit 1ea507d6ae)
2017-07-25 15:37:29 +02:00
Paweł Dziepak
08ac0f1100 cache: fix aborts if no clustering range is specified
cache_streamed_mutation assumed that at least one clustering range was
specified. That was wrong since the readers are allowed to query just
for a static row (e.g. counter update that modifies only static
columns).

Fixes #2604.
Message-Id: <20170725131220.17467-1-pdziepak@scylladb.com>

(cherry picked from commit 6572f38450)
2017-07-25 15:37:28 +02:00
Calle Wilund
db455305a2 system_keyspace: Make sure "system" is written to keyspaces (visible)
Fixes #2514

Bug in schema version 3 update: We failed to write "system" to the
schema tables. Only visible on an empty instance of course.

Message-Id: <1500469809-23546-2-git-send-email-calle@scylladb.com>
(cherry picked from commit 7a583585a2)
2017-07-24 11:33:25 +02:00
Avi Kivity
e1a3052e76 tests: fix sstable_datafile_test build with boost 1.55
Boost 1.55 accidentally removed support for "range for" on
recursive_directory_iterator (previous and later versions do
support it). Use old-style iteration instead.

Message-Id: <20170724080128.8824-1-avi@scylladb.com>
(cherry picked from commit c21bb5ae05)
2017-07-24 11:20:53 +03:00
Tomasz Grabiec
50fa3f3b89 schema_registry: Keep unused entries around for 1 second
This is in order to avoid frequent misses which have a relatively high
cost. A miss means we need to fetch schema definition from another
node and in case of writes do a schema merge.

If the schema is kept alive only by the incoming request, then it
will be forgotten immediately when the request is done, and the next
request using the same schema version will miss again.
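The idea can be sketched as a cache whose entries survive for a grace period after their last user goes away (all names here are hypothetical, not the schema_registry API):

```cpp
#include <cassert>
#include <chrono>
#include <map>
#include <string>

using steady = std::chrono::steady_clock;

struct entry {
    std::string schema;
    steady::time_point expires_at = steady::time_point::max();  // pinned while in use
};

struct registry {
    std::chrono::seconds grace{1};
    std::map<std::string, entry> entries;

    // Last reference dropped: keep the entry alive for `grace` more
    // instead of evicting it immediately.
    void release(const std::string& version, steady::time_point now) {
        auto it = entries.find(version);
        if (it != entries.end()) {
            it->second.expires_at = now + grace;
        }
    }

    // A miss here is what forces fetching the schema definition from
    // another node (and, for writes, a schema merge).
    bool lookup(const std::string& version, steady::time_point now) const {
        auto it = entries.find(version);
        return it != entries.end() && now < it->second.expires_at;
    }
};
```

A request arriving shortly after the previous one for the same schema version now hits the still-alive entry instead of paying the remote fetch again.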

Refs #2608.
Message-Id: <1500632447-10104-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 29a82f5554)
2017-07-24 10:12:09 +02:00
Tomasz Grabiec
8474b7a725 legacy_schema_migrator: Don't snapshot empty legacy tables
Otherwise we will create a new (empty) snapshot each time we boot.
Message-Id: <1500573920-31478-2-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit ecc85988dd)
2017-07-24 09:56:22 +02:00
Tomasz Grabiec
0fc874e129 database: Allow disabling auto snapshots during drop/truncate
Message-Id: <1500573920-31478-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 408cea66cd)
2017-07-24 09:56:19 +02:00
Duarte Nunes
5cf1a19f3f Merge 'Fix possible inconsistency of table schema version' from Tomasz
"Fixes issues uncovered in longevity test (#2608).

The main problem is that, due to time drift, the scylla_tables.version column
may not get deleted on all nodes doing the schema merge, which will
make some nodes come up with a different table schema version than others.

The inconsistency will not heal because scylla_tables doesn't
take part in the schema sync. This is fixed by the last patch.

This will cause nodes to constantly try to sync the schema, which under
some conditions triggers #2617."

* tag 'tgrabiec/fix-table-schema-version-inconsistency-v1' of github.com:scylladb/seastar-dev:
  schema_tables: Add scylla_tables to ALL
  schema: Make schema_mutations equality consistent with digest
  schema_tables: Extract compact_for_schema_digest()
  schema_tables: Always drop scylla_tables::version

(cherry picked from commit 937fe80a1a)
2017-07-24 09:54:45 +02:00
Tomasz Grabiec
f48466824f schema_registry: Ensure schema_ptr is always synced on the other core
global_schema_ptr ensures that the schema object is replicated to other
cores on access. It was replicating the "synced" state as well, but
only when the shard didn't know about the schema. It could happen that
the other shard has the entry, but it's not yet synced, in which case
we would fail to replicate the "synced" state. This will result in an
exception from mutate(), which rejects attempts to mutate using an
unsynced schema.

The fix is to always replicate the "synced" state. If the entry is
syncing, we will preemptively mark it as synced earlier. The syncing
code is already prepared for this.

Refs #2617.
Message-Id: <1500555224-15825-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 65c64614aa)
2017-07-24 09:52:31 +02:00
Avi Kivity
914f6f019f Update ami submodule
* dist/ami/files/scylla-ami 5dfe42f...2bd1481 (1):
  > Enable support for experimental CPU controller in i3 instances
2017-07-24 10:27:35 +03:00
Shlomi Livne
f5bb363f96 release: prepare for 2.0.rc1
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-07-23 09:47:11 +03:00
Duarte Nunes
61ba56f628 schema: Support compaction enabled attribute
Fixes #2547

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170721132206.3037-1-duarte@scylladb.com>
(cherry picked from commit 7eecda3a61)
2017-07-21 15:39:48 +02:00
Tomasz Grabiec
f4d3e5cdcf Merge "Drop mutations that raced with truncate" from Duarte
Instead of retrying, just drop mutations that raced with a truncate.

* git@github.com:duarten/scylla.git truncate-reorder/v1:
  database: Rename replay_position_reordered_exception
  database: Drop mutations that raced with truncate

(cherry picked from commit 63caa58b70)
2017-07-21 15:39:20 +02:00
Avi Kivity
0291a4491e Merge "restrict background writers with scheduling groups" from Glauber
"This patchset restricts background writers - such as compactions,
streaming flushes and memtable flushes to a maximum amount of CPU usage
through a seastar::thread_scheduling_group.

The said maximum is recommended to be set to 50%. It is disabled by
default, but can be adjusted through a configuration option until we
are able to auto-tune this.

The second patch in this series provides a preview on how such auto-tune
would look like. By implementing a simple controller we automatically
adjust the quota for the memtable writer processes, so that the rate at
which bytes come in is equal to the rates at which bytes are flushed.

Tail latencies are greatly reduced by this series, and heavy spikes that
previously appeared on CPU-bound workloads are no more."

* 'memtable-controller-v5' of https://github.com/glommer/scylla:
  simple controller for memtable/streaming writer shares.
  restrict background writers to 50 % of CPU.

(cherry picked from commit c5ee62a6a4)
2017-07-20 15:13:39 +03:00
Duarte Nunes
83cc640c6a Merge 'Revert back to 1.7 schema layout in memory' from Tomasz
"Fixes schema layout incompatibility in a mixed 1.7 and 2.0 cluster (#2555)
by reverting back to using the old layout in memory and thus also
in across-node requests. We still use the new v3 layout in schema
tables (needed by drivers and external tools). Translations happen
when converting to/from schema mutations."

* tag 'tgrabiec/use-v2-schema-layout-in-memory-v2' of github.com:scylladb/seastar-dev:
  schema: Revert back to the 1.7 layout of static compact tables in memory
  schema: Use v3 column layout when converting to/from schema mutations
  schema: Encapsulate column layout translations in the v3_columns class

(cherry picked from commit 1daf1bc4bb)
2017-07-19 19:49:43 +03:00
Duarte Nunes
2f06c54033 thrift/handler: Remove leftover debug artifacts
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170705161156.2307-1-duarte@scylladb.com>
(cherry picked from commit d583ef6860)
2017-07-19 19:49:35 +03:00
Calle Wilund
9abe7651f7 system_schema: Fix remaining places not handling two system keyspaces
Some places remained where the code looked directly at
system_keyspace::NAME to determine whether a keyspace is
considered special/system/protected, including the
schema digest calculation.

Export "is_system_keyspace" and use accordingly.

Message-Id: <1500469809-23546-1-git-send-email-calle@scylladb.com>
(cherry picked from commit 247c36e048)
2017-07-19 19:48:30 +03:00
Amos Kong
784aea12e7 scylla_raid_setup: fix syntax error
/usr/lib/scylla/scylla_raid_setup: line 132: syntax error
near unexpected token `fi'

Fixes #2610

Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <af3a5bc77c5ba2b49a8f48a5aaa19afffb787886.1500430021.git.amos@scylladb.com>
(cherry picked from commit 2bdcad5bc3)
2017-07-19 11:10:43 +03:00
Avi Kivity
3a98959eba dist: tolerate sysctl failures
sysctl may fail in a container environment if /proc is not virtualized
properly.

Fixes #1990
Message-Id: <20170625145930.31619-1-avi@scylladb.com>

(cherry picked from commit 08488a75e0)
2017-07-18 15:45:41 +03:00
Duarte Nunes
2c7d597307 wrapping_range: Fix lvalue transform()
Instead of copying and moving the bound, pass it by reference so the
transformer can decide whether it wants to copy or not. The only
caller so far doesn't want a copy and takes the value by reference,
which would be capturing a temporary value. Caught by the
view_schema_test with gcc7.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170705210255.29669-1-duarte@scylladb.com>
(cherry picked from commit 3dd0397700)
2017-07-18 14:35:58 +03:00
Duarte Nunes
8d46c4e049 thrift: Fail when mixed CFs are detected
Fixes #2588

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170717222612.7429-1-duarte@scylladb.com>
(cherry picked from commit d9fa3bf322)
2017-07-18 10:21:45 +03:00
Asias He
b1c080984f gossip: Implement the missing fd_max_interval_ms and fd_initial_value_ms option
This is useful for larger clusters with higher gossip message latency. By
default fd_max_interval_ms is 2 seconds, which means the
failure_detector will ignore any gossip message update interval larger
than 2 seconds. However, in a larger cluster, the gossip message update
interval can be larger than 2 seconds.

Fixes #2603.

Message-Id: <49b387955fbf439e49f22e109723d3a19d11a1b9.1500278434.git.asias@scylladb.com>
(cherry picked from commit adc5f0bd21)
2017-07-17 13:29:30 +03:00
Duarte Nunes
e1706c36b7 Merge 'Fixes around migration to v3 schema tables' from Tomasz
branch 'tgrabiec/schema-migration-fixes' of github.com:scylladb/seastar-dev:
  schema: Use proper name comparator
  legacy_schema_migrator: Properly migrate non-UTF8 named columns
  schema_tables: Store column_name in text form
  legacy_schema_migrator: Migrate columns like Cassandra
  schema_builder: Add factory method for default_names
  legacy_schema_migrator: Simplify logic
  thrift: Don't set regular_column_name_type
  schema: Use proper column name type for static columns
  schema: Fix column_name_type() for static compact tables
  schema: Introduce clustering_column_at()
  thrift: Reuse cell_comparator::to_sstring() for obtaining comparator type
  partition_slice_builder: Use proper column's type instead of regular_column_name_type()

(cherry picked from commit 13caccf1cf)
2017-07-17 12:42:19 +03:00
Avi Kivity
63c8306733 Update seastar submodule
* seastar b812cee...867b7c7 (1):
  > rpc: start server's send loop only after protocol negotiation

Fixes #2600.

Still tracking upstream.
2017-07-17 10:41:59 +03:00
Avi Kivity
a7dfdc0155 tests: move tmpdir to /tmp
Reduces view_schema_test runtime to 5 seconds, from 53 seconds on an NVMe disk
with write-back cache, and forever on a spinning disk.
Message-Id: <20170716081653.10018-1-avi@scylladb.com>

(cherry picked from commit d9c64ef737)
2017-07-17 08:47:17 +03:00
Avi Kivity
70be29173a tests: copy the sstable with an unknown component to the data directory
We will be creating links to the sstable's files, and links don't work
if the data directory and the test sstable are on different devices.

Copying the files to the same directory fixes the problem.
Message-Id: <20170716090405.14307-1-avi@scylladb.com>

(cherry picked from commit 9116dd91cb)
2017-07-17 08:47:08 +03:00
Avi Kivity
e09d4a9b75 Update seastar submodule
* seastar 844bcfb...b812cee (1):
  > Update dpdk submodule

Fixes #2595 (again).

Still tracking master.
2017-07-16 17:01:48 +03:00
Avi Kivity
67f25e56a6 Update seastar submodule
* seastar ff34c42...844bcfb (1):
  > Update dpdk submodule

Still tracking master.

Fixes #2595.
2017-07-15 19:18:10 +03:00
Tomasz Grabiec
74c4651b95 Merge "Fixes for memtable flushing and replay positions" from Duarte
We don't ensure mutations are applied in memory following the order of their
replay positions. A memtable can thus be flushed with replay position rp,
with the new one being at replay position rp', where rp' < rp. This breaks
an intrinsic assumption in the code, which this series addresses.

Fixes #2074

branch memtable-flush/v3 of git@github.com:duarten/scylla.git:
  commitlog: Always flush latest memtable
  column_family: More precise count of switched memtables
  column_family: Fix typo in pending_tasks metric name
  column_family: More precise count of pending flushes
  dirty_memory_manager: Remove unnecessary check from flush_one()
  column_family: Don't rely on flush_queue to guarantee flushes finished
  column_family: Don't bother closing the flush_queue on stop()
  column_family: Stop using flush_queue
  column_family: Remove outdated comment about the flush_queue
  memtable: Stop tracking the highest flushed rp

(cherry picked from commit caa62f7f05)
2017-07-14 19:07:33 +02:00
Duarte Nunes
58bfb86d73 storage_proxy: Preserve replica order across mutations
In storage_proxy we arrange the mutations sent by the replicas in a
vector of vectors, such that each row corresponds to a partition key
and each column contains the mutation, possibly empty, as sent by a
particular replica.

There is reconciliation-related code that assumes that all the
mutations sent by a particular replica can be found in a single
column, but that isn't guaranteed by the way we initially arrange the
mutations.

This patch fixes this and enforces the expected order.
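The invariant being enforced can be sketched like this (types are illustrative; the real code deals in mutations and replica endpoints):

```cpp
#include <cassert>
#include <map>
#include <optional>
#include <string>
#include <vector>

using mutation_opt = std::optional<std::string>;

// Row i corresponds to partition key i; column j must always hold the
// mutation from replica j (empty if that replica sent none), so that
// all of one replica's answers line up in a single column, as the
// reconciliation code assumes.
std::vector<std::vector<mutation_opt>> arrange(
        const std::vector<std::string>& replicas,
        const std::vector<std::map<std::string, std::string>>& per_key_responses) {
    std::vector<std::vector<mutation_opt>> out;
    for (const auto& responses : per_key_responses) {  // one row per partition key
        std::vector<mutation_opt> row;
        for (const auto& replica : replicas) {         // fixed column order per replica
            auto it = responses.find(replica);
            row.push_back(it == responses.end() ? mutation_opt{}
                                                : mutation_opt{it->second});
        }
        out.push_back(std::move(row));
    }
    return out;
}
```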

Fixes #2531
Fixes #2593

Signed-off-by: Gleb Natapov <gleb@scylladb.com>
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170713162014.15343-1-duarte@scylladb.com>
(cherry picked from commit b8235f2e88)
2017-07-14 12:11:50 +03:00
Tomasz Grabiec
cb94c66823 legacy_schema_migrator: Fix calculation of is_dense
The current algorithm marked tables with regular columns not named
"value" as not dense, which doesn't have to be the case. It can be
either way.

It should be enough to look at clustering components. If there is a
clustering key, then table is dense if and only if all comparator
components belong to the clustering key.

If there is no clustering key and there are any regular columns,
we're sure it's not dense.

Fixes #2587.

Message-Id: <1499877777-7083-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 30ec4af949)
2017-07-13 17:28:25 +03:00
Tomasz Grabiec
5aa3e23fcd gdb: Fix "scylla columnfamilies" command
Broken in 0e4d5bc2f3.

Message-Id: <1499951956-26206-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 54953c8d27)
2017-07-13 16:33:50 +03:00
Takuya ASADA
aac1d5d54d dist/common/systemd: move scylla-server.service to be after network-online.target instead of network.target
To make sure Scylla starts after the network is up, we need to move from
network.target to network-online.target.

Fixes #2337

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1493661832-9545-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 0c81974bc4)
2017-07-12 13:36:52 +03:00
Glauber Costa
a371b8a5bf change task quota's default
The default of 2ms is somewhat arbitrary. Now that we have a lot more
mileage deploying Scylla applications in production, it sounds not
only arbitrary but also high.

In particular, it is really hard to achieve 1ms latencies in the face of
CPU-heavy workloads with it.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <1499354495-27173-1-git-send-email-glauber@scylladb.com>
(cherry picked from commit 780a6e4d2e)
2017-07-12 10:21:35 +03:00
Avi Kivity
a69fb8a8ed Update seastar submodule
* seastar 89cc97c...ff34c42 (6):
  > tls: Wrap all IO in semaphore (Fixes #2575)
  > tests/lowres_clock_test.cc: Declare helper static
  > tests/lowres_clock_test.cc: fix compilation error for older GCC
  > configure.py: verifies boost version
  > pkg-config: Eliminate spaces in include path arguments
  > allow applications to override task-quota-ms

Still tracking seastar master.
2017-07-12 10:20:49 +03:00
Avi Kivity
00b9640b2c Merge "Preserve table schema digest on schema tables migration" from Tomasz
"Currently new nodes calculate digests based on v3 schema mutations,
which are very different from v2 mutations. As a result they will
use schemas with a different table_schema_version than the old nodes.
The old nodes will not recognize the version and will try to request
its definition. That will fail, because old nodes don't understand
v3 schema mutations.

To fix this problem, let's preserve the digests during migration,
so that they're the same on new and old nodes. This will allow
requests to proceed as usual.

This does not solve the problem of schema being changed during
the rolling upgrade. This is not allowed, as it would bring the
same problem back.

Fixes #2549."

* tag 'tgrabiec/use-consistent-schema-table-digests-v2' of github.com:cloudius-systems/seastar-dev:
  tests: Add test for concurrent column addition
  legacy_schema_migrator: Set digest to one compatible with the old nodes
  schema_tables: Persist table_schema_version
  schema_tables: Introduce system_schema.scylla_tables
  schema_tables: Simplify read_table_mutations()
  schema_tables: Resurrect v2 read_table_mutations()
  system_keyspace: Forward-declare legacy schemas
  legacy_schema_migrator: Take storage_proxy as dependency

(cherry picked from commit a397889c81)
2017-07-11 17:23:21 +03:00
Gleb Natapov
59d608f77f consistency_level: report less live endpoints in Unavailable exception if there are pending nodes
DowngradingConsistencyRetryPolicy uses live replicas count from
Unavailable exception to adjust CL for retry, but when there are pending
nodes CL is increased internally by a coordinator and that may prevent
retried query from succeeding. Adjust live replica count in case of
pending node presence so that retried query will be able to proceed.

Fixes #2535

Message-Id: <20170710085238.GY2324@scylladb.com>
(cherry picked from commit 739dd878e3)
2017-07-11 17:16:46 +03:00
Botond Dénes
1717922219 Fix crash in the out-of order restrictions error msg composition
Use the name of the existing preceding column with a restriction
(last_column) instead of assuming that the column right after the
current column already has restrictions.
This will yield an error message that is different from that of
Cassandra, albeit still a correct one.

Fixes #2421

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <40335768a2c8bd6c911b881c27e9ea55745c442e.1499781685.git.bdenes@scylladb.com>
(cherry picked from commit 33bc62a9cf)
2017-07-11 17:15:45 +03:00
Paweł Dziepak
7cd4bb0c4a transport: send correct type id for counter columns
CQL reply may contain metadata that describes columns present in the
response including the information about their type.

However, Scylla incorrectly reports counter types as bigint. The
serialised format of counters and bigint is exactly the same, which
could explain why the problem hasn't been noticed earlier but it is a
bug nevertheless.

Fixes #2569.
Message-Id: <20170711130520.27603-1-pdziepak@scylladb.com>

(cherry picked from commit 5aa523aaf9)
2017-07-11 16:37:24 +03:00
Tomasz Grabiec
588ae935e7 legacy_schema_migrator: Use separate joinpoint instance for each table
Otherwise we may deadlock, as explained in commit 5e8f0efc8:

Table drop starts with creating a snapshot on all shards. All shards
must use the same snapshot timestamp which, among other things, is
part of the snapshot name. The timestamp is generated using supplied
timestamp generating function (joinpoint object). The joinpoint object
will wait for all shards to arrive and then generate and return the
timestamp.

However, we drop tables in parallel, using the same joinpoint
instance. So joinpoint may be contacted by snapshotting shards of
tables A and B concurrently, generating timestamp t1 for some shards
of table A and some shards of table B. Later the remaining shards of
table A will get a different timestamp. As a result, different shards
may use different snapshot names for the same table. The snapshot
creation will never complete because the sealing fiber waits for all
shards to signal it, on the same name.
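The failure mode can be sketched with a toy joinpoint that hands one generated timestamp to each batch of arrivals (names and types are hypothetical, not the actual joinpoint API):

```cpp
#include <cassert>

// A joinpoint waits for `shards` arrivals, then hands all of them one
// freshly generated timestamp. If two tables share a joinpoint, a batch
// can mix shards of both tables, so one table's shards end up with
// different timestamps, and thus different snapshot names.
struct joinpoint {
    int shards;
    long next_ts;
    int arrived = 0;
    long current_ts = 0;
    joinpoint(int s, long first_ts) : shards(s), next_ts(first_ts) {}
    long arrive() {
        if (arrived % shards == 0) {
            current_ts = next_ts++;  // new batch: generate a fresh timestamp
        }
        ++arrived;
        return current_ts;
    }
};
```

With one joinpoint per table, a batch can only ever contain that table's shards, so all of them see the same timestamp and the sealing fiber can complete.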
Message-Id: <1499762663-21967-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 310d2a54d2)
2017-07-11 12:31:21 +03:00
Avi Kivity
c292e86b3c sstables: fix use-after-free in read_simple()
`r` is moved-from, and later captured in a different lambda. The compiler may
choose to move and perform the other capture later, resulting in a use-after-free.

Fix by copying `r` instead of moving it.
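The hazard class can be illustrated in isolation (a sketch, not the read_simple() code): when one expression both moves from `r` and captures `r` in another lambda, the two operations have no guaranteed order, so the capture may observe a moved-from object. Giving the earlier capture a copy sidesteps the question:

```cpp
#include <cassert>
#include <functional>
#include <string>
#include <utility>

// Build two callbacks that both need `r`. The first one takes a copy,
// and only the last one moves, so `r` is intact at every capture
// regardless of how the compiler orders things.
std::pair<std::function<std::string()>, std::function<std::string()>>
make_callbacks(std::string r) {
    auto first = [r] { return r; };                  // copy: `r` still valid below
    auto second = [r = std::move(r)] { return r; };  // move happens last
    return {std::move(first), std::move(second)};
}
```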

Discovered by sstable_test in debug mode.
Message-Id: <20170702082546.20570-1-avi@scylladb.com>

(cherry picked from commit 07b8adce0e)
2017-07-10 15:32:57 +03:00
Asias He
3dc0d734b0 repair: Do not store the failed ranges
The number of failed ranges can be large, so storing them can consume a lot
of memory. We already log the failed ranges, so there is no need to keep
them in memory.

Message-Id: <7a70c4732667c5c3a69211785e8efff0c222fc28.1498809367.git.asias@scylladb.com>
(cherry picked from commit b2a2fbcf73)
2017-07-10 14:37:47 +03:00
Takuya ASADA
2d612022ba dist/common/scripts/scylla_cpuscaling_setup: skip configuration when the cpufreq driver isn't loaded
Configuring the cpufreq service on VMs/IaaS causes an error because they
don't support cpufreq. To prevent the error, skip the whole configuration
when the driver is not loaded.

Fixes #2051

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1498809504-27029-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 1c35549932)
2017-07-10 14:08:54 +03:00
Nadav Har'El
5f6100c0aa repair: further limit parallelism of checksum calculation
Repair today has a semaphore limiting the number of ongoing checksum
comparisons running in parallel (on one shard) to 100. We needed this
number to be fairly high, because a "checksum comparison" can involve
high latency operations - namely, sending an RPC request to another node
in a remote DC and waiting for it to calculate a checksum there, and while
waiting for a response we need to proceed calculating checksums in parallel.

But as a consequence, in the current code, we can end up with as many as
100 fibers all at the same stage of reading partitions to checksum from
sstables. This requires tons of memory, to hold at least 128K of buffer
(even more with read-ahead) for each of these fibers, plus partition data
for each. But doing 100 reads in parallel is pointless - one (or very few)
should be enough.

So this patch adds another semaphore to limit the number of checksum
*calculations* (including the read and checksum calculation) on each shard
to just 2. There may still be 100 ongoing checksum *comparisons*, in
other stages of the comparisons (sending the checksum requests to other
and waiting for them to return), but only 2 will ever be in the stage of
reading from disk and checksumming them.

The limit of 2 checksum calculations (per shard) applies on the repair
slave, not just to the master: The slave may receive many checksum
requests in parallel, but will only actually work on 2 at a time.

Because the parallelism=100 now rate-limits operations which use very little
memory, in the future we can safely increase it even more, to support
situations where the disk is very fast but the link between nodes has
very high latency.
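The relation between the two limits can be written down directly (a sketch; the constants come from the message above, the names are illustrative):

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>

// How many repair fibers can be in each stage at once under the two limits.
struct repair_limits {
    std::size_t comparisons = 100;  // ongoing checksum comparisons per shard
    std::size_t calculations = 2;   // of those, reading sstables and checksumming
};

// A fiber must hold an outer (comparison) slot before competing for an
// inner (calculation) slot, so the number of disk-bound fibers is capped
// by both limits as well as by the work actually outstanding.
std::size_t max_disk_bound(const repair_limits& l, std::size_t outstanding) {
    return std::min({outstanding, l.comparisons, l.calculations});
}
```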

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20170703151329.25716-1-nyh@scylladb.com>
(cherry picked from commit d177ec05cb)
2017-07-10 14:08:28 +03:00
Avi Kivity
d475a44b01 Merge "Silence schema pull errors during upgrade from 1.7 to 2.0" from Tomasz
"Old and new nodes will advertise different schema version because
of different format of schema tables. This will result in attempts
to sync the schema by each of the node.

Currently this will result in scary error messages in logs about
sync failing due to not being able to find schema of given version.
It's benign, but may scare users. In the future, incompatibilities
could result in more subtle errors. Better to inhibit it completely."

* 'tgrabiec/fix-schema-pull-errors-during-upgrade' of github.com:cloudius-systems/seastar-dev:
  migration_manager: Give empty response to schema pulls from incompatible nodes
  migration_manager: Don't pull schema from incompatible nodes
  service: Advertise schema tables format version through gossip

(cherry picked from commit 91221e020b)
2017-07-10 14:04:41 +03:00
Pekka Enberg
e02d4935ee idl: Fix frozen_schema version numbers
The IDL changes will appear in 2.0 so fix up the version numbers.

Message-Id: <1499680669-6757-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit 8112d7c5c0)
2017-07-10 14:02:37 +03:00
Botond Dénes
25f8d365b5 Add text(sstring) version of count, max and min functions
Fixes #2459

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <b6abb97f21c0caea8e36c7590b92a12d148195db.1499666251.git.bdenes@scylladb.com>
(cherry picked from commit 66cbc45321)
2017-07-10 12:48:29 +03:00
Tomasz Grabiec
de7cb7bfa4 tests: commitlog: Check there are no segments left on disk after clean shutdown
Reproduces #2550.

Message-Id: <1499358825-17855-2-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 72e01b7fe8)
2017-07-09 19:25:44 +03:00
Tomasz Grabiec
b8eb4ed9cd commitlog: Discard active but unused segments on shutdown
So that they are not left on disk even though we did a clean shutdown.

The first part of the fix is to ensure that closed segments are recognized
as not allocating (the _closed flag); otherwise they are never
collected by discard_unused_segments(). The second part is to
actually call discard_unused_segments() on shutdown, after all segments
were shut down, so that those whose positions are cleared can be
removed.
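The two-part fix can be sketched with toy types (hypothetical names, not the commitlog's actual classes):

```cpp
#include <cassert>
#include <memory>
#include <vector>

// Part 1: a closed segment must report itself as non-allocating so the
// discard pass will consider it.
struct segment {
    bool closed = false;
    bool on_disk = true;
    bool is_still_allocating() const { return !closed; }
};

struct commitlog {
    std::vector<std::shared_ptr<segment>> segments;

    void discard_unused_segments() {
        for (auto& s : segments) {
            if (!s->is_still_allocating()) {
                s->on_disk = false;  // the real code deletes the segment file
            }
        }
    }

    // Part 2: actually run the discard pass on clean shutdown, after all
    // segments have been shut down.
    void shutdown() {
        for (auto& s : segments) {
            s->closed = true;
        }
        discard_unused_segments();
    }
};
```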

Fixes #2550.

Message-Id: <1499358825-17855-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 6555a2f50b)
2017-07-09 19:25:42 +03:00
Tomasz Grabiec
fcc05e8ae9 legacy_schema_migrator: Drop tables instead of truncate()+remove()
It achieves a similar effect, but is safer than the non-standard remove()
path. The latter was missing unregistration from the compaction manager.

Fixes #2554.

Message-Id: <1499447165-30253-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit d33d29ad95)
2017-07-09 18:36:56 +03:00
Botond Dénes
05e2ac80af cql3: Add K_FROZEN and K_TUPLE to basic_unreserved_keyword
To allow the non-reserved keywords "frozen" and "tuple" to be used as
column names without double-quotes.

Fixes #2507

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <9ae17390662aca90c14ae695c9b4a39531c6cde6.1499329781.git.bdenes@scylladb.com>
(cherry picked from commit c4277d6774)
2017-07-06 18:20:22 +03:00
Avi Kivity
8fa1add26d Update seastar submodule
* seastar 0ab7ae5...89cc97c (4):
  > future-utils: fix do_for_each exception reporting
  > core/thread: Fix unwind information for seastar threads
  > build: export full cflags in pkgconfig file
  > configure: Avoid putting tmp file on /tmp

Still tracking seastar master.
2017-07-06 17:31:06 +03:00
Takuya ASADA
c0a2ca96dd dist/common/scripts/scylla_raid_setup: prevent renaming MDRAID device after reboot
On Debian variants, mdadm.conf should be placed at /etc/mdadm instead of /etc.
It also seems we need to run update-initramfs to fix the renaming issue.

Fixes #2502

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1499179912-14125-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 71624d7919)
2017-07-04 18:07:33 +03:00
Avi Kivity
c9ed522fa8 Merge "Adjust row cache metrics for row granularity" from Tomasz
* tag 'tgrabiec/row-cache-metrics-v2' of github.com:cloudius-systems/seastar-dev:
  row_cache: Switch _stats.hits/misses to row granularity
  row_cache: Rename num_entries() to partitions() for clarity
  row_cache: Track mispopulations also at row level
  row_cache: Track row insertions
  row_cache: Track row hits and misses
  row_cache: Make mispopulation counter also apply for continuity information
  row_cache: Add partition_ prefix to current counters
  misc_services: Switch to using reads_with[_no]_misses counters
  row_cache: Add metrics for operations on underlying reader
  row_cache: Add reader-related metrics
  row_cache: Remove dead code

(cherry picked from commit b1a0e37fcb)
2017-07-04 15:21:00 +03:00
Tomasz Grabiec
9078433a7f row_cache: Restore update of concurrent_misses_same_key
It was lost in action in 6f6575f456.

Message-Id: <1499168837-5072-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit e720b317c9)
2017-07-04 14:51:19 +03:00
Avi Kivity
7893a3aad2 Merge "Use selective_token_range_sharder in repair" from Asias
"This series introduces selective_token_range_sharder and uses it in repair to
generate dht::token_range objects that belong to a specific shard."

* tag 'asias/repair-selective_token_range_sharder-v3' of github.com:cloudius-systems/seastar-dev:
  repair: Use selective_token_range_sharder
  tests: Add test_selective_token_range_sharder
  dht: Add selective_token_range_sharder

(cherry picked from commit 66e56511d6)
2017-07-04 14:18:08 +03:00
Nadav Har'El
e467eef58d Fix test to use non-wrapping range
The test put a wrapping range into a non-wrapping range variable.
This was harmless at the time the test was written, but newer code
may not be as forgiving, so it is better to use a non-wrapping range as intended.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20170704103128.29689-1-nyh@scylladb.com>
(cherry picked from commit d95f908586)
2017-07-04 14:18:01 +03:00
Tomasz Grabiec
19a07143eb row_cache: Drop not very useful prefixes from metric names
This drops the "total_operations_" and "objects_" prefixes. There is no
convention of adding them in other parts of the system, and they don't
add much value.

Fixes scylladb/scylla-grafana-monitoring#169.

Message-Id: <1499160342-25865-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 1d6fec0755)
2017-07-04 13:37:24 +03:00
Raphael S. Carvalho
a619b978c4 database: fix potential use-after-free in sstable cleanup
When do_for_each is in its last iteration and with_semaphore defers
because there's an ongoing cleanup, the sstable object will be used after
being freed: it was captured by reference while the container it lives
in was destroyed prematurely.

Let's fix it with do_with, which also makes the code nicer.

Fixes #2537.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170630035324.19881-1-raphaelsc@scylladb.com>
(cherry picked from commit b9d0645199)
2017-07-03 12:49:13 +03:00
Gleb Natapov
2c66b40a69 main: wait for wait_for_gossip_to_settle() to complete during boot
Boot should not continue until the future returned by
wait_for_gossip_to_settle() is resolved. Commit 991ec4a16 mistakenly
broke that, so restore it. Also move the supervisor::notify() calls
to the right places.

Message-Id: <20170702082355.GQ14563@scylladb.com>
(cherry picked from commit d23111312f)
2017-07-02 11:33:04 +03:00
Tomasz Grabiec
079844a51d row_cache: Fix compilation errors with gcc 5
Message-Id: <1498741526-27055-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 97005825bf)
2017-06-29 16:35:02 +03:00
Avi Kivity
ea59e1fbd6 Update ami submodule
* dist/ami/files/scylla-ami f10db69...5dfe42f (1):
  > don't fetch perf from amazon repo

(cherry picked from commit 1317c4a03e)
2017-06-29 09:39:29 +03:00
Tomasz Grabiec
089b58ddfe row_cache: Use continuity information to decide whether to populate
If the cache is missing a given key, but the range is marked as continuous,
it means the sstables don't have that entry and we can insert it without
asking the presence checker (bloom-filter based). The latter is more
expensive and gives false positives, so this improves update
performance and hit ratio.

Another positive effect is that we don't have to clear continuity now.

Fixes #1999.

Message-Id: <1498643043-21117-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 786e75dbf7)
2017-06-28 13:33:34 +03:00
Tomasz Grabiec
d76e9e4026 lsa: Fix performance regression in eviction and compact_on_idle
The region comparator, used by both, calls region_impl::min_occupancy(),
which calls log_histogram::largest(). The latter is O(N) in the
number of segments and is supposed to be used only in tests.
We should call one_of_largest() instead, which is O(1).

This caused compact_on_idle() to take more CPU as the number of
segments grew (even when there was nothing to compact). Eviction
would see the same kind of slow down as well.

Introduced in 11b5076b3c.

Message-Id: <1498641973-20054-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 3489c68a68)
2017-06-28 12:33:11 +03:00
Glauber Costa
7709b885c4 disable defragment-memory-on-idle-by-default
It has been linked with various performance issues, either by causing
them or by making them worse. One example is #1634; I have also recently
investigated continuous performance degradation that was likewise
linked to defrag-on-idle activity.

Until we can figure out how to reduce its impact, we should disable it.

Signed-off-by: Glauber Costa <glauber@glauber.scylladb>
Message-Id: <20170627201109.10775-1-glauber@scylladb.com>
(cherry picked from commit f3742d1e38)
2017-06-28 00:21:35 +03:00
Avi Kivity
3de701dbe1 Merge "Fix compilation issues in older environments" from Tomasz
* 'tgrabiec/fix-compilation-issues' of github.com:cloudius-systems/seastar-dev:
  tests: streamed_mutation_test: Avoid using boost::size() on row ranges
  tests: row_cache: Remove unused method

(cherry picked from commit ff7be8241f)
2017-06-27 16:31:42 +03:00
Shlomi Livne
9912b7d1eb release: prepare for 2.0-rc0 2017-06-27 12:37:59 +03:00
140 changed files with 3496 additions and 1149 deletions

.gitmodules vendored

@@ -1,6 +1,6 @@
[submodule "seastar"]
path = seastar
url = ../seastar
url = ../scylla-seastar
ignore = dirty
[submodule "swagger-ui"]
path = swagger-ui


@@ -1,6 +1,6 @@
#!/bin/sh
VERSION=666.development
VERSION=2.0.0
if test -f version
then


@@ -252,13 +252,13 @@ void set_cache_service(http_context& ctx, routes& r) {
// In origin row size is the weighted size.
// We currently do not support weights, so we use num entries instead
return map_reduce_cf(ctx, 0, [](const column_family& cf) {
return cf.get_row_cache().num_entries();
return cf.get_row_cache().partitions();
}, std::plus<uint64_t>());
});
cs::get_row_entries.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, 0, [](const column_family& cf) {
return cf.get_row_cache().num_entries();
return cf.get_row_cache().partitions();
}, std::plus<uint64_t>());
});


@@ -130,7 +130,7 @@ public:
}) {}
future<> stop() {
return make_ready_future<>();
return _cache.stop();
}
future<permission_set> get(::shared_ptr<authenticated_user> user, data_resource resource) {


@@ -69,6 +69,29 @@ public:
};
class cache_streamed_mutation final : public streamed_mutation::impl {
enum class state {
before_static_row,
// Invariants:
// - position_range(_lower_bound, _upper_bound) covers all not yet emitted positions from current range
// - _next_row points to the nearest row in cache >= _lower_bound
// - _next_row_in_range = _next.position() < _upper_bound
reading_from_cache,
// Starts reading from underlying reader.
// The range to read is position_range(_lower_bound, min(_next_row.position(), _upper_bound)).
// Invariants:
// - _next_row_in_range = _next.position() < _upper_bound
move_to_underlying,
// Invariants:
// - Upper bound of the read is min(_next_row.position(), _upper_bound)
// - _next_row_in_range = _next.position() < _upper_bound
// - _last_row_key contains the key of last emitted clustering_row
reading_from_underlying,
end_of_stream
};
lw_shared_ptr<partition_snapshot> _snp;
position_in_partition::tri_compare _position_cmp;
@@ -92,25 +115,24 @@ class cache_streamed_mutation final : public streamed_mutation::impl {
position_in_partition _lower_bound;
position_in_partition_view _upper_bound;
bool _static_row_done = false;
bool _reading_underlying = false;
state _state = state::before_static_row;
lw_shared_ptr<read_context> _read_context;
partition_snapshot_row_cursor _next_row;
bool _next_row_in_range = false;
future<> do_fill_buffer();
future<> copy_from_cache_to_buffer();
void copy_from_cache_to_buffer();
future<> process_static_row();
void move_to_end();
future<> move_to_next_range();
future<> move_to_current_range();
future<> move_to_next_entry();
void move_to_next_range();
void move_to_current_range();
void move_to_next_entry();
// Emits all delayed range tombstones with positions smaller than upper_bound.
void drain_tombstones(position_in_partition_view upper_bound);
// Emits all delayed range tombstones.
void drain_tombstones();
void add_to_buffer(const partition_snapshot_row_cursor&);
void add_to_buffer(clustering_row&&);
void add_clustering_row_to_buffer(mutation_fragment&&);
void add_to_buffer(range_tombstone&&);
void add_to_buffer(mutation_fragment&&);
future<> read_from_underlying();
@@ -154,12 +176,14 @@ public:
inline
future<> cache_streamed_mutation::process_static_row() {
if (_snp->version()->partition().static_row_continuous()) {
_read_context->cache().on_row_hit();
row sr = _snp->static_row();
if (!sr.empty()) {
push_mutation_fragment(mutation_fragment(static_row(std::move(sr))));
}
return make_ready_future<>();
} else {
_read_context->cache().on_row_miss();
return _read_context->get_next_fragment().then([this] (mutation_fragment_opt&& sr) {
if (sr) {
assert(sr->is_static_row());
@@ -173,15 +197,24 @@ future<> cache_streamed_mutation::process_static_row() {
inline
future<> cache_streamed_mutation::fill_buffer() {
if (!_static_row_done) {
_static_row_done = true;
return process_static_row().then([this] {
return _lsa_manager.run_in_read_section([this] {
return move_to_current_range();
}).then([this] {
return fill_buffer();
if (_state == state::before_static_row) {
auto after_static_row = [this] {
if (_ck_ranges_curr == _ck_ranges_end) {
_end_of_stream = true;
_state = state::end_of_stream;
return make_ready_future<>();
}
_state = state::reading_from_cache;
_lsa_manager.run_in_read_section([this] {
move_to_current_range();
});
});
return fill_buffer();
};
if (_schema->has_static_columns()) {
return process_static_row().then(std::move(after_static_row));
} else {
return after_static_row();
}
}
return do_until([this] { return _end_of_stream || is_buffer_full(); }, [this] {
return do_fill_buffer();
@@ -190,18 +223,27 @@ future<> cache_streamed_mutation::fill_buffer() {
inline
future<> cache_streamed_mutation::do_fill_buffer() {
if (_reading_underlying) {
if (_state == state::move_to_underlying) {
_state = state::reading_from_underlying;
auto end = _next_row_in_range ? position_in_partition(_next_row.position())
: position_in_partition(_upper_bound);
return _read_context->fast_forward_to(position_range{_lower_bound, std::move(end)}).then([this] {
return read_from_underlying();
});
}
if (_state == state::reading_from_underlying) {
return read_from_underlying();
}
// assert(_state == state::reading_from_cache)
return _lsa_manager.run_in_read_section([this] {
auto same_pos = _next_row.maybe_refresh();
// FIXME: If continuity changed anywhere between _lower_bound and _next_row.position()
// we need to redo the lookup with _lower_bound. There is no eviction yet, so not yet a problem.
assert(same_pos);
while (!is_buffer_full() && !_end_of_stream && !_reading_underlying) {
future<> f = copy_from_cache_to_buffer();
if (!f.available() || need_preempt()) {
return f;
while (!is_buffer_full() && _state == state::reading_from_cache) {
copy_from_cache_to_buffer();
if (need_preempt()) {
break;
}
}
return make_ready_future<>();
@@ -210,33 +252,34 @@ future<> cache_streamed_mutation::do_fill_buffer() {
inline
future<> cache_streamed_mutation::read_from_underlying() {
return do_until([this] { return !_reading_underlying || is_buffer_full(); }, [this] {
return _read_context->get_next_fragment().then([this] (auto&& mfopt) {
if (!mfopt) {
_reading_underlying = false;
return _lsa_manager.run_in_update_section([this] {
auto same_pos = _next_row.maybe_refresh();
assert(same_pos); // FIXME: handle eviction
if (_next_row_in_range) {
return consume_mutation_fragments_until(_read_context->get_streamed_mutation(),
[this] { return _state != state::reading_from_underlying || is_buffer_full(); },
[this] (mutation_fragment mf) {
_read_context->cache().on_row_miss();
maybe_add_to_cache(mf);
add_to_buffer(std::move(mf));
},
[this] {
_state = state::reading_from_cache;
_lsa_manager.run_in_update_section([this] {
auto same_pos = _next_row.maybe_refresh();
assert(same_pos); // FIXME: handle eviction
if (_next_row_in_range) {
maybe_update_continuity();
add_to_buffer(_next_row);
move_to_next_entry();
} else {
if (no_clustering_row_between(*_schema, _upper_bound, _next_row.position())) {
this->maybe_update_continuity();
this->add_to_buffer(_next_row);
return this->move_to_next_entry();
} else {
if (no_clustering_row_between(*_schema, _upper_bound, _next_row.position())) {
this->maybe_update_continuity();
} else {
// FIXME: Insert dummy entry at _upper_bound.
}
return this->move_to_next_range();
// FIXME: Insert dummy entry at _upper_bound.
_read_context->cache().on_mispopulate();
}
});
} else {
this->maybe_add_to_cache(*mfopt);
this->add_to_buffer(std::move(*mfopt));
return make_ready_future<>();
}
move_to_next_range();
}
});
return make_ready_future<>();
});
});
}
inline
@@ -249,6 +292,8 @@ void cache_streamed_mutation::maybe_update_continuity() {
} else if (!_ck_ranges_curr->start()) {
_next_row.set_continuous(true);
}
} else {
_read_context->cache().on_mispopulate();
}
}
@@ -266,6 +311,7 @@ void cache_streamed_mutation::maybe_add_to_cache(const mutation_fragment& mf) {
inline
void cache_streamed_mutation::maybe_add_to_cache(const clustering_row& cr) {
if (!can_populate()) {
_read_context->cache().on_mispopulate();
return;
}
_lsa_manager.run_in_update_section_with_allocator([this, &cr] {
@@ -281,10 +327,11 @@ void cache_streamed_mutation::maybe_add_to_cache(const clustering_row& cr) {
auto new_entry = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(cr.key(), cr.tomb(), cr.marker(), cr.cells()));
new_entry->set_continuous(false);
auto it = _next_row.has_up_to_date_row_from_latest_version()
auto it = _next_row.has_valid_row_from_latest_version()
? _next_row.get_iterator_in_latest_version() : mp.clustered_rows().lower_bound(cr.key(), less);
auto insert_result = mp.clustered_rows().insert_check(it, *new_entry, less);
if (insert_result.second) {
_read_context->cache().on_row_insert();
new_entry.release();
}
it = insert_result.first;
@@ -294,11 +341,12 @@ void cache_streamed_mutation::maybe_add_to_cache(const clustering_row& cr) {
if (it == mp.clustered_rows().begin()) {
// FIXME: check whether entry for _last_row_key is in older versions and if so set
// continuity to true.
_read_context->cache().on_mispopulate();
} else {
auto prev_it = it;
--prev_it;
clustering_key_prefix::tri_compare tri_comp(*_schema);
if (tri_comp(*_last_row_key, prev_it->key()) == 0) {
clustering_key_prefix::equality eq(*_schema);
if (eq(*_last_row_key, prev_it->key())) {
e.set_continuous(true);
}
}
@@ -306,6 +354,7 @@ void cache_streamed_mutation::maybe_add_to_cache(const clustering_row& cr) {
e.set_continuous(true);
} else {
// FIXME: Insert dummy entry at _ck_ranges_curr->start()
_read_context->cache().on_mispopulate();
}
});
}
@@ -317,26 +366,24 @@ bool cache_streamed_mutation::after_current_range(position_in_partition_view p)
inline
future<> cache_streamed_mutation::start_reading_from_underlying() {
_reading_underlying = true;
auto end = _next_row_in_range ? position_in_partition(_next_row.position())
: position_in_partition(_upper_bound);
return _read_context->fast_forward_to(position_range{_lower_bound, std::move(end)});
_state = state::move_to_underlying;
return make_ready_future<>();
}
inline
future<> cache_streamed_mutation::copy_from_cache_to_buffer() {
void cache_streamed_mutation::copy_from_cache_to_buffer() {
position_in_partition_view next_lower_bound = _next_row.dummy() ? _next_row.position() : position_in_partition_view::after_key(_next_row.key());
for (auto&& rts : _snp->range_tombstones(*_schema, _lower_bound, _next_row_in_range ? next_lower_bound : _upper_bound)) {
add_to_buffer(std::move(rts));
if (is_buffer_full()) {
return make_ready_future<>();
return;
}
}
if (_next_row_in_range) {
add_to_buffer(_next_row);
return move_to_next_entry();
move_to_next_entry();
} else {
return move_to_next_range();
move_to_next_range();
}
}
@@ -344,47 +391,45 @@ inline
void cache_streamed_mutation::move_to_end() {
drain_tombstones();
_end_of_stream = true;
_state = state::end_of_stream;
}
inline
future<> cache_streamed_mutation::move_to_next_range() {
void cache_streamed_mutation::move_to_next_range() {
++_ck_ranges_curr;
if (_ck_ranges_curr == _ck_ranges_end) {
move_to_end();
return make_ready_future<>();
} else {
return move_to_current_range();
move_to_current_range();
}
}
inline
future<> cache_streamed_mutation::move_to_current_range() {
void cache_streamed_mutation::move_to_current_range() {
_last_row_key = std::experimental::nullopt;
_lower_bound = position_in_partition::for_range_start(*_ck_ranges_curr);
_upper_bound = position_in_partition_view::for_range_end(*_ck_ranges_curr);
auto complete_until_next = _next_row.advance_to(_lower_bound) || _next_row.continuous();
_next_row_in_range = !after_current_range(_next_row.position());
if (!complete_until_next) {
return start_reading_from_underlying();
start_reading_from_underlying();
}
return make_ready_future<>();
}
// _next_row must be inside the range.
inline
future<> cache_streamed_mutation::move_to_next_entry() {
void cache_streamed_mutation::move_to_next_entry() {
if (no_clustering_row_between(*_schema, _next_row.position(), _upper_bound)) {
return move_to_next_range();
move_to_next_range();
} else {
if (!_next_row.next()) {
move_to_end();
return make_ready_future<>();
return;
}
_next_row_in_range = !after_current_range(_next_row.position());
if (!_next_row.continuous()) {
return start_reading_from_underlying();
start_reading_from_underlying();
}
return make_ready_future<>();
}
}
@@ -405,7 +450,7 @@ void cache_streamed_mutation::drain_tombstones() {
inline
void cache_streamed_mutation::add_to_buffer(mutation_fragment&& mf) {
if (mf.is_clustering_row()) {
add_to_buffer(std::move(std::move(mf).as_clustering_row()));
add_clustering_row_to_buffer(std::move(mf));
} else {
assert(mf.is_range_tombstone());
add_to_buffer(std::move(mf).as_range_tombstone());
@@ -415,16 +460,18 @@ void cache_streamed_mutation::add_to_buffer(mutation_fragment&& mf) {
inline
void cache_streamed_mutation::add_to_buffer(const partition_snapshot_row_cursor& row) {
if (!row.dummy()) {
add_to_buffer(row.row());
_read_context->cache().on_row_hit();
add_clustering_row_to_buffer(row.row());
}
}
inline
void cache_streamed_mutation::add_to_buffer(clustering_row&& row) {
void cache_streamed_mutation::add_clustering_row_to_buffer(mutation_fragment&& mf) {
auto& row = mf.as_clustering_row();
drain_tombstones(row.position());
_last_row_key = row.key();
_lower_bound = position_in_partition::after_key(row.key());
push_mutation_fragment(std::move(row));
push_mutation_fragment(std::move(mf));
}
inline
@@ -444,15 +491,20 @@ void cache_streamed_mutation::maybe_add_to_cache(const range_tombstone& rt) {
_lsa_manager.run_in_update_section_with_allocator([&] {
_snp->version()->partition().apply_row_tombstone(*_schema, rt);
});
} else {
_read_context->cache().on_mispopulate();
}
}
inline
void cache_streamed_mutation::maybe_add_to_cache(const static_row& sr) {
if (can_populate()) {
_read_context->cache().on_row_insert();
_lsa_manager.run_in_update_section_with_allocator([&] {
_snp->version()->partition().static_row().apply(*_schema, column_kind::static_column, sr.cells());
});
} else {
_read_context->cache().on_mispopulate();
}
}
@@ -460,6 +512,8 @@ inline
void cache_streamed_mutation::maybe_set_static_row_continuous() {
if (can_populate()) {
_snp->version()->partition().set_static_row_continuous(true);
} else {
_read_context->cache().on_mispopulate();
}
}


@@ -43,10 +43,14 @@ private:
bool advance_to_next_range() {
_in_current = false;
if (!_current_start.is_static_row()) {
if (_current == _end) {
return false;
}
++_current;
}
++_change_counter;
if (_current == _end) {
_current_end = _current_start = position_in_partition_view::after_all_clustered_rows();
return false;
}
_current_start = position_in_partition_view::for_range_start(*_current);
@@ -61,11 +65,18 @@ public:
, _end(ranges.end())
, _in_current(with_static_row)
, _with_static_row(with_static_row)
, _current_start(with_static_row ? position_in_partition_view::for_static_row()
: position_in_partition_view::for_range_start(*_current))
, _current_end(with_static_row ? position_in_partition_view::before_all_clustered_rows()
: position_in_partition_view::for_range_end(*_current))
{ }
, _current_start(position_in_partition_view::for_static_row())
, _current_end(position_in_partition_view::before_all_clustered_rows())
{
if (!with_static_row) {
if (_current == _end) {
_current_start = _current_end = position_in_partition_view::after_all_clustered_rows();
} else {
_current_start = position_in_partition_view::for_range_start(*_current);
_current_end = position_in_partition_view::for_range_end(*_current);
}
}
}
clustering_ranges_walker(clustering_ranges_walker&& o) noexcept
: _schema(o._schema)
, _ranges(o._ranges)
@@ -94,10 +105,6 @@ public:
void trim_front(position_in_partition pos) {
position_in_partition::less_compare less(_schema);
if (_current == _end) {
return;
}
do {
if (!less(_current_start, pos)) {
break;
@@ -118,10 +125,6 @@ public:
bool advance_to(position_in_partition_view pos) {
position_in_partition::less_compare less(_schema);
if (_current == _end) {
return false;
}
do {
if (!_in_current && less(pos, _current_start)) {
break;
@@ -146,12 +149,8 @@ public:
bool advance_to(position_in_partition_view start, position_in_partition_view end) {
position_in_partition::less_compare less(_schema);
if (_current == _end) {
return false;
}
do {
if (less(end, _current_start)) {
if (!less(_current_start, end)) {
break;
}
if (less(start, _current_end)) {
@@ -192,7 +191,7 @@ public:
// Returns true if advanced past all contained positions. Any later advance_to() until reset() will return false.
bool out_of_range() const {
return _current == _end;
return !_in_current && _current == _end;
}
// Resets the state of the walker so that advance_to() can be now called for new sequence of positions.


@@ -499,14 +499,15 @@ public:
, _is_compound(true)
{ }
std::vector<bytes> explode() const {
std::vector<bytes_view> explode() const {
if (!_is_compound) {
return { to_bytes(_bytes) };
return { _bytes };
}
std::vector<bytes> ret;
std::vector<bytes_view> ret;
ret.reserve(8);
for (auto it = begin(), e = end(); it != e; ) {
ret.push_back(to_bytes(it->first));
ret.push_back(it->first);
auto marker = it->second;
++it;
if (it != e && marker != composite::eoc::none) {


@@ -34,7 +34,7 @@ for line in open('/etc/os-release'):
os_ids += value.split(' ')
# distribution "internationalization", converting package names.
# Fedora name is key, values is distro -> package name dict.
# Fedora name is key, values is distro -> package name dict.
i18n_xlat = {
'boost-devel': {
'debian': 'libboost-dev',
@@ -48,7 +48,7 @@ def pkgname(name):
for id in os_ids:
if id in dict:
return dict[id]
return name
return name
def get_flags():
with open('/proc/cpuinfo') as f:
@@ -175,6 +175,7 @@ scylla_tests = [
'tests/keys_test',
'tests/partitioner_test',
'tests/frozen_mutation_test',
'tests/clustering_ranges_walker_test',
'tests/perf/perf_mutation',
'tests/lsa_async_eviction_test',
'tests/lsa_sync_eviction_test',
@@ -640,7 +641,7 @@ for t in tests_not_using_seastar_test_framework:
for t in scylla_tests:
deps[t] = [t + '.cc']
if t not in tests_not_using_seastar_test_framework:
deps[t] += scylla_tests_dependencies
deps[t] += scylla_tests_dependencies
deps[t] += scylla_tests_seastar_deps
else:
deps[t] += scylla_core + api + idls + ['tests/cql_test_env.cc']
@@ -918,7 +919,7 @@ with open(buildfile, 'w') as f:
if binary.startswith('tests/'):
local_libs = '$libs'
if binary not in tests_not_using_seastar_test_framework or binary in pure_boost_tests:
local_libs += ' ' + maybe_static(args.staticboost, '-lboost_unit_test_framework')
local_libs += ' ' + maybe_static(args.staticboost, '-lboost_unit_test_framework')
if has_thrift:
local_libs += ' ' + thrift_libs + ' ' + maybe_static(args.staticboost, '-lboost_system')
# Our code's debugging information is huge, and multiplied


@@ -29,6 +29,15 @@ counter_id counter_id::local()
return counter_id(service::get_local_storage_service().get_local_id());
}
bool counter_id::less_compare_1_7_4::operator()(const counter_id& a, const counter_id& b) const
{
if (a._most_significant != b._most_significant) {
return a._most_significant < b._most_significant;
} else {
return a._least_significant < b._least_significant;
}
}
std::ostream& operator<<(std::ostream& os, const counter_id& id) {
return os << id.to_uuid();
}
@@ -42,6 +51,33 @@ std::ostream& operator<<(std::ostream& os, counter_cell_view ccv) {
return os << "{counter_cell timestamp: " << ccv.timestamp() << " shards: {" << ::join(", ", ccv.shards()) << "}}";
}
void counter_cell_builder::do_sort_and_remove_duplicates()
{
boost::range::sort(_shards, [] (auto& a, auto& b) { return a.id() < b.id(); });
std::vector<counter_shard> new_shards;
new_shards.reserve(_shards.size());
for (auto& cs : _shards) {
if (new_shards.empty() || new_shards.back().id() != cs.id()) {
new_shards.emplace_back(cs);
} else {
new_shards.back().apply(cs);
}
}
_shards = std::move(new_shards);
_sorted = true;
}
std::vector<counter_shard> counter_cell_view::shards_compatible_with_1_7_4() const
{
auto sorted_shards = boost::copy_range<std::vector<counter_shard>>(shards());
counter_id::less_compare_1_7_4 cmp;
boost::range::sort(sorted_shards, [&] (auto& a, auto& b) {
return cmp(a.id(), b.id());
});
return sorted_shards;
}
static bool apply_in_place(atomic_cell_or_collection& dst, atomic_cell_or_collection& src)
{
auto dst_ccmv = counter_cell_mutable_view(dst.as_mutable_atomic_cell());


@@ -36,6 +36,10 @@ class counter_id {
int64_t _least_significant;
int64_t _most_significant;
public:
static_assert(std::is_same<decltype(std::declval<utils::UUID>().get_least_significant_bits()), int64_t>::value
&& std::is_same<decltype(std::declval<utils::UUID>().get_most_significant_bits()), int64_t>::value,
"utils::UUID is expected to work with two signed 64-bit integers");
counter_id() = default;
explicit counter_id(utils::UUID uuid) noexcept
: _least_significant(uuid.get_least_significant_bits())
@@ -49,12 +53,20 @@ public:
bool operator<(const counter_id& other) const {
return to_uuid() < other.to_uuid();
}
bool operator>(const counter_id& other) const {
return other.to_uuid() < to_uuid();
}
bool operator==(const counter_id& other) const {
return to_uuid() == other.to_uuid();
}
bool operator!=(const counter_id& other) const {
return !(*this == other);
}
public:
// (Wrong) Counter ID ordering used by Scylla 1.7.4 and earlier.
struct less_compare_1_7_4 {
bool operator()(const counter_id& a, const counter_id& b) const;
};
public:
static counter_id local();
@@ -139,6 +151,22 @@ private:
static void write(const T& value, bytes::iterator& out) {
out = std::copy_n(reinterpret_cast<const signed char*>(&value), sizeof(T), out);
}
private:
// Shared logic for applying counter_shards and counter_shard_views.
// T is either counter_shard or basic_counter_shard_view<U>.
template<typename T>
GCC6_CONCEPT(requires requires(T shard) {
{ shard.value() } -> int64_t;
{ shard.logical_clock() } -> int64_t;
})
counter_shard& do_apply(T&& other) noexcept {
auto other_clock = other.logical_clock();
if (_logical_clock < other_clock) {
_logical_clock = other_clock;
_value = other.value();
}
return *this;
}
public:
counter_shard(counter_id id, int64_t value, int64_t logical_clock) noexcept
: _id(id)
@@ -163,12 +191,11 @@ public:
}
counter_shard& apply(counter_shard_view other) noexcept {
auto other_clock = other.logical_clock();
if (_logical_clock < other_clock) {
_logical_clock = other_clock;
_value = other.value();
}
return *this;
return do_apply(other);
}
counter_shard& apply(const counter_shard& other) noexcept {
return do_apply(other);
}
static size_t serialized_size() {
@@ -183,6 +210,9 @@ public:
class counter_cell_builder {
std::vector<counter_shard> _shards;
bool _sorted = true;
private:
void do_sort_and_remove_duplicates();
public:
counter_cell_builder() = default;
counter_cell_builder(size_t shard_count) {
@@ -193,6 +223,21 @@ public:
_shards.emplace_back(cs);
}
void add_maybe_unsorted_shard(const counter_shard& cs) {
add_shard(cs);
if (_sorted && _shards.size() > 1) {
auto current = _shards.rbegin();
auto previous = std::next(current);
_sorted = current->id() > previous->id();
}
}
void sort_and_remove_duplicates() {
if (!_sorted) {
do_sort_and_remove_duplicates();
}
}
size_t serialized_size() const {
return _shards.size() * counter_shard::serialized_size();
}
@@ -339,6 +384,9 @@ public:
struct counter_cell_view : basic_counter_cell_view<bytes_view> {
using basic_counter_cell_view::basic_counter_cell_view;
// Returns counter shards in an order that is compatible with Scylla 1.7.4.
std::vector<counter_shard> shards_compatible_with_1_7_4() const;
// Reversibly applies two counter cells, at least one of them must be live.
// Returns true iff dst was modified.
static bool apply_reversibly(atomic_cell_or_collection& dst, atomic_cell_or_collection& src);

cpu_controller.hh Normal file

@@ -0,0 +1,89 @@
/*
* Copyright (C) 2017 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <seastar/core/thread.hh>
#include <seastar/core/timer.hh>
#include <chrono>
// Simple proportional controller to adjust shares of memtable/streaming flushes.
//
// Goal is to flush as fast as we can, but not so fast that we steal all the CPU from incoming
// requests, and at the same time minimize user-visible fluctuations in the flush quota.
//
// What that translates to is we'll try to keep virtual dirty's first derivative at 0 (IOW, we keep
// virtual dirty constant), which means that the rate of incoming writes is equal to the rate of
// flushed bytes.
//
// The exact point at which the controller stops determines the desired flush CPU usage. As we
// approach the hard dirty limit, we need to be more aggressive. We will therefore define two
// thresholds, and increase the constant as we cross them.
//
// 1) the soft limit line
// 2) halfway between soft limit and dirty limit
//
// The constants q1 and q2 are used to determine the proportional factor at each stage.
//
// Below the soft limit, we are in no particular hurry to flush, since it means we're set to
// complete flushing before a new memtable is ready. The quota is dirty * q1, and q1 is set to a
// low number.
//
// The first half of the virtual dirty region is where we expect to be usually, so we have a low
// slope corresponding to a sluggish response between q1 * soft_limit and q2.
//
// In the second half, we're getting close to the hard dirty limit so we increase the slope and
// become more responsive, up to a maximum quota of qmax.
//
// For now we'll just set them in the structure so as not to complicate the constructor. But q1, q2 and
// qmax can easily become parameters if we find another user.
class flush_cpu_controller {
static constexpr float hard_dirty_limit = 0.50;
static constexpr float q1 = 0.01;
static constexpr float q2 = 0.2;
static constexpr float qmax = 1;
float _current_quota = 0.0f;
float _goal;
std::function<float()> _current_dirty;
std::chrono::milliseconds _interval;
timer<> _update_timer;
seastar::thread_scheduling_group _scheduling_group;
seastar::thread_scheduling_group *_current_scheduling_group = nullptr;
void adjust();
public:
seastar::thread_scheduling_group* scheduling_group() {
return _current_scheduling_group;
}
float current_quota() const {
return _current_quota;
}
struct disabled {
seastar::thread_scheduling_group *backup;
};
flush_cpu_controller(disabled d) : _scheduling_group(std::chrono::nanoseconds(0), 0), _current_scheduling_group(d.backup) {}
flush_cpu_controller(std::chrono::milliseconds interval, float soft_limit, std::function<float()> current_dirty);
flush_cpu_controller(flush_cpu_controller&&) = default;
};
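The piecewise-linear curve described in the comment block can be checked with a small standalone sketch — a free function using the same constants as `flush_cpu_controller` (the real `adjust()` also updates the scheduling group, which is omitted here):

```cpp
#include <cassert>
#include <cmath>

// Constants copied from flush_cpu_controller above.
constexpr float hard_dirty_limit = 0.50f;
constexpr float q1 = 0.01f;
constexpr float q2 = 0.2f;
constexpr float qmax = 1.0f;

// Hypothetical standalone version of the quota computation in adjust():
// goal is soft_limit / 2, mid is halfway between goal and the hard limit.
float flush_quota(float dirty, float goal) {
    float mid = goal + (hard_dirty_limit - goal) / 2;
    if (dirty < goal) {
        return dirty * q1 / goal;                               // sluggish below the soft limit
    } else if (dirty < mid) {
        return q1 + (dirty - goal) * (q2 - q1) / (mid - goal);  // first half: low slope
    }
    return q2 + (dirty - mid) * (qmax - q2) / (hard_dirty_limit - mid);  // steep near hard limit
}
```

The curve is continuous at each threshold: it evaluates to q1 exactly at the soft-limit goal, q2 at the midpoint, and reaches qmax at the hard dirty limit.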


@@ -1550,6 +1550,8 @@ basic_unreserved_keyword returns [sstring str]
| K_DISTINCT
| K_CONTAINS
| K_STATIC
| K_FROZEN
| K_TUPLE
| K_FUNCTION
| K_AGGREGATE
| K_SFUNC


@@ -75,6 +75,10 @@ functions::init() {
declare(aggregate_fcts::make_max_function<double>());
declare(aggregate_fcts::make_min_function<double>());
declare(aggregate_fcts::make_count_function<sstring>());
declare(aggregate_fcts::make_max_function<sstring>());
declare(aggregate_fcts::make_min_function<sstring>());
//FIXME:
//declare(aggregate_fcts::make_count_function<bytes>());
//declare(aggregate_fcts::make_max_function<bytes>());


@@ -120,7 +120,7 @@ public:
if (restriction->is_slice()) {
throw exceptions::invalid_request_exception(sprint(
"PRIMARY KEY column \"%s\" cannot be restricted (preceding column \"%s\" is restricted by a non-EQ relation)",
_restrictions->next_column(new_column)->name_as_text(), new_column.name_as_text()));
last_column.name_as_text(), new_column.name_as_text()));
}
}


@@ -63,7 +63,7 @@ void cql3::statements::alter_keyspace_statement::validate(distributed<service::s
service::get_local_storage_proxy().get_db().local().find_keyspace(_name); // throws on failure
auto tmp = _name;
std::transform(tmp.begin(), tmp.end(), tmp.begin(), ::tolower);
if (tmp == db::system_keyspace::NAME) {
if (is_system_keyspace(tmp)) {
throw exceptions::invalid_request_exception("Cannot alter system keyspace");
}


@@ -41,6 +41,8 @@
#include "cql3/statements/cf_prop_defs.hh"
#include <boost/algorithm/string/predicate.hpp>
namespace cql3 {
namespace statements {
@@ -65,6 +67,8 @@ const sstring cf_prop_defs::KW_CRC_CHECK_CHANCE = "crc_check_chance";
const sstring cf_prop_defs::COMPACTION_STRATEGY_CLASS_KEY = "class";
const sstring cf_prop_defs::COMPACTION_ENABLED_KEY = "enabled";
void cf_prop_defs::validate() {
// Skip validation if the compaction strategy class is already set as it means we've already
// prepared (and redoing it would set strategyClass back to null, which we don't want)
@@ -188,6 +192,13 @@ void cf_prop_defs::apply_to_builder(schema_builder& builder) {
builder.set_min_compaction_threshold(min_compaction_threshold);
builder.set_max_compaction_threshold(max_compaction_threshold);
if (has_property(KW_COMPACTION)) {
if (get_compaction_options().count(COMPACTION_ENABLED_KEY)) {
auto enabled = boost::algorithm::iequals(get_compaction_options().at(COMPACTION_ENABLED_KEY), "true");
builder.set_compaction_enabled(enabled);
}
}
builder.set_default_time_to_live(gc_clock::duration(get_int(KW_DEFAULT_TIME_TO_LIVE, DEFAULT_DEFAULT_TIME_TO_LIVE)));
if (has_property(KW_SPECULATIVE_RETRY)) {


@@ -73,6 +73,7 @@ public:
static const sstring KW_CRC_CHECK_CHANCE;
static const sstring COMPACTION_STRATEGY_CLASS_KEY;
static const sstring COMPACTION_ENABLED_KEY;
// FIXME: In origin the following consts are in CFMetaData.
static constexpr int32_t DEFAULT_DEFAULT_TIME_TO_LIVE = 0;


@@ -72,7 +72,7 @@ void create_keyspace_statement::validate(distributed<service::storage_proxy>&, c
std::string name;
name.resize(_name.length());
std::transform(_name.begin(), _name.end(), name.begin(), ::tolower);
if (name == db::system_keyspace::NAME) {
if (is_system_keyspace(name)) {
throw exceptions::invalid_request_exception("system keyspace is not user-modifiable");
}
// keyspace name


@@ -65,13 +65,13 @@
#include <core/fstream.hh>
#include <seastar/core/enum.hh>
#include "utils/latency.hh"
#include "utils/flush_queue.hh"
#include "schema_registry.hh"
#include "service/priority_manager.hh"
#include "cell_locking.hh"
#include <seastar/core/execution_stage.hh>
#include "view_info.hh"
#include "memtable-sstable.hh"
#include "db/schema_tables.hh"
#include "checked-file-impl.hh"
#include "disk-error-handler.hh"
@@ -84,28 +84,10 @@ static const std::unordered_set<sstring> system_keyspaces = {
db::system_keyspace::NAME, db::schema_tables::NAME
};
static bool is_system_keyspace(const sstring& name) {
bool is_system_keyspace(const sstring& name) {
return system_keyspaces.find(name) != system_keyspaces.end();
}
// Slight extension to the flush_queue type.
class column_family::memtable_flush_queue : public utils::flush_queue<db::replay_position> {
public:
template<typename Func, typename Post>
auto run_cf_flush(db::replay_position rp, Func&& func, Post&& post) {
// special case: empty rp, yet still data.
// We generate a few memtables with no valid, "high_rp", yet
// still containing data -> actual flush.
// And to make matters worse, we can initiate a flush of N such
// tables at the same time.
// Just queue them at the end of the queue and treat them as such.
if (rp == db::replay_position() && !empty()) {
rp = highest_key();
}
return run_with_ordered_post_op(rp, std::forward<Func>(func), std::forward<Post>(post));
}
};
// Used for tests where the CF exists without a database object. We need to pass a valid
// dirty_memory manager in that case.
thread_local dirty_memory_manager default_dirty_memory_manager;
@@ -147,7 +129,6 @@ column_family::column_family(schema_ptr schema, config config, db::commitlog* cl
, _cache(_schema, sstables_as_snapshot_source(), global_cache_tracker())
, _commitlog(cl)
, _compaction_manager(compaction_manager)
, _flush_queue(std::make_unique<memtable_flush_queue>())
, _counter_cell_locks(std::make_unique<cell_locker>(_schema, cl_stats))
{
if (!_config.enable_disk_writes) {
@@ -190,7 +171,6 @@ column_family::sstables_as_mutation_source() {
snapshot_source
column_family::sstables_as_snapshot_source() {
return snapshot_source([this] () {
// FIXME: Will keep sstables on disk until next memtable flush. Make compaction force cache refresh.
auto sst_set = _sstables;
return mutation_source([this, sst_set = std::move(sst_set)] (schema_ptr s,
const dht::partition_range& r,
@@ -779,6 +759,9 @@ column_family::open_sstable(sstables::foreign_sstable_open_info info, sstring di
}
void column_family::load_sstable(sstables::shared_sstable& sst, bool reset_level) {
if (schema()->is_counter() && !sst->has_scylla_component()) {
throw std::runtime_error("Loading non-Scylla SSTables containing counters is not supported. Use sstableloader instead.");
}
auto shards = sst->get_shards_for_this_sstable();
if (belongs_to_other_shard(shards)) {
// If we're here, this sstable is shared by this and other
@@ -890,17 +873,19 @@ column_family::seal_active_streaming_memtable_immediate() {
//
// Lastly, we don't have any commitlog RP to update, and we don't need to manipulate the
// memtable list, since this memtable was not available for reading up until this point.
return write_memtable_to_sstable(*old, newtab, incremental_backups_enabled(), priority).then([this, newtab, old] {
return write_memtable_to_sstable(*old, newtab, incremental_backups_enabled(), priority, false, _config.background_writer_scheduling_group).then([this, newtab, old] {
return newtab->open_data();
}).then([this, old, newtab] () {
add_sstable(newtab, {engine().cpu_id()});
trigger_compaction();
// Cache synchronization must be started atomically with add_sstable()
if (_config.enable_cache) {
return _cache.update_invalidating(*old);
} else {
return old->clear_gently();
}
return with_semaphore(_cache_update_sem, 1, [this, newtab, old] {
add_sstable(newtab, {engine().cpu_id()});
trigger_compaction();
// Cache synchronization must be started atomically with add_sstable()
if (_config.enable_cache) {
return _cache.update_invalidating(*old);
} else {
return old->clear_gently();
}
});
}).handle_exception([old] (auto ep) {
dblog.error("failed to write streamed sstable: {}", ep);
return make_exception_future<>(ep);
@@ -937,7 +922,7 @@ future<> column_family::seal_active_streaming_memtable_big(streaming_memtable_bi
newtab->set_unshared();
auto&& priority = service::get_local_streaming_write_priority();
return write_memtable_to_sstable(*old, newtab, incremental_backups_enabled(), priority, true).then([this, newtab, old, &smb] {
return write_memtable_to_sstable(*old, newtab, incremental_backups_enabled(), priority, true, _config.background_writer_scheduling_group).then([this, newtab, old, &smb] {
smb.sstables.emplace_back(newtab);
}).handle_exception([] (auto ep) {
dblog.error("failed to write streamed sstable: {}", ep);
@@ -955,34 +940,32 @@ column_family::seal_active_memtable(memtable_list::flush_behavior ignored) {
if (old->empty()) {
dblog.debug("Memtable is empty");
return make_ready_future<>();
return _flush_barrier.advance_and_await();
}
_memtables->add_memtable();
_stats.memtable_switch_count++;
auto previous_flush = _flush_barrier.advance_and_await();
auto op = _flush_barrier.start();
assert(_highest_flushed_rp < old->replay_position()
|| (_highest_flushed_rp == db::replay_position() && old->replay_position() == db::replay_position())
);
_highest_flushed_rp = old->replay_position();
auto memtable_size = old->occupancy().total_space();
return _flush_queue->run_cf_flush(old->replay_position(), [old, this] {
auto memtable_size = old->occupancy().total_space();
_stats.pending_flushes++;
_config.cf_stats->pending_memtables_flushes_count++;
_config.cf_stats->pending_memtables_flushes_bytes += memtable_size;
_config.cf_stats->pending_memtables_flushes_count++;
_config.cf_stats->pending_memtables_flushes_bytes += memtable_size;
return repeat([this, old] {
return repeat([this, old] {
return with_lock(_sstables_lock.for_read(), [this, old] {
_flush_queue->check_open_gate();
return try_flush_memtable_to_sstable(old);
});
}).then([this, memtable_size] {
}).then([this, memtable_size, old, op = std::move(op), previous_flush = std::move(previous_flush)] () mutable {
_stats.pending_flushes--;
_config.cf_stats->pending_memtables_flushes_count--;
_config.cf_stats->pending_memtables_flushes_bytes -= memtable_size;
});
}, [old, this] {
if (_commitlog) {
_commitlog->discard_completed_segments(_schema->id(), old->rp_set());
}
return previous_flush.finally([op = std::move(op)] { });
});
// FIXME: release commit log
// FIXME: provide back-pressure to upper layers
@@ -1011,7 +994,7 @@ column_family::try_flush_memtable_to_sstable(lw_shared_ptr<memtable> old) {
// The code as is guarantees that we'll never partially backup a
// single sstable, so that is enough of a guarantee.
auto&& priority = service::get_local_memtable_flush_priority();
return write_memtable_to_sstable(*old, newtab, incremental_backups_enabled(), priority).then([this, newtab, old] {
return write_memtable_to_sstable(*old, newtab, incremental_backups_enabled(), priority, false, _config.memtable_scheduling_group).then([this, newtab, old] {
return newtab->open_data();
}).then_wrapped([this, old, newtab] (future<> ret) {
dblog.debug("Flushing to {} done", newtab->get_filename());
@@ -1067,9 +1050,7 @@ column_family::stop() {
return when_all(_memtables->request_flush(), _streaming_memtables->request_flush()).discard_result().finally([this] {
return _compaction_manager.remove(this).then([this] {
// Nest, instead of using when_all, so we don't lose any exceptions.
return _flush_queue->close().then([this] {
return _streaming_flush_gate.close();
});
return _streaming_flush_gate.close();
}).then([this] {
return _sstable_deletion_gate.close();
});
@@ -1123,7 +1104,10 @@ distributed_loader::flush_upload_dir(distributed<database>& db, sstring ks_name,
auto gen = cf.calculate_generation_for_new_table();
// Read toc content as it will be needed for moving and deleting a sstable.
return sst->read_toc().then([sst] {
return sst->read_toc().then([sst, s = cf.schema()] {
if (s->is_counter() && !sst->has_scylla_component()) {
return make_exception_future<>(std::runtime_error("Loading non-Scylla SSTables containing counters is not supported. Use sstableloader instead."));
}
return sst->mutate_sstable_level(0);
}).then([&cf, sst, gen] {
return sst->create_links(cf._config.datadir, gen);
@@ -1208,20 +1192,22 @@ void column_family::set_metrics() {
auto cf = column_family_label(_schema->cf_name());
auto ks = keyspace_label(_schema->ks_name());
namespace ms = seastar::metrics;
_metrics.add_group("column_family", {
ms::make_histogram("read_latency", ms::description("Read latency histogram"), [this] {return _stats.estimated_read.get_histogram();})(cf)(ks),
ms::make_histogram("write_latency", ms::description("Write latency histogram"), [this] {return _stats.estimated_write.get_histogram();})(cf)(ks),
ms::make_derive("memtable_switch", ms::description("Number of times flush has resulted in the memtable being switched out"), _stats.memtable_switch_count)(cf)(ks),
ms::make_gauge("pending_taks", ms::description("Estimated number of tasks pending for this column family"), _stats.pending_flushes)(cf)(ks),
ms::make_gauge("live_disk_space", ms::description("Live disk space used"), _stats.live_disk_space_used)(cf)(ks),
ms::make_gauge("total_disk_space", ms::description("Total disk space used"), _stats.total_disk_space_used)(cf)(ks),
ms::make_gauge("live_sstable", ms::description("Live sstable count"), _stats.live_sstable_count)(cf)(ks),
ms::make_gauge("pending_compaction", ms::description("Estimated number of compactions pending for this column family"), _stats.pending_compactions)(cf)(ks)
});
if (_schema->ks_name() != db::system_keyspace::NAME) {
if (_config.enable_metrics_reporting) {
_metrics.add_group("column_family", {
ms::make_gauge("cache_hit_rate", ms::description("Cache hit rate"), [this] {return float(_global_cache_hit_rate);})(cf)(ks)
ms::make_derive("memtable_switch", ms::description("Number of times flush has resulted in the memtable being switched out"), _stats.memtable_switch_count)(cf)(ks),
ms::make_gauge("pending_tasks", ms::description("Estimated number of tasks pending for this column family"), _stats.pending_flushes)(cf)(ks),
ms::make_gauge("live_disk_space", ms::description("Live disk space used"), _stats.live_disk_space_used)(cf)(ks),
ms::make_gauge("total_disk_space", ms::description("Total disk space used"), _stats.total_disk_space_used)(cf)(ks),
ms::make_gauge("live_sstable", ms::description("Live sstable count"), _stats.live_sstable_count)(cf)(ks),
ms::make_gauge("pending_compaction", ms::description("Estimated number of compactions pending for this column family"), _stats.pending_compactions)(cf)(ks)
});
if (_schema->ks_name() != db::system_keyspace::NAME && _schema->ks_name() != db::schema_tables::v3::NAME && _schema->ks_name() != "system_traces") {
_metrics.add_group("column_family", {
ms::make_histogram("read_latency", ms::description("Read latency histogram"), [this] {return _stats.estimated_read.get_histogram(std::chrono::microseconds(100));})(cf)(ks),
ms::make_histogram("write_latency", ms::description("Write latency histogram"), [this] {return _stats.estimated_write.get_histogram(std::chrono::microseconds(100));})(cf)(ks),
ms::make_gauge("cache_hit_rate", ms::description("Cache hit rate"), [this] {return float(_global_cache_hit_rate);})(cf)(ks)
});
}
}
}
@@ -1311,6 +1297,10 @@ column_family::rebuild_sstable_list(const std::vector<sstables::shared_sstable>&
} catch (sstables::atomic_deletion_cancelled& adc) {
dblog.debug("Failed to delete sstables after compaction: {}", adc);
}
}).then([this] {
// refresh underlying data source in row cache to prevent it from holding reference
// to sstables files which were previously deleted.
_cache.refresh_snapshot();
});
});
}
@@ -1366,7 +1356,7 @@ column_family::compact_sstables(sstables::compaction_descriptor descriptor, bool
return sst;
};
return sstables::compact_sstables(*sstables_to_compact, *this, create_sstable, descriptor.max_sstable_bytes, descriptor.level,
cleanup).then([this, sstables_to_compact] (auto new_sstables) {
cleanup, _config.background_writer_scheduling_group).then([this, sstables_to_compact] (auto new_sstables) {
_compaction_strategy.notify_completion(*sstables_to_compact, new_sstables);
return this->rebuild_sstable_list(new_sstables, *sstables_to_compact);
});
@@ -1374,7 +1364,7 @@ column_family::compact_sstables(sstables::compaction_descriptor descriptor, bool
}
static bool needs_cleanup(const lw_shared_ptr<sstables::sstable>& sst,
const lw_shared_ptr<dht::token_range_vector>& owned_ranges,
const dht::token_range_vector& owned_ranges,
schema_ptr s) {
auto first = sst->get_first_partition_key();
auto last = sst->get_last_partition_key();
@@ -1383,7 +1373,7 @@ static bool needs_cleanup(const lw_shared_ptr<sstables::sstable>& sst,
dht::token_range sst_token_range = dht::token_range::make(first_token, last_token);
// return true iff sst partition range isn't fully contained in any of the owned ranges.
for (auto& r : *owned_ranges) {
for (auto& r : owned_ranges) {
if (r.contains(sst_token_range, dht::token_comparator())) {
return false;
}
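The containment test in `needs_cleanup()` can be modeled with plain integer intervals — a hypothetical sketch (not the dht::token_range API) of "cleanup is needed iff the sstable's span is not fully inside any single owned range":

```cpp
#include <cassert>
#include <utility>
#include <vector>

// Hypothetical integer-interval model of needs_cleanup().
using range = std::pair<int, int>;   // inclusive [start, end]

bool needs_cleanup(range sst, const std::vector<range>& owned) {
    for (const auto& r : owned) {
        if (r.first <= sst.first && sst.second <= r.second) {
            return false;   // fully contained: nothing foreign to clean out
        }
    }
    return true;            // spills outside every single owned range
}
```

Note that a span straddling two owned ranges still needs cleanup: containment must hold within one range, not their union.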
@@ -1393,11 +1383,10 @@ static bool needs_cleanup(const lw_shared_ptr<sstables::sstable>& sst,
future<> column_family::cleanup_sstables(sstables::compaction_descriptor descriptor) {
dht::token_range_vector r = service::get_local_storage_service().get_local_ranges(_schema->ks_name());
auto owned_ranges = make_lw_shared<dht::token_range_vector>(std::move(r));
auto sstables_to_cleanup = make_lw_shared<std::vector<sstables::shared_sstable>>(std::move(descriptor.sstables));
return do_for_each(*sstables_to_cleanup, [this, owned_ranges = std::move(owned_ranges), sstables_to_cleanup] (auto& sst) {
if (!owned_ranges->empty() && !needs_cleanup(sst, owned_ranges, _schema)) {
return do_with(std::move(descriptor.sstables), std::move(r), [this] (auto& sstables, auto& owned_ranges) {
return do_for_each(sstables, [this, &owned_ranges] (auto& sst) {
if (!owned_ranges.empty() && !needs_cleanup(sst, owned_ranges, _schema)) {
return make_ready_future<>();
}
@@ -1411,6 +1400,7 @@ future<> column_family::cleanup_sstables(sstables::compaction_descriptor descrip
return this->compact_sstables(sstables::compaction_descriptor({ sst }, sst->get_sstable_level()), true);
});
});
});
}
// FIXME: this is just an example, should be changed to something more general
@@ -1733,7 +1723,7 @@ void distributed_loader::reshard(distributed<database>& db, sstring ks_name, sst
gc_clock::now(), default_io_error_handler_gen());
return sst;
};
auto f = sstables::reshard_sstables(sstables, *cf, creator, max_sstable_bytes, level);
auto f = sstables::reshard_sstables(sstables, *cf, creator, max_sstable_bytes, level, cf->background_writer_scheduling_group());
return f.then([&cf, sstables = std::move(sstables)] (std::vector<sstables::shared_sstable> new_sstables) mutable {
// an input sstable may belong to shard 1 and 2 and only have data which
@@ -1805,15 +1795,17 @@ future<> distributed_loader::load_new_sstables(distributed<database>& db, sstrin
}).then([&db, ks, cf] {
return db.invoke_on_all([ks = std::move(ks), cfname = std::move(cf)] (database& db) {
auto& cf = db.find_column_family(ks, cfname);
// atomically load all opened sstables into column family.
for (auto& sst : cf._sstables_opened_but_not_loaded) {
cf.load_sstable(sst, true);
}
cf._sstables_opened_but_not_loaded.clear();
cf.trigger_compaction();
// Drop entire cache for this column family because it may be populated
// with stale data.
return cf.get_row_cache().invalidate();
return with_semaphore(cf._cache_update_sem, 1, [&cf] {
// atomically load all opened sstables into column family.
for (auto& sst : cf._sstables_opened_but_not_loaded) {
cf.load_sstable(sst, true);
}
cf._sstables_opened_but_not_loaded.clear();
cf.trigger_compaction();
// Drop entire cache for this column family because it may be populated
// with stale data.
return cf.get_row_cache().invalidate();
});
});
}).then([&db, ks, cf] () mutable {
return smp::submit_to(0, [&db, ks = std::move(ks), cf = std::move(cf)] () mutable {
@@ -1989,6 +1981,15 @@ future<> distributed_loader::populate_column_family(distributed<database>& db, s
}
inline
flush_cpu_controller
make_flush_cpu_controller(db::config& cfg, seastar::thread_scheduling_group* backup, std::function<double()> fn) {
if (cfg.auto_adjust_flush_quota()) {
return flush_cpu_controller(250ms, cfg.virtual_dirty_soft_limit(), std::move(fn));
}
return flush_cpu_controller(flush_cpu_controller::disabled{backup});
}
utils::UUID database::empty_version = utils::UUID_gen::get_name_UUID(bytes{});
database::database() : database(db::config())
@@ -2002,6 +2003,10 @@ database::database(const db::config& cfg)
, _system_dirty_memory_manager(*this, 10 << 20, cfg.virtual_dirty_soft_limit())
, _dirty_memory_manager(*this, memory::stats().total_memory() * 0.45, cfg.virtual_dirty_soft_limit())
, _streaming_dirty_memory_manager(*this, memory::stats().total_memory() * 0.10, cfg.virtual_dirty_soft_limit())
, _background_writer_scheduling_group(1ms, _cfg->background_writer_scheduling_quota())
, _memtable_cpu_controller(make_flush_cpu_controller(*_cfg, &_background_writer_scheduling_group, [this, limit = 2.0f * _dirty_memory_manager.throttle_threshold()] {
return (_dirty_memory_manager.virtual_dirty_memory()) / limit;
}))
, _version(empty_version)
, _enable_incremental_backups(cfg.incremental_backups())
{
@@ -2011,6 +2016,32 @@ database::database(const db::config& cfg)
dblog.info("Row: max_vector_size: {}, internal_count: {}", size_t(row::max_vector_size), size_t(row::internal_count));
}
void flush_cpu_controller::adjust() {
auto mid = _goal + (hard_dirty_limit - _goal) / 2;
auto dirty = _current_dirty();
if (dirty < _goal) {
_current_quota = dirty * q1 / _goal;
} else if ((dirty >= _goal) && (dirty < mid)) {
_current_quota = q1 + (dirty - _goal) * (q2 - q1)/(mid - _goal);
} else {
_current_quota = q2 + (dirty - mid) * (qmax - q2) / (hard_dirty_limit - mid);
}
dblog.trace("dirty {}, goal {}, mid {} quota {}", dirty, _goal, mid, _current_quota);
_scheduling_group.update_usage(_current_quota);
}
flush_cpu_controller::flush_cpu_controller(std::chrono::milliseconds interval, float soft_limit, std::function<float()> current_dirty)
: _goal(soft_limit / 2)
, _current_dirty(std::move(current_dirty))
, _interval(interval)
, _update_timer([this] { adjust(); })
, _scheduling_group(1ms, 0.0f)
, _current_scheduling_group(&_scheduling_group)
{
_update_timer.arm_periodic(_interval);
}
void
dirty_memory_manager::setup_collectd(sstring namestr) {
@@ -2108,6 +2139,14 @@ database::setup_metrics() {
sm::make_gauge("queued_reads", [this] { return _read_concurrency_sem.waiters(); },
sm::description("Holds the number of currently queued read operations.")),
sm::make_gauge("active_reads_streaming", [this] { return max_streaming_concurrent_reads() - _streaming_concurrency_sem.current(); },
sm::description(seastar::format("Holds the number of currently active read operations issued on behalf of streaming. "
"If this value gets close to {} we are likely to start dropping new read requests. "
"In that case sstable_read_queue_overloads is going to get a non-zero value.", max_streaming_concurrent_reads()))),
sm::make_gauge("queued_reads_streaming", [this] { return _streaming_concurrency_sem.waiters(); },
sm::description("Holds the number of currently queued read operations on behalf of streaming.")),
sm::make_gauge("active_reads_system_keyspace", [this] { return max_system_concurrent_reads() - _system_read_concurrency_sem.current(); },
sm::description(seastar::format("Holds the number of currently active read operations from \"system\" keyspace tables. "
"If this value gets close to {} we are likely to start dropping new read requests. "
@@ -2119,6 +2158,9 @@ database::setup_metrics() {
sm::make_gauge("total_result_bytes", [this] { return get_result_memory_limiter().total_used_memory(); },
sm::description("Holds the current amount of memory used for results.")),
sm::make_gauge("cpu_flush_quota", [this] { return _memtable_cpu_controller.current_quota(); },
sm::description("The current quota for memtable CPU scheduling group")),
sm::make_derive("short_data_queries", _stats->short_data_queries,
sm::description("The rate of data queries (data or digest reads) that returned less rows than requested due to result size limiting.")),
@@ -2330,7 +2372,7 @@ database::init_commitlog() {
_commitlog->discard_completed_segments(id);
return;
}
_column_families[id]->flush(pos);
_column_families[id]->flush();
}).release(); // we have longer life time than CL. Ignore reg anchor
});
}
@@ -2444,12 +2486,12 @@ void database::remove(const column_family& cf) {
}
}
future<> database::drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func tsf) {
future<> database::drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func tsf, bool snapshot) {
auto uuid = find_uuid(ks_name, cf_name);
auto cf = _column_families.at(uuid);
remove(*cf);
auto& ks = find_keyspace(ks_name);
return truncate(ks, *cf, std::move(tsf)).then([this, cf] {
return truncate(ks, *cf, std::move(tsf), snapshot).then([this, cf] {
return cf->stop();
}).then([this, cf] {
return make_ready_future<>();
@@ -2589,6 +2631,9 @@ keyspace::make_column_family_config(const schema& s, const db::config& db_config
cfg.streaming_read_concurrency_config = _config.streaming_read_concurrency_config;
cfg.cf_stats = _config.cf_stats;
cfg.enable_incremental_backups = _config.enable_incremental_backups;
cfg.background_writer_scheduling_group = _config.background_writer_scheduling_group;
cfg.memtable_scheduling_group = _config.memtable_scheduling_group;
cfg.enable_metrics_reporting = db_config.enable_keyspace_column_family_metrics();
return cfg;
}
@@ -3035,7 +3080,7 @@ void column_family::apply_streaming_big_mutation(schema_ptr m_schema, utils::UUI
void
column_family::check_valid_rp(const db::replay_position& rp) const {
if (rp != db::replay_position() && rp < _lowest_allowed_rp) {
throw replay_position_reordered_exception();
throw mutation_reordered_with_truncate_exception();
}
}
@@ -3079,10 +3124,6 @@ lw_shared_ptr<memtable> memtable_list::new_memtable() {
}
future<> dirty_memory_manager::flush_one(memtable_list& mtlist, semaphore_units<> permit) {
if (mtlist.back()->empty()) {
return make_ready_future<>();
}
auto* region = &(mtlist.back()->region());
auto schema = mtlist.back()->schema();
@@ -3185,25 +3226,24 @@ future<mutation> database::apply_counter_update(schema_ptr s, const frozen_mutat
}
}
static future<> maybe_handle_reorder(std::exception_ptr exp) {
try {
std::rethrow_exception(exp);
return make_exception_future(exp);
} catch (mutation_reordered_with_truncate_exception&) {
// This mutation raced with a truncate, so we can just drop it.
dblog.debug("replay_position reordering detected");
return make_ready_future<>();
}
}
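The rethrow-and-filter shape of `maybe_handle_reorder()` can be shown without Seastar futures — a minimal sketch with a hypothetical exception type, returning a bool instead of a future:

```cpp
#include <cassert>
#include <exception>
#include <stdexcept>

// Hypothetical stand-in for mutation_reordered_with_truncate_exception.
struct reordered_with_truncate : std::exception {};

// Returns true if the exception was the benign reorder case (the mutation
// raced with a truncate, so it is safe to drop); any other exception
// propagates unchanged to the caller.
bool drop_if_reordered(std::exception_ptr ep) {
    try {
        std::rethrow_exception(ep);   // [[noreturn]]: always throws
    } catch (const reordered_with_truncate&) {
        return true;                  // swallow: the write is obsolete anyway
    }
    // unreachable: any other exception type escapes through the catch
}
```

Rethrowing inside a handler is the standard way to pattern-match on a type-erased `std::exception_ptr`; the commit replaces two inline copies of this logic with one shared helper.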
future<> database::apply_with_commitlog(column_family& cf, const mutation& m, timeout_clock::time_point timeout) {
if (cf.commitlog() != nullptr) {
return do_with(freeze(m), [this, &m, &cf, timeout] (frozen_mutation& fm) {
commitlog_entry_writer cew(m.schema(), fm);
return cf.commitlog()->add_entry(m.schema()->id(), cew, timeout);
}).then([this, &m, &cf, timeout] (db::rp_handle h) {
return apply_in_memory(m, cf, std::move(h), timeout).handle_exception([this, &cf, &m, timeout] (auto ep) {
try {
std::rethrow_exception(ep);
} catch (replay_position_reordered_exception&) {
// expensive, but we're assuming this is super rare.
// if we failed to apply the mutation due to future re-ordering
// (which should be the ever only reason for rp mismatch in CF)
// let's just try again, add the mutation to the CL once more,
// and assume success in inevitable eventually.
dblog.debug("replay_position reordering detected");
return this->apply_with_commitlog(cf, m, timeout);
}
});
return apply_in_memory(m, cf, std::move(h), timeout).handle_exception(maybe_handle_reorder);
});
}
return apply_in_memory(m, cf, {}, timeout);
@@ -3214,19 +3254,7 @@ future<> database::apply_with_commitlog(schema_ptr s, column_family& cf, utils::
if (cl != nullptr) {
commitlog_entry_writer cew(s, m);
return cf.commitlog()->add_entry(uuid, cew, timeout).then([&m, this, s, timeout, cl](db::rp_handle h) {
return this->apply_in_memory(m, s, std::move(h), timeout).handle_exception([this, s, &m, timeout] (auto ep) {
try {
std::rethrow_exception(ep);
} catch (replay_position_reordered_exception&) {
// expensive, but we're assuming this is super rare.
// if we failed to apply the mutation due to future re-ordering
// (which should be the ever only reason for rp mismatch in CF)
// let's just try again, add the mutation to the CL once more,
// and assume success in inevitable eventually.
dblog.debug("replay_position reordering detected");
return this->apply(s, m, timeout);
}
});
return this->apply_in_memory(m, s, std::move(h), timeout).handle_exception(maybe_handle_reorder);
});
}
return apply_in_memory(m, std::move(s), {}, timeout);
@@ -3317,10 +3345,17 @@ database::make_keyspace_config(const keyspace_metadata& ksm) {
++_stats->sstable_read_queue_overloaded;
throw std::runtime_error("sstable inactive read queue overloaded");
};
cfg.streaming_read_concurrency_config = cfg.read_concurrency_config;
cfg.streaming_read_concurrency_config.timeout = {};
// No timeouts or queue length limits - a failure here can kill an entire repair.
// Trust the caller to limit concurrency.
cfg.streaming_read_concurrency_config.sem = &_streaming_concurrency_sem;
cfg.cf_stats = &_cf_stats;
cfg.enable_incremental_backups = _enable_incremental_backups;
if (_cfg->background_writer_scheduling_quota() < 1.0f) {
cfg.background_writer_scheduling_group = &_background_writer_scheduling_group;
cfg.memtable_scheduling_group = _memtable_cpu_controller.scheduling_group();
}
cfg.enable_metrics_reporting = _cfg->enable_keyspace_column_family_metrics();
return cfg;
}
@@ -3444,10 +3479,10 @@ future<> database::truncate(sstring ksname, sstring cfname, timestamp_func tsf)
return truncate(ks, cf, std::move(tsf));
}
future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_func tsf)
future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_func tsf, bool with_snapshot)
{
const auto durable = ks.metadata()->durable_writes();
const auto auto_snapshot = get_config().auto_snapshot();
const auto auto_snapshot = with_snapshot && get_config().auto_snapshot();
// Force mutations coming in to re-acquire higher rp:s
// This creates a "soft" ordering, in that we will guarantee that
@@ -3774,35 +3809,6 @@ future<std::unordered_map<sstring, column_family::snapshot_details>> column_fami
}
future<> column_family::flush() {
_stats.pending_flushes++;
// highest_flushed_rp is only updated when we flush. If the memtable is currently alive, then
// the most up-to-date replay position is the one that's in there now. Otherwise, if the memtable
// hasn't received any writes yet, that's the one from the last flush we made.
auto desired_rp = _memtables->back()->empty() ? _highest_flushed_rp : _memtables->back()->replay_position();
return _memtables->request_flush().finally([this, desired_rp] {
_stats.pending_flushes--;
// In origin memtable_switch_count is incremented inside
// ColumnFamilyMetrics Flush.run
_stats.memtable_switch_count++;
// wait for all up until us.
return _flush_queue->wait_for_pending(desired_rp);
});
}
future<> column_family::flush(const db::replay_position& pos) {
// Technically possible if we've already issued the
// sstable write, but it is not done yet.
if (pos < _highest_flushed_rp) {
return make_ready_future<>();
}
// TODO: Origin looks at "secondary" memtables
// It also considers "minReplayPosition", which is simply where
// the CL "started" (the first ever RP in this run).
// We ignore this for now and just say that if we're asked for
// a CF and it exists, we pretty much have to have data that needs
// flushing. Let's do it.
return _memtables->request_flush();
}
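The replay-position bookkeeping in the two flush paths above can be condensed into two small decisions. A minimal sketch with a stand-in `replay_position` type (hypothetical names, assuming positions are totally ordered as in the real `db::replay_position`):

```cpp
// Toy replay position: a single ordered counter.
struct replay_position {
    long value;
    bool operator<(const replay_position& o) const { return value < o.value; }
};

// column_family::flush(): if the active memtable already holds writes, its
// own position is the newest; if it is empty, the last flushed position is
// still the high-water mark to wait for.
replay_position desired_flush_rp(bool memtable_empty,
                                 replay_position highest_flushed_rp,
                                 replay_position memtable_rp) {
    return memtable_empty ? highest_flushed_rp : memtable_rp;
}

// column_family::flush(pos): everything up to pos is already durable, so a
// flush request strictly below the high-water mark is a no-op.
bool flush_needed(replay_position pos, replay_position highest_flushed_rp) {
    return !(pos < highest_flushed_rp);
}
```

Note the comparison in `flush_needed` matches the early return in the hunk: only `pos < _highest_flushed_rp` short-circuits; an equal position still triggers a flush request, since the sstable write may have been issued but not completed.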
@@ -3824,12 +3830,14 @@ future<> column_family::flush_streaming_mutations(utils::UUID plan_id, dht::part
return _streaming_memtables->seal_active_memtable(memtable_list::flush_behavior::delayed).then([this] {
return _streaming_flush_phaser.advance_and_await();
}).then([this, sstables = std::move(sstables), ranges = std::move(ranges)] () mutable {
for (auto&& sst : sstables) {
// seal_active_streaming_memtable_big() ensures sst is unshared.
this->add_sstable(sst, {engine().cpu_id()});
}
this->trigger_compaction();
return _cache.invalidate(std::move(ranges));
return with_semaphore(_cache_update_sem, 1, [this, sstables = std::move(sstables), ranges = std::move(ranges)] () mutable {
for (auto&& sst : sstables) {
// seal_active_streaming_memtable_big() ensures sst is unshared.
this->add_sstable(sst, {engine().cpu_id()});
}
this->trigger_compaction();
return _cache.invalidate(std::move(ranges));
});
});
});
});
@@ -4119,11 +4127,12 @@ void column_family::drop_hit_rate(gms::inet_address addr) {
}
future<>
write_memtable_to_sstable(memtable& mt, sstables::shared_sstable sst, bool backup, const io_priority_class& pc, bool leave_unsealed) {
write_memtable_to_sstable(memtable& mt, sstables::shared_sstable sst, bool backup, const io_priority_class& pc, bool leave_unsealed, seastar::thread_scheduling_group *tsg) {
sstables::sstable_writer_config cfg;
cfg.replay_position = mt.replay_position();
cfg.backup = backup;
cfg.leave_unsealed = leave_unsealed;
cfg.thread_scheduling_group = tsg;
return sst->write_components(mt.make_flush_reader(mt.schema(), pc), mt.partition_count(), mt.schema(), cfg, pc);
}


@@ -77,6 +77,8 @@
#include <boost/intrusive/parent_from_member.hpp>
#include "db/view/view.hh"
#include "lister.hh"
#include "utils/phased_barrier.hh"
#include "cpu_controller.hh"
class cell_locker;
class cell_locker_stats;
@@ -114,7 +116,7 @@ void make(database& db, bool durable, bool volatile_testing_only);
}
}
class replay_position_reordered_exception : public std::exception {};
class mutation_reordered_with_truncate_exception : public std::exception {};
using shared_memtable = lw_shared_ptr<memtable>;
class memtable_list;
@@ -429,6 +431,9 @@ public:
restricted_mutation_reader_config read_concurrency_config;
restricted_mutation_reader_config streaming_read_concurrency_config;
::cf_stats* cf_stats = nullptr;
seastar::thread_scheduling_group* background_writer_scheduling_group = nullptr;
seastar::thread_scheduling_group* memtable_scheduling_group = nullptr;
bool enable_metrics_reporting = false;
};
struct no_commitlog {};
struct stats {
@@ -538,7 +543,6 @@ private:
mutable row_cache _cache; // Cache covers only sstables.
std::experimental::optional<int64_t> _sstable_generation = {};
db::replay_position _highest_flushed_rp;
db::replay_position _highest_rp;
db::replay_position _lowest_allowed_rp;
@@ -546,15 +550,7 @@ private:
db::commitlog* _commitlog;
compaction_manager& _compaction_manager;
int _compaction_disabled = 0;
class memtable_flush_queue;
std::unique_ptr<memtable_flush_queue> _flush_queue;
// Because streaming mutations bypass the commitlog, there is
// no need for the complications of the flush queue. Besides, it
// is easier to just use a common gate than it is to modify the flush_queue
// to work both with and without a replay position.
//
// Last but not least, we seldom need to guarantee any ordering here: as long
// as all data is waited for, we're good.
utils::phased_barrier _flush_barrier;
seastar::gate _streaming_flush_gate;
std::vector<view_ptr> _views;
semaphore _cache_update_sem{1};
@@ -753,7 +749,6 @@ public:
void start();
future<> stop();
future<> flush();
future<> flush(const db::replay_position&);
future<> flush_streaming_mutations(utils::UUID plan_id, dht::partition_range_vector ranges = dht::partition_range_vector{});
future<> fail_streaming_mutations(utils::UUID plan_id);
future<> clear(); // discards memtable(s) without flushing them to disk.
@@ -864,6 +859,10 @@ public:
return _config.cf_stats;
}
seastar::thread_scheduling_group* background_writer_scheduling_group() {
return _config.background_writer_scheduling_group;
}
compaction_manager& get_compaction_manager() const {
return _compaction_manager;
}
@@ -1072,6 +1071,9 @@ public:
restricted_mutation_reader_config read_concurrency_config;
restricted_mutation_reader_config streaming_read_concurrency_config;
::cf_stats* cf_stats = nullptr;
seastar::thread_scheduling_group* background_writer_scheduling_group = nullptr;
seastar::thread_scheduling_group* memtable_scheduling_group = nullptr;
bool enable_metrics_reporting = false;
};
private:
std::unique_ptr<locator::abstract_replication_strategy> _replication_strategy;
@@ -1154,6 +1156,7 @@ public:
private:
::cf_stats _cf_stats;
static constexpr size_t max_concurrent_reads() { return 100; }
static constexpr size_t max_streaming_concurrent_reads() { return 10; } // They're rather heavyweight, so limit more
static constexpr size_t max_system_concurrent_reads() { return 10; }
static constexpr size_t max_concurrent_sstable_loads() { return 3; }
struct db_stats {
@@ -1177,7 +1180,11 @@ private:
dirty_memory_manager _dirty_memory_manager;
dirty_memory_manager _streaming_dirty_memory_manager;
seastar::thread_scheduling_group _background_writer_scheduling_group;
flush_cpu_controller _memtable_cpu_controller;
semaphore _read_concurrency_sem{max_concurrent_reads()};
semaphore _streaming_concurrency_sem{max_streaming_concurrent_reads()};
restricted_mutation_reader_config _read_concurrency_config;
semaphore _system_read_concurrency_sem{max_system_concurrent_reads()};
restricted_mutation_reader_config _system_read_concurrency_config;
@@ -1332,10 +1339,10 @@ public:
/** Truncates the given column family */
future<> truncate(sstring ksname, sstring cfname, timestamp_func);
future<> truncate(const keyspace& ks, column_family& cf, timestamp_func);
future<> truncate(const keyspace& ks, column_family& cf, timestamp_func, bool with_snapshot = true);
bool update_column_family(schema_ptr s);
future<> drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func);
future<> drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func, bool with_snapshot = true);
void remove(const column_family&);
const logalloc::region_group& dirty_memory_region_group() const {


@@ -511,6 +511,7 @@ public:
if (shutdown) {
auto me = shared_from_this();
return _gate.close().then([me] {
me->_closed = true;
return me->sync().finally([me] {
// When we get here, nothing should add ops,
// and we should have waited out all pending.
@@ -1319,6 +1320,7 @@ future<> db::commitlog::segment_manager::shutdown() {
return _gate.close().then(std::bind(&segment_manager::sync_all_segments, this, true));
});
}).finally([this] {
discard_unused_segments();
// Now that the gate is closed and requests completed we are sure nobody else will pop()
return clear_reserve_segments().finally([this] {
return std::move(_reserve_replenisher).then_wrapped([this] (auto f) {


@@ -166,6 +166,12 @@ public:
*/
#define _make_config_values(val) \
val(background_writer_scheduling_quota, double, 1.0, Used, \
"max cpu usage ratio (between 0 and 1) for the compaction process. Not intended for setting in normal operations. Setting it to 1 or higher will disable it; the recommended operational setting is 0.5." \
) \
val(auto_adjust_flush_quota, bool, false, Used, \
"true: auto-adjust quota for flush processes. false: put everyone together in the static background writer group - if background writer group is enabled. Not intended for setting in normal operations" \
) \
/* Initialization properties */ \
/* The minimal properties needed for configuring a cluster. */ \
val(cluster_name, sstring, "Test Cluster", Used, \
@@ -330,7 +336,7 @@ public:
val(sstable_preemptive_open_interval_in_mb, uint32_t, 50, Unused, \
"When compacting, the replacement opens SSTables before they are completely written and uses them in place of the prior SSTables for any range previously written. This setting helps to smoothly transfer reads between the SSTables by reducing page cache churn and keeps hot rows hot." \
) \
val(defragment_memory_on_idle, bool, true, Used, "Set to true to defragment memory when the cpu is idle. This reduces the amount of work Scylla performs when processing client requests.") \
val(defragment_memory_on_idle, bool, false, Used, "When set to true, will defragment memory when the cpu is idle. This reduces the amount of work Scylla performs when processing client requests.") \
/* Memtable settings */ \
val(memtable_allocation_type, sstring, "heap_buffers", Invalid, \
"Specify the way Cassandra allocates and manages memtable memory. See Off-heap memtables in Cassandra 2.1. Options are:\n" \
@@ -754,6 +760,8 @@ public:
val(replace_address_first_boot, sstring, "", Used, "Like replace_address option, but if the node has been bootstrapped successfully it will be ignored. Same as -Dcassandra.replace_address_first_boot.") \
val(override_decommission, bool, false, Used, "Set true to force a decommissioned node to join the cluster") \
val(ring_delay_ms, uint32_t, 30 * 1000, Used, "Time a node waits to hear from other nodes before joining the ring in milliseconds. Same as -Dcassandra.ring_delay_ms in cassandra.") \
val(fd_max_interval_ms, uint32_t, 2 * 1000, Used, "The maximum failure_detector interval time in milliseconds. Intervals larger than the maximum will be ignored. Larger clusters may need to increase the default.") \
val(fd_initial_value_ms, uint32_t, 2 * 1000, Used, "The initial failure_detector interval time in milliseconds.") \
val(shutdown_announce_in_ms, uint32_t, 2 * 1000, Used, "Time a node waits after sending gossip shutdown message in milliseconds. Same as -Dcassandra.shutdown_announce_in_ms in cassandra.") \
val(developer_mode, bool, false, Used, "Relax environment checks. Setting to true can reduce performance and reliability significantly.") \
val(skip_wait_for_gossip_to_settle, int32_t, -1, Used, "An integer to configure the wait for gossip to settle. -1: wait normally, 0: do not wait at all, n: wait for at most n polls. Same as -Dcassandra.skip_wait_for_gossip_to_settle in cassandra.") \
@@ -765,6 +773,7 @@ public:
val(abort_on_lsa_bad_alloc, bool, false, Used, "Abort when allocation in LSA region fails") \
val(murmur3_partitioner_ignore_msb_bits, unsigned, 0, Used, "Number of most significant token bits to ignore in murmur3 partitioner; increase for very large clusters") \
val(virtual_dirty_soft_limit, double, 0.6, Used, "Soft limit of virtual dirty memory expressed as a portion of the hard limit") \
val(enable_keyspace_column_family_metrics, bool, false, Used, "Enable per keyspace and per column family metrics reporting") \
/* done! */
#define _make_value_member(name, type, deflt, status, desc, ...) \


@@ -162,6 +162,14 @@ inline void assure_sufficient_live_nodes(
const PendingRange& pending_endpoints = std::array<gms::inet_address, 0>()) {
size_t need = block_for(ks, cl);
auto adjust_live_for_error = [] (size_t live, size_t pending) {
// DowngradingConsistencyRetryPolicy uses the alive-replica count from the Unavailable
// exception to adjust the CL for a retry. When a pending node is present, the CL is
// increased by 1 internally, so the reported number of live nodes has to be adjusted
// to take this into account
return pending <= live ? live - pending : 0;
};
switch (cl) {
case consistency_level::ANY:
// local hint is acceptable, and local node is always live
@@ -176,7 +184,7 @@ inline void assure_sufficient_live_nodes(
size_t pending = count_local_endpoints(pending_endpoints);
if (local_live < need + pending) {
cl_logger.debug("Local replicas {} are insufficient to satisfy LOCAL_QUORUM requirement of needed {} and pending {}", live_endpoints, local_live, pending);
throw exceptions::unavailable_exception(cl, need, local_live);
throw exceptions::unavailable_exception(cl, need, adjust_live_for_error(local_live, pending));
}
break;
}
@@ -190,7 +198,7 @@ inline void assure_sufficient_live_nodes(
size_t pending = pending_endpoints.size();
if (live < need + pending) {
cl_logger.debug("Live nodes {} do not satisfy ConsistencyLevel ({} required, {} pending)", live, need, pending);
throw exceptions::unavailable_exception(cl, need, live);
throw exceptions::unavailable_exception(cl, need, adjust_live_for_error(live, pending));
}
break;
}
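The adjustment applied in both hunks above is self-contained enough to illustrate in isolation. A minimal sketch (plain function, no Scylla types) of `adjust_live_for_error`:

```cpp
#include <cstddef>

// When a pending (bootstrapping) node is present, the coordinator internally
// requires one extra replica, so the live count reported back in an
// unavailable_exception must be lowered by the pending count; otherwise a
// retry policy that downgrades the CL would overestimate the usable replicas.
// Clamped at zero to avoid unsigned underflow when pending exceeds live.
std::size_t adjust_live_for_error(std::size_t live, std::size_t pending) {
    return pending <= live ? live - pending : 0;
}
```

With 3 live replicas and 1 pending, the exception reports 2 usable replicas; if pending somehow exceeds live, it reports 0 rather than underflowing.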


@@ -66,8 +66,8 @@ class migrator {
public:
static const std::unordered_set<sstring> legacy_schema_tables;
migrator(cql3::query_processor& qp)
: _qp(qp) {
migrator(sharded<service::storage_proxy>& sp, cql3::query_processor& qp)
: _sp(sp), _qp(qp) {
}
migrator(migrator&&) = default;
@@ -147,15 +147,18 @@ public:
auto cq = fmt_query(fmt, db::system_keyspace::legacy::COLUMNS);
auto zq = fmt_query(fmt, db::system_keyspace::legacy::TRIGGERS);
typedef std::tuple<future<result_set_type>, future<result_set_type>, future<result_set_type>> result_tuple;
typedef std::tuple<future<result_set_type>, future<result_set_type>, future<result_set_type>, future<db::schema_tables::legacy::schema_mutations>> result_tuple;
return when_all(_qp.execute_internal(tq, { dst.name, cf_name }),
_qp.execute_internal(cq, { dst.name, cf_name }),
_qp.execute_internal(zq, { dst.name, cf_name })).then([this, &dst, cf_name, timestamp](result_tuple&& t) {
_qp.execute_internal(zq, { dst.name, cf_name }),
db::schema_tables::legacy::read_table_mutations(_sp, dst.name, cf_name, db::system_keyspace::legacy::column_families()))
.then([this, &dst, cf_name, timestamp](result_tuple&& t) {
result_set_type tables = std::get<0>(t).get0();
result_set_type columns = std::get<1>(t).get0();
result_set_type triggers = std::get<2>(t).get0();
db::schema_tables::legacy::schema_mutations sm = std::get<3>(t).get0();
row_type& td = tables->one();
@@ -165,6 +168,8 @@ public:
schema_builder builder(dst.name, cf_name, id);
builder.with_version(sm.digest());
cf_type cf = sstring_to_cf_type(td.get_or("type", sstring("standard")));
if (cf == cf_type::super) {
fail(unimplemented::cause::SUPER);
@@ -183,6 +188,7 @@ public:
if (default_validator->is_counter()) {
builder.set_is_counter(true);
}
builder.set_default_validation_class(default_validator);
}
/*
@@ -191,10 +197,8 @@ public:
* but we can trust is_dense value of false.
*/
auto is_dense = td.get_opt<bool>("is_dense");
if (is_dense && !*is_dense) {
builder.set_is_dense(false);
} else {
auto calulated_is_dense = [&] {
if (!is_dense || *is_dense) {
is_dense = [&] {
/*
* As said above, this method is only here because we need to deal with thrift upgrades.
* Once a CF has been "upgraded", i.e. we've rebuilt and save its CQL3 metadata at least once,
@@ -252,40 +256,48 @@ public:
return comparator.compare(off, end - off, utf8_type->name()) == 0;
};
if (regular) {
auto name = regular->get_or("column_name", bytes());
// This is a lame attempt at determining if this was in fact a compact_value column
if (!max_cl_idx || (!name.empty() && name != to_bytes("value"))
|| db::schema_tables::parse_type(regular->get_as<sstring>("type")) != default_validator) {
return false;
}
// Ok, we will assume this was in fact a (scylla-created) compact value.
}
if (max_cl_idx) {
auto n = std::count(comparator.begin(), comparator.end(), ','); // num comp - 1
return *max_cl_idx == n;
}
if (regular) {
return false;
}
return !is_cql3_only_pk_comparator(comparator);
}();
builder.set_is_dense(calulated_is_dense);
// now, if switched to sparse, remove redundant compact_value column and the last clustering column,
// directly copying CASSANDRA-11502 logic. See CASSANDRA-11315.
filter_sparse = !calulated_is_dense && is_dense.value_or(true);
filter_sparse = !*is_dense;
}
builder.set_is_dense(*is_dense);
auto is_cql = !*is_dense && is_compound;
auto is_static_compact = !*is_dense && !is_compound;
// org.apache.cassandra.schema.LegacySchemaMigrator#isEmptyCompactValueColumn
auto is_empty_compact_value = [](const cql3::untyped_result_set::row& column_row) {
auto kind_str = column_row.get_as<sstring>("type");
// Cassandra only checks for "compact_value", but Scylla generates "regular" instead (#2586)
return (kind_str == "compact_value" || kind_str == "regular")
&& column_row.get_as<sstring>("column_name").empty();
};
for (auto& row : *columns) {
auto kind_str = row.get_as<sstring>("type");
auto kind = db::schema_tables::deserialize_kind(kind_str);
auto component_index = kind > column_kind::clustering_key ? 0 : column_id(row.get_or("component_index", 0));
auto name = row.get_or("column_name", bytes());
auto name = row.get_or<sstring>("column_name", sstring());
auto validator = db::schema_tables::parse_type(row.get_as<sstring>("validator"));
if (is_empty_compact_value(row)) {
continue;
}
if (filter_sparse) {
if (kind_str == "compact_value") {
continue;
@@ -329,7 +341,7 @@ public:
type = "VALUES";
}
}
auto column = cql3::util::maybe_quote(utf8_type->to_string(name));
auto column = cql3::util::maybe_quote(name);
options["target"] = validator->is_collection()
? type + "(" + column + ")"
: column;
@@ -339,7 +351,26 @@ public:
builder.with_index(index_metadata(index_name, options, *index_kind));
}
builder.with_column(std::move(name), std::move(validator), kind, component_index);
data_type column_name_type = [&] {
if (is_static_compact && kind == column_kind::regular_column) {
return db::schema_tables::parse_type(comparator);
}
return utf8_type;
}();
auto column_name = [&] {
try {
return column_name_type->from_string(name);
} catch (marshal_exception) {
// #2597: Scylla < 2.0 writes names in serialized form, try to recover
column_name_type->validate(to_bytes_view(name));
return to_bytes(name);
}
}();
builder.with_column(std::move(column_name), std::move(validator), kind, component_index);
}
if (is_static_compact) {
builder.set_regular_column_name_type(db::schema_tables::parse_type(comparator));
}
if (td.has("read_repair_chance")) {
@@ -414,8 +445,6 @@ public:
throw unsupported_feature("triggers");
}
// TODO: table upgrades as in origin converter.
dst.tables.emplace_back(table{timestamp, builder.build() });
});
}
@@ -517,21 +546,13 @@ public:
});
}
future<> unload_legacy_tables() {
return _qp.db().invoke_on_all([](database& db) {
for (auto& cfname : legacy_schema_tables) {
auto& cf = db.find_column_family(db::system_keyspace::NAME, cfname);
db.remove(cf);
}
});
}
future<> truncate_legacy_tables() {
mlogger.info("Truncating legacy schema tables");
return do_with(utils::make_joinpoint([] { return db_clock::now();}),[this](auto& tsf) {
return _qp.db().invoke_on_all([&tsf](database& db) {
return parallel_for_each(legacy_schema_tables, [&db, &tsf](const sstring& cfname) {
return db.truncate(db::system_keyspace::NAME, cfname, [&tsf] { return tsf.value(); });
future<> drop_legacy_tables() {
mlogger.info("Dropping legacy schema tables");
return parallel_for_each(legacy_schema_tables, [this](const sstring& cfname) {
return do_with(utils::make_joinpoint([] { return db_clock::now();}),[this, cfname](auto& tsf) {
auto with_snapshot = !_keyspaces.empty();
return _qp.db().invoke_on_all([&tsf, cfname, with_snapshot](database& db) {
return db.drop_column_family(db::system_keyspace::NAME, cfname, [&tsf] { return tsf.value(); }, with_snapshot);
});
});
});
@@ -590,18 +611,15 @@ public:
future<> migrate() {
return read_all_keyspaces().then([this]() {
if (_keyspaces.empty()) {
return unload_legacy_tables();
}
// write metadata to the new schema tables
return store_keyspaces_in_new_schema_tables().then(std::bind(&migrator::migrate_indexes, this))
.then(std::bind(&migrator::flush_schemas, this))
.then(std::bind(&migrator::truncate_legacy_tables, this))
.then(std::bind(&migrator::unload_legacy_tables, this))
.then(std::bind(&migrator::drop_legacy_tables, this))
.then([] { mlogger.info("Completed migration of legacy schema tables"); });
});
}
sharded<service::storage_proxy>& _sp;
cql3::query_processor& _qp;
std::vector<keyspace> _keyspaces;
};
@@ -620,7 +638,7 @@ const std::unordered_set<sstring> migrator::legacy_schema_tables = {
}
future<>
db::legacy_schema_migrator::migrate(cql3::query_processor& qp) {
return do_with(migrator(qp), std::bind(&migrator::migrate, std::placeholders::_1));
db::legacy_schema_migrator::migrate(sharded<service::storage_proxy>& sp, cql3::query_processor& qp) {
return do_with(migrator(sp, qp), std::bind(&migrator::migrate, std::placeholders::_1));
}


@@ -48,10 +48,14 @@ namespace cql3 {
class query_processor;
}
namespace service {
class storage_proxy;
}
namespace db {
namespace legacy_schema_migrator {
future<> migrate(cql3::query_processor&);
future<> migrate(sharded<service::storage_proxy>&, cql3::query_processor&);
}
}


@@ -64,6 +64,7 @@
#include "db/config.hh"
#include "md5_hasher.hh"
#include <boost/algorithm/string/predicate.hpp>
#include <boost/range/algorithm/copy.hpp>
#include <boost/range/adaptor/map.hpp>
#include <boost/range/join.hpp>
@@ -82,6 +83,8 @@ namespace schema_tables {
logging::logger slogger("schema_tables");
const sstring version = "3";
struct push_back_and_return {
std::vector<mutation> muts;
@@ -149,8 +152,8 @@ static void add_index_to_schema_mutation(schema_ptr table,
const index_metadata& index, api::timestamp_type timestamp,
mutation& mutation);
static void drop_column_from_schema_mutation(schema_ptr,
const column_definition&, long timestamp,
static void drop_column_from_schema_mutation(schema_ptr schema_table, schema_ptr table,
const sstring& column_name, long timestamp,
std::vector<mutation>&);
static void drop_index_from_schema_mutation(schema_ptr table,
@@ -165,13 +168,12 @@ static void prepare_builder_from_table_row(schema_builder&, const query::result_
using namespace v3;
std::vector<const char*> ALL { KEYSPACES, TABLES, COLUMNS, DROPPED_COLUMNS, TRIGGERS, VIEWS, TYPES, FUNCTIONS, AGGREGATES, INDEXES };
std::vector<const char*> ALL { KEYSPACES, TABLES, SCYLLA_TABLES, COLUMNS, DROPPED_COLUMNS, TRIGGERS, VIEWS, TYPES, FUNCTIONS, AGGREGATES, INDEXES };
using days = std::chrono::duration<int, std::ratio<24 * 3600>>;
/** add entries to system.schema_* for the hardcoded system definitions */
future<> save_system_keyspace_schema() {
auto& ks = db::qctx->db().find_keyspace(NAME);
future<> save_system_schema(const sstring & ksname) {
auto& ks = db::qctx->db().find_keyspace(ksname);
auto ksm = ks.metadata();
// delete old, possibly obsolete entries in schema tables
@@ -185,6 +187,11 @@ future<> save_system_keyspace_schema() {
});
}
/** add entries to system_schema.* for the hardcoded system definitions */
future<> save_system_keyspace_schema() {
return save_system_schema(NAME);
}
namespace v3 {
static constexpr auto schema_gc_grace = std::chrono::duration_cast<std::chrono::seconds>(days(7)).count();
@@ -256,6 +263,21 @@ schema_ptr tables() {
return schema;
}
// Holds Scylla-specific table metadata.
schema_ptr scylla_tables() {
static thread_local auto schema = [] {
auto id = generate_legacy_id(NAME, SCYLLA_TABLES);
return schema_builder(NAME, SCYLLA_TABLES, stdx::make_optional(id))
.with_column("keyspace_name", utf8_type, column_kind::partition_key)
.with_column("table_name", utf8_type, column_kind::clustering_key)
.with_column("version", uuid_type)
.set_gc_grace_seconds(schema_gc_grace)
.with_version(generate_schema_version(id))
.build();
}();
return schema;
}
schema_ptr columns() {
static thread_local auto schema = [] {
schema_builder builder(make_lw_shared(::schema(generate_legacy_id(NAME, COLUMNS), NAME, COLUMNS,
@@ -519,7 +541,7 @@ future<utils::UUID> calculate_schema_digest(distributed<service::storage_proxy>&
for (auto&& p : rs->partitions()) {
auto mut = p.mut().unfreeze(s);
auto partition_key = value_cast<sstring>(utf8_type->deserialize(mut.key().get_component(*s, 0)));
if (partition_key == NAME) {
if (is_system_keyspace(partition_key)) {
continue;
}
mutations.emplace_back(std::move(mut));
@@ -552,7 +574,7 @@ future<std::vector<frozen_mutation>> convert_schema_to_mutations(distributed<ser
for (auto&& p : rs->partitions()) {
auto mut = p.mut().unfreeze(s);
auto partition_key = value_cast<sstring>(utf8_type->deserialize(mut.key().get_component(*s, 0)));
if (partition_key == NAME) {
if (is_system_keyspace(partition_key)) {
continue;
}
results.emplace_back(std::move(p.mut()));
@@ -727,6 +749,33 @@ read_tables_for_keyspaces(distributed<service::storage_proxy>& proxy, const std:
return result;
}
mutation compact_for_schema_digest(const mutation& m) {
// Cassandra skips tombstones in the digest calculation
// to avoid disagreements due to tombstone GC.
// See https://issues.apache.org/jira/browse/CASSANDRA-6862.
// We achieve similar effect with compact_for_compaction().
mutation m_compacted(m);
m_compacted.partition().compact_for_compaction(*m.schema(), always_gc, gc_clock::time_point::max());
return m_compacted;
}
// Applies deletion of the "version" column to a system_schema.scylla_tables mutation.
static void delete_schema_version(mutation& m) {
if (m.column_family_id() != scylla_tables()->id()) {
return;
}
const column_definition& version_col = *scylla_tables()->get_column_definition(to_bytes("version"));
for (auto&& row : m.partition().clustered_rows()) {
auto&& cells = row.row().cells();
auto&& cell = cells.find_cell(version_col.id);
api::timestamp_type t = api::new_timestamp();
if (cell) {
t = std::max(t, cell->as_atomic_cell().timestamp());
}
cells.apply(version_col, atomic_cell::make_dead(t, gc_clock::now()));
}
}
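`delete_schema_version` exists so that schema digests agree across nodes: the per-node `version` cell is volatile, while the rest of the row is not. A toy sketch (string-map rows, nothing like the real mutation or cell types) showing why the `version` field must be dropped before hashing:

```cpp
#include <functional>
#include <map>
#include <string>

// Toy stand-in for a scylla_tables row: column name -> value.
using row = std::map<std::string, std::string>;

// Digest that mirrors delete_schema_version(): the volatile "version"
// cell is erased first, so two rows that differ only in version UUID
// hash identically and never cause a spurious schema disagreement.
std::size_t schema_digest(row r) {
    r.erase("version");
    std::size_t h = 0;
    for (const auto& [k, v] : r) {
        // Order-independent-enough combine for a sketch; the real code
        // compacts mutations and feeds them to an md5 hasher.
        h ^= std::hash<std::string>{}(k) * 31 + std::hash<std::string>{}(v);
    }
    return h;
}
```

This is also why `do_merge_schema` below deletes the version cell on both the incoming mutations and the already-stored `scylla_tables()` rows: both sides of the comparison must be normalized the same way.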
static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations, bool do_flush)
{
return seastar::async([&proxy, mutations = std::move(mutations), do_flush] () mutable {
@@ -737,6 +786,9 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std:
for (auto&& mutation : mutations) {
keyspaces.emplace(value_cast<sstring>(utf8_type->deserialize(mutation.key().get_component(*s, 0))));
column_families.emplace(mutation.column_family_id());
// We must force recalculation of schema version after the merge, since the resulting
// schema may be a mix of the old and new schemas.
delete_schema_version(mutation);
}
// current state of the schema
@@ -749,6 +801,15 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std:
/*auto& old_aggregates = */read_schema_for_keyspaces(proxy, AGGREGATES, keyspaces).get0();
#endif
// Incoming mutations have the version field deleted. Delete here as well so that
// schemas which are otherwise equal don't appear as differing.
for (auto&& e : old_column_families) {
schema_mutations& sm = e.second;
if (sm.scylla_tables()) {
delete_schema_version(*sm.scylla_tables());
}
}
proxy.local().mutate_locally(std::move(mutations)).get0();
if (do_flush) {
@@ -1387,7 +1448,7 @@ static void add_table_params_to_mutations(mutation& m, const clustering_key& cke
{
auto map = table->compaction_strategy_options();
map["class"] = sstables::compaction_strategy::name(table->compaction_strategy());
map["class"] = sstables::compaction_strategy::name(table->configured_compaction_strategy());
store_map(m, ckey, "compaction", timestamp, map);
}
@@ -1461,6 +1522,15 @@ static void add_dropped_column_to_schema_mutation(schema_ptr table, const sstrin
m.set_clustered_cell(ckey, "type", expand_user_type(column.type)->as_cql3_type()->to_string(), timestamp);
}
mutation make_scylla_tables_mutation(schema_ptr table, api::timestamp_type timestamp) {
schema_ptr s = tables();
auto pkey = partition_key::from_singular(*s, table->ks_name());
auto ckey = clustering_key::from_singular(*s, table->cf_name());
mutation m(pkey, scylla_tables());
m.set_clustered_cell(ckey, "version", utils::UUID(table->version()), timestamp);
return m;
}
static schema_mutations make_table_mutations(schema_ptr table, api::timestamp_type timestamp, bool with_columns_and_triggers)
{
// When adding new schema properties, don't set cells for default values so that
@@ -1474,6 +1544,8 @@ static schema_mutations make_table_mutations(schema_ptr table, api::timestamp_ty
auto ckey = clustering_key::from_singular(*s, table->cf_name());
m.set_clustered_cell(ckey, "id", table->id(), timestamp);
auto scylla_tables_mutation = make_scylla_tables_mutation(table, timestamp);
{
list_type_impl::native_type flags;
if (table->is_super()) {
@@ -1499,7 +1571,7 @@ static schema_mutations make_table_mutations(schema_ptr table, api::timestamp_ty
mutation indices_mutation(pkey, indexes());
if (with_columns_and_triggers) {
for (auto&& column : table->all_columns()) {
for (auto&& column : table->v3().all_columns()) {
add_column_to_schema_mutation(table, column, timestamp, columns_mutation);
}
for (auto&& index : table->indices()) {
@@ -1512,7 +1584,8 @@ static schema_mutations make_table_mutations(schema_ptr table, api::timestamp_ty
}
}
return schema_mutations{std::move(m), std::move(columns_mutation), std::move(indices_mutation), std::move(dropped_columns_mutation)};
return schema_mutations{std::move(m), std::move(columns_mutation), std::move(indices_mutation), std::move(dropped_columns_mutation),
std::move(scylla_tables_mutation)};
}
void add_table_or_view_to_schema_mutation(schema_ptr s, api::timestamp_type timestamp, bool with_columns, std::vector<mutation>& mutations)
@@ -1561,23 +1634,23 @@ static void make_update_columns_mutations(schema_ptr old_table,
std::vector<mutation>& mutations) {
mutation columns_mutation(partition_key::from_singular(*columns(), old_table->ks_name()), columns());
auto diff = difference(old_table->columns_by_name(), new_table->columns_by_name());
auto diff = difference(old_table->v3().columns_by_name(), new_table->v3().columns_by_name());
// columns that are no longer needed
for (auto&& name : diff.entries_only_on_left) {
// Thrift only knows about the REGULAR ColumnDefinition type, so don't consider other type
// are being deleted just because they are not here.
const column_definition& column = *old_table->columns_by_name().at(name);
const column_definition& column = *old_table->v3().columns_by_name().at(name);
if (from_thrift && !column.is_regular()) {
continue;
}
drop_column_from_schema_mutation(old_table, column, timestamp, mutations);
drop_column_from_schema_mutation(columns(), old_table, column.name_as_text(), timestamp, mutations);
}
// newly added columns and old columns with updated attributes
for (auto&& name : boost::range::join(diff.entries_differing, diff.entries_only_on_right)) {
const column_definition& column = *new_table->columns_by_name().at(name);
const column_definition& column = *new_table->v3().columns_by_name().at(name);
add_column_to_schema_mutation(new_table, column, timestamp, columns_mutation);
}
@@ -1588,7 +1661,7 @@ static void make_update_columns_mutations(schema_ptr old_table,
// newly dropped columns
// columns added then dropped again
for (auto& name : dc_diff.entries_only_on_right) {
for (auto& name : boost::range::join(dc_diff.entries_differing, dc_diff.entries_only_on_right)) {
add_drop_column_to_mutations(new_table, name, new_table->dropped_columns().at(name), timestamp, mutations);
}
}
@@ -1626,12 +1699,20 @@ static void make_drop_table_or_view_mutations(schema_ptr schema_table,
api::timestamp_type timestamp,
std::vector<mutation>& mutations) {
auto pkey = partition_key::from_singular(*schema_table, table_or_view->ks_name());
mutation m{std::move(pkey), schema_table};
mutation m{pkey, schema_table};
auto ckey = clustering_key::from_singular(*schema_table, table_or_view->cf_name());
m.partition().apply_delete(*schema_table, std::move(ckey), tombstone(timestamp, gc_clock::now()));
m.partition().apply_delete(*schema_table, ckey, tombstone(timestamp, gc_clock::now()));
mutations.emplace_back(m);
for (auto &column : table_or_view->all_columns()) {
drop_column_from_schema_mutation(table_or_view, column, timestamp, mutations);
for (auto& column : table_or_view->v3().all_columns()) {
drop_column_from_schema_mutation(columns(), table_or_view, column.name_as_text(), timestamp, mutations);
}
for (auto& column : table_or_view->dropped_columns() | boost::adaptors::map_keys) {
drop_column_from_schema_mutation(dropped_columns(), table_or_view, column, timestamp, mutations);
}
{
mutation m{pkey, scylla_tables()};
m.partition().apply_delete(*scylla_tables(), ckey, tombstone(timestamp, gc_clock::now()));
mutations.emplace_back(m);
}
}
@@ -1655,17 +1736,14 @@ future<std::vector<mutation>> make_drop_table_mutations(lw_shared_ptr<keyspace_m
static future<schema_mutations> read_table_mutations(distributed<service::storage_proxy>& proxy, const qualified_name& table, schema_ptr s)
{
return read_schema_partition_for_table(proxy, s, table.keyspace_name, table.table_name)
.then([&proxy, table] (mutation cf_m) {
return read_schema_partition_for_table(proxy, columns(), table.keyspace_name, table.table_name)
.then([&proxy, table, cf_m = std::move(cf_m)] (mutation col_m) {
return read_schema_partition_for_table(proxy, dropped_columns(), table.keyspace_name, table.table_name)
.then([&proxy, table, cf_m = std::move(cf_m), col_m = std::move(col_m)] (mutation dropped_m) {
return read_schema_partition_for_table(proxy, indexes(), table.keyspace_name, table.table_name)
.then([cf_m = std::move(cf_m), col_m = std::move(col_m), dropped_m = std::move(dropped_m)] (mutation idx_m) {
return schema_mutations{std::move(cf_m), std::move(col_m), std::move(idx_m), std::move(dropped_m)};
});
});
return when_all_succeed(
read_schema_partition_for_table(proxy, s, table.keyspace_name, table.table_name),
read_schema_partition_for_table(proxy, columns(), table.keyspace_name, table.table_name),
read_schema_partition_for_table(proxy, dropped_columns(), table.keyspace_name, table.table_name),
read_schema_partition_for_table(proxy, indexes(), table.keyspace_name, table.table_name),
read_schema_partition_for_table(proxy, scylla_tables(), table.keyspace_name, table.table_name)).then(
[] (mutation cf_m, mutation col_m, mutation dropped_m, mutation idx_m, mutation st_m) {
return schema_mutations{std::move(cf_m), std::move(col_m), std::move(idx_m), std::move(dropped_m), std::move(st_m)};
});
#if 0
// FIXME:
@@ -1680,7 +1758,6 @@ static future<schema_mutations> read_table_mutations(distributed<service::storag
throw new RuntimeException(e);
}
#endif
});
}
future<schema_ptr> create_table_from_name(distributed<service::storage_proxy>& proxy, const sstring& keyspace, const sstring& table)
@@ -1771,7 +1848,7 @@ static void prepare_builder_from_table_row(schema_builder& builder, const query:
builder.set_min_compaction_threshold(std::stoi(map["min_threshold"]));
}
if (map.count("enabled")) {
// TODO: enable/disable?
builder.set_compaction_enabled(boost::algorithm::iequals(map["enabled"], "true"));
}
builder.set_compaction_strategy_options(map);
@@ -1870,13 +1947,12 @@ schema_ptr create_table_from_mutations(schema_mutations sm, std::experimental::o
prepare_builder_from_table_row(builder, table_row);
for (auto&& cdef : column_defs) {
builder.with_column(cdef);
}
v3_columns columns(std::move(column_defs), is_dense, is_compound);
columns.apply_to(builder);
std::vector<index_metadata> index_defs;
if (sm.indices_mutation()) {
index_defs = create_indices_from_index_rows(query::result_set(sm.indices_mutation().value()), ks_name, cf_name);
index_defs = create_indices_from_index_rows(query::result_set(*sm.indices_mutation()), ks_name, cf_name);
}
for (auto&& index : index_defs) {
builder.with_index(index);
@@ -1909,7 +1985,8 @@ static void add_column_to_schema_mutation(schema_ptr table,
api::timestamp_type timestamp,
mutation& m)
{
auto ckey = clustering_key::from_exploded(*m.schema(), {utf8_type->decompose(table->cf_name()), column.name()});
auto ckey = clustering_key::from_exploded(*m.schema(), {utf8_type->decompose(table->cf_name()),
utf8_type->decompose(column.name_as_text())});
auto order = "NONE";
if (column.is_clustering_key()) {
@@ -2003,13 +2080,19 @@ static void drop_index_from_schema_mutation(schema_ptr table, const index_metada
mutations.push_back(std::move(m));
}
static void drop_column_from_schema_mutation(schema_ptr table, const column_definition& column, long timestamp, std::vector<mutation>& mutations) {
schema_ptr s = columns();
auto pkey = partition_key::from_singular(*s, table->ks_name());
auto ckey = clustering_key::from_exploded(*s, {utf8_type->decompose(table->cf_name()), column.name()});
static void drop_column_from_schema_mutation(
schema_ptr schema_table,
schema_ptr table,
const sstring& column_name,
long timestamp,
std::vector<mutation>& mutations)
{
auto pkey = partition_key::from_singular(*schema_table, table->ks_name());
auto ckey = clustering_key::from_exploded(*schema_table, {utf8_type->decompose(table->cf_name()),
utf8_type->decompose(column_name)});
mutation m{pkey, s};
m.partition().apply_delete(*s, ckey, tombstone(timestamp, gc_clock::now()));
mutation m{pkey, schema_table};
m.partition().apply_delete(*schema_table, ckey, tombstone(timestamp, gc_clock::now()));
mutations.emplace_back(m);
}
@@ -2153,7 +2236,7 @@ static schema_mutations make_view_mutations(view_ptr view, api::timestamp_type t
mutation indices_mutation(pkey, indexes());
if (with_columns) {
for (auto&& column : view->all_columns()) {
for (auto&& column : view->v3().all_columns()) {
add_column_to_schema_mutation(view, column, timestamp, columns_mutation);
}
@@ -2165,7 +2248,10 @@ static schema_mutations make_view_mutations(view_ptr view, api::timestamp_type t
}
}
return schema_mutations{std::move(m), std::move(columns_mutation), std::move(indices_mutation), std::move(dropped_columns_mutation)};
auto scylla_tables_mutation = make_scylla_tables_mutation(view, timestamp);
return schema_mutations{std::move(m), std::move(columns_mutation), std::move(indices_mutation), std::move(dropped_columns_mutation),
std::move(scylla_tables_mutation)};
}
schema_mutations make_schema_mutations(schema_ptr s, api::timestamp_type timestamp, bool with_columns)
@@ -2459,10 +2545,33 @@ data_type parse_type(sstring str)
std::vector<schema_ptr> all_tables() {
return {
keyspaces(), tables(), columns(), dropped_columns(), triggers(),
keyspaces(), tables(), scylla_tables(), columns(), dropped_columns(), triggers(),
views(), indexes(), types(), functions(), aggregates(),
};
}
namespace legacy {
table_schema_version schema_mutations::digest() const {
md5_hasher h;
db::schema_tables::feed_hash_for_schema_digest(h, _columnfamilies);
db::schema_tables::feed_hash_for_schema_digest(h, _columns);
return utils::UUID_gen::get_name_UUID(h.finalize());
}
future<schema_mutations> read_table_mutations(distributed<service::storage_proxy>& proxy,
sstring keyspace_name, sstring table_name, schema_ptr s)
{
return read_schema_partition_for_table(proxy, s, keyspace_name, table_name)
.then([&proxy, keyspace_name, table_name] (mutation cf_m) {
return read_schema_partition_for_table(proxy, db::system_keyspace::legacy::columns(), keyspace_name, table_name)
.then([cf_m = std::move(cf_m)] (mutation col_m) {
return schema_mutations{std::move(cf_m), std::move(col_m)};
});
});
}
} // namespace legacy
} // namespace schema_tables
} // namespace schema


@@ -64,6 +64,7 @@ namespace v3 {
static constexpr auto NAME = "system_schema";
static constexpr auto KEYSPACES = "keyspaces";
static constexpr auto TABLES = "tables";
static constexpr auto SCYLLA_TABLES = "scylla_tables";
static constexpr auto COLUMNS = "columns";
static constexpr auto DROPPED_COLUMNS = "dropped_columns";
static constexpr auto TRIGGERS = "triggers";
@@ -77,16 +78,43 @@ schema_ptr columns();
schema_ptr dropped_columns();
schema_ptr indexes();
schema_ptr tables();
schema_ptr scylla_tables();
schema_ptr views();
}
namespace legacy {
class schema_mutations {
mutation _columnfamilies;
mutation _columns;
public:
schema_mutations(mutation columnfamilies, mutation columns)
: _columnfamilies(std::move(columnfamilies))
, _columns(std::move(columns))
{ }
table_schema_version digest() const;
};
future<schema_mutations> read_table_mutations(distributed<service::storage_proxy>& proxy,
sstring keyspace_name, sstring table_name, schema_ptr s);
}
using namespace v3;
// Change on non-backwards compatible changes of schema mutations.
// Replication of schema between nodes with different version is inhibited.
extern const sstring version;
extern std::vector<const char*> ALL;
std::vector<schema_ptr> all_tables();
// saves/creates "ks" + all tables etc, while first deleting all old schema entries (will be rewritten)
future<> save_system_schema(const sstring & ks);
// saves/creates "system_schema" keyspace
future<> save_system_keyspace_schema();
future<utils::UUID> calculate_schema_digest(distributed<service::storage_proxy>& proxy);
@@ -137,6 +165,7 @@ view_ptr create_view_from_mutations(schema_mutations, std::experimental::optiona
future<std::vector<view_ptr>> create_views_from_schema_partition(distributed<service::storage_proxy>& proxy, const schema_result::mapped_type& result);
schema_mutations make_schema_mutations(schema_ptr s, api::timestamp_type timestamp, bool with_columns);
mutation make_scylla_tables_mutation(schema_ptr, api::timestamp_type timestamp);
void add_table_or_view_to_schema_mutation(schema_ptr view, api::timestamp_type timestamp, bool with_columns, std::vector<mutation>& mutations);
@@ -153,15 +182,11 @@ data_type parse_type(sstring str);
sstring serialize_index_kind(index_metadata_kind kind);
index_metadata_kind deserialize_index_kind(sstring kind);
mutation compact_for_schema_digest(const mutation& m);
template<typename Hasher>
void feed_hash_for_schema_digest(Hasher& h, const mutation& m) {
// Cassandra is skipping tombstones from digest calculation
// to avoid disagreements due to tombstone GC.
// See https://issues.apache.org/jira/browse/CASSANDRA-6862.
// We achieve similar effect with compact_for_compaction().
mutation m_compacted(m);
m_compacted.partition().compact_for_compaction(*m.schema(), always_gc, gc_clock::time_point::max());
feed_hash(h, m_compacted);
feed_hash(h, compact_for_schema_digest(m));
}
} // namespace schema_tables


@@ -1044,6 +1044,9 @@ future<> setup(distributed<database>& db, distributed<cql3::query_processor>& qp
return check_health();
}).then([] {
return db::schema_tables::save_system_keyspace_schema();
}).then([] {
// #2514 - make sure "system" is written to system_schema.keyspaces.
return db::schema_tables::save_system_schema(NAME);
}).then([] {
return netw::get_messaging_service().invoke_on_all([] (auto& ms){
return ms.init_local_preferred_ip_cache();


@@ -62,6 +62,8 @@ namespace cql3 {
class query_processor;
}
bool is_system_keyspace(const sstring& ks_name);
namespace db {
namespace system_keyspace {
@@ -120,6 +122,18 @@ extern schema_ptr hints();
extern schema_ptr batchlog();
extern schema_ptr built_indexes(); // TODO (from Cassandra): make private
namespace legacy {
schema_ptr keyspaces();
schema_ptr column_families();
schema_ptr columns();
schema_ptr triggers();
schema_ptr usertypes();
schema_ptr functions();
schema_ptr aggregates();
}
table_schema_version generate_schema_version(utils::UUID table_id);
// Only for testing.


@@ -260,6 +260,27 @@ unsigned shard_of(const token& t) {
return global_partitioner().shard_of(t);
}
stdx::optional<dht::token_range>
selective_token_range_sharder::next() {
if (_done) {
return {};
}
while (_range.overlaps(dht::token_range(_start_boundary, {}), dht::token_comparator())
&& !(_start_boundary && _start_boundary->value() == maximum_token())) {
auto end_token = _partitioner.token_for_next_shard(_start_token, _next_shard);
auto candidate = dht::token_range(std::move(_start_boundary), range_bound<dht::token>(end_token, false));
auto intersection = _range.intersection(std::move(candidate), dht::token_comparator());
_start_token = _partitioner.token_for_next_shard(end_token, _shard);
_start_boundary = range_bound<dht::token>(_start_token);
if (intersection) {
return *intersection;
}
}
_done = true;
return {};
}
stdx::optional<ring_position_range_and_shard>
ring_position_range_sharder::next(const schema& s) {
if (_done) {
@@ -462,14 +483,13 @@ int ring_position_comparator::operator()(ring_position_view lh, ring_position_vi
}
}
int ring_position_comparator::operator()(ring_position_view lh, sstables::key_view rh) const {
auto rh_token = global_partitioner().get_token(rh);
auto token_cmp = tri_compare(*lh._token, rh_token);
int ring_position_comparator::operator()(ring_position_view lh, sstables::decorated_key_view rh) const {
auto token_cmp = tri_compare(*lh._token, rh.token());
if (token_cmp) {
return token_cmp;
}
if (lh._key) {
auto rel = rh.tri_compare(s, *lh._key);
auto rel = rh.key().tri_compare(s, *lh._key);
if (rel) {
return -rel;
}
@@ -477,7 +497,7 @@ int ring_position_comparator::operator()(ring_position_view lh, sstables::key_vi
return lh._weight;
}
int ring_position_comparator::operator()(sstables::key_view a, ring_position_view b) const {
int ring_position_comparator::operator()(sstables::decorated_key_view a, ring_position_view b) const {
return -(*this)(b, a);
}


@@ -55,6 +55,7 @@
namespace sstables {
class key_view;
class decorated_key_view;
}
@@ -547,8 +548,8 @@ struct ring_position_comparator {
const schema& s;
ring_position_comparator(const schema& s_) : s(s_) {}
int operator()(ring_position_view, ring_position_view) const;
int operator()(ring_position_view, sstables::key_view) const;
int operator()(sstables::key_view, ring_position_view) const;
int operator()(ring_position_view, sstables::decorated_key_view) const;
int operator()(sstables::decorated_key_view, ring_position_view) const;
};
// "less" comparator giving the same order as ring_position_comparator
@@ -671,6 +672,29 @@ split_ranges_to_shards(const dht::token_range_vector& ranges, const schema& s);
std::vector<partition_range> split_range_to_single_shard(const schema& s, const dht::partition_range& pr, shard_id shard);
std::vector<partition_range> split_range_to_single_shard(const i_partitioner& partitioner, const schema& s, const dht::partition_range& pr, shard_id shard);
class selective_token_range_sharder {
const i_partitioner& _partitioner;
dht::token_range _range;
shard_id _shard;
bool _done = false;
shard_id _next_shard;
dht::token _start_token;
stdx::optional<range_bound<dht::token>> _start_boundary;
public:
explicit selective_token_range_sharder(dht::token_range range, shard_id shard)
: selective_token_range_sharder(global_partitioner(), std::move(range), shard) {}
selective_token_range_sharder(const i_partitioner& partitioner, dht::token_range range, shard_id shard)
: _partitioner(partitioner)
, _range(std::move(range))
, _shard(shard)
, _next_shard(_shard + 1 == _partitioner.shard_count() ? 0 : _shard + 1)
, _start_token(_range.start() ? _range.start()->value() : minimum_token())
, _start_boundary(_partitioner.shard_of(_start_token) == shard ?
_range.start() : range_bound<dht::token>(_partitioner.token_for_next_shard(_start_token, shard))) {
}
stdx::optional<dht::token_range> next();
};
} // dht
namespace std {


@@ -79,13 +79,14 @@ if [ $LOCALRPM -eq 1 ]; then
cd ../..
cp build/scylla-jmx/build/rpms/scylla-jmx-`cat build/scylla-jmx/build/SCYLLA-VERSION-FILE`-`cat build/scylla-jmx/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-jmx.noarch.rpm
fi
if [ ! -f dist/ami/files/scylla-tools.noarch.rpm ]; then
if [ ! -f dist/ami/files/scylla-tools.noarch.rpm ] || [ ! -f dist/ami/files/scylla-tools-core.noarch.rpm ]; then
cd build
git clone --depth 1 https://github.com/scylladb/scylla-tools-java.git
cd scylla-tools-java
sh -x -e dist/redhat/build_rpm.sh
cd ../..
cp build/scylla-tools-java/build/rpms/scylla-tools-`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-tools.noarch.rpm
cp build/scylla-tools-java/build/rpms/scylla-tools-core-`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-tools-core.noarch.rpm
fi
else
sudo apt-get install -y git


@@ -75,13 +75,16 @@ while getopts ":hdncap:q:" opt; do
done
##Check if server is Fedora/Debian release##
cat /etc/os-release | grep fedora &> /dev/null
##Check server release (Fedora/Oracle/Debian)##
cat /etc/os-release | grep -i fedora &> /dev/null
if [ $? -ne 0 ]; then
IS_FEDORA="1"
cat /etc/os-release | grep -i oracle &> /dev/null
if [ $? -ne 0 ]; then
IS_FEDORA="1"
fi
fi
cat /etc/os-release | grep debian &> /dev/null
cat /etc/os-release | grep -i debian &> /dev/null
if [ $? -ne 0 ]; then
IS_DEBIAN="1"
fi
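The detection above chains `grep` calls over `/etc/os-release`, folding Oracle Linux into the Fedora/RHEL family. A condensed, positive-sense sketch of the same classification (the helper name and return strings are illustrative, not part of the patch):

```shell
# Classify a host from the contents of /etc/os-release. The patch treats
# Oracle Linux like Fedora/RHEL, so both match the same family here.
classify_os() {
    local release="$1"   # contents of /etc/os-release
    if echo "$release" | grep -qi -e fedora -e oracle; then
        echo "fedora-family"
    elif echo "$release" | grep -qi debian; then
        echo "debian-family"
    else
        echo "unknown"
    fi
}

classify_os 'NAME="Oracle Linux Server"'   # prints fedora-family
```
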
@@ -91,25 +94,24 @@ if [ "$IS_FEDORA" == "1" ] && [ "$IS_DEBIAN" == "1" ]; then
exit 222
fi
##Pass criteria for script execution##
#Check scylla service#
##Scylla-server service status##
echo "--------------------------------------------------"
echo "Checking Scylla Service"
echo "Checking Scylla-server Service"
echo "--------------------------------------------------"
ps -C scylla --no-headers &> /dev/null
if [ $? -ne 0 ]; then
SCYLLA_SERVICE="1"
echo "ERROR: Scylla is NOT Running"
echo "ERROR: Scylla-server is NOT Running"
echo "Cannot Collect Data Model Info"
echo "--------------------------------------------------"
else
echo "Scylla Service: OK"
echo "Scylla-server Service: OK"
echo "--------------------------------------------------"
fi
#Check Scylla-JMX service#
##Scylla-JMX service status##
echo "Checking Scylla-JMX Service on Port $JMX_PORT"
echo "--------------------------------------------------"
@@ -121,7 +123,7 @@ if [ $? -ne 0 ]; then
echo "Use the '-p' Option to Provide the Scylla-JMX Port"
echo "--------------------------------------------------"
else
echo "JMX Service (nodetool): OK"
echo "Scylla-JMX Service (nodetool): OK"
echo "--------------------------------------------------"
fi
@@ -152,12 +154,12 @@ mkdir -p $OUTPUT_PATH1 $OUTPUT_PATH2 $OUTPUT_PATH3 $OUTPUT_PATH4 $OUTPUT_PATH5
#System Checks#
echo "Collecting System Info"
echo "--------------------------------------------------"
cat /etc/os-release > $OUTPUT_PATH1/os-release.txt
cp -p /etc/os-release $OUTPUT_PATH1
uname -r > $OUTPUT_PATH1/kernel-release.txt
lscpu > $OUTPUT_PATH1/cpu-info.txt
vmstat -s -S M | awk '{$1=$1};1' > $OUTPUT_PATH1/vmstat.txt
df -Th > $OUTPUT_PATH1/capacity-info.txt && echo "" >> $OUTPUT_PATH1/capacity-info.txt && sudo du -sh /var/lib/scylla/* >> $OUTPUT_PATH1/capacity-info.txt
cat /proc/mdstat > $OUTPUT_PATH1/raid-conf.txt
cp -p /proc/mdstat $OUTPUT_PATH1
for f in `sudo find /sys -name scheduler`; do echo -n "$f: "; cat $f; done > $OUTPUT_PATH1/io-sched-conf.txt && echo "" >> $OUTPUT_PATH1/io-sched-conf.txt
for f in `sudo find /sys -name nomerges`; do echo -n "$f: "; cat $f; done >> $OUTPUT_PATH1/io-sched-conf.txt
@@ -166,30 +168,23 @@ for f in `sudo find /sys -name nomerges`; do echo -n "$f: "; cat $f; done >> $O
echo "Collecting Scylla Info"
echo "--------------------------------------------------"
scylla --version > $OUTPUT_PATH2/scylla-version.txt
cp -p /etc/scylla/* $OUTPUT_PATH2
ls -ltrh /var/lib/scylla/coredump/ > $OUTPUT_PATH2/coredump-folder.txt
if [ "$IS_FEDORA" == "0" ]; then
rpm -qa | grep -i scylla > $OUTPUT_PATH2/scylla-pkgs.txt
cp -p /etc/sysconfig/scylla-server $OUTPUT_PATH2
fi
if [ "$IS_DEBIAN" == "0" ]; then
dpkg -l | grep -i scylla > $OUTPUT_PATH2/scylla-pkgs.txt
cp -p /etc/default/scylla-server $OUTPUT_PATH2
fi
curl -s -X GET "http://localhost:10000/storage_service/scylla_release_version" > $OUTPUT_PATH2/scylla-version.txt && echo "" >> $OUTPUT_PATH2/scylla-version.txt
cat /etc/scylla/scylla.yaml | grep -v "#" | grep -v "^[[:space:]]*$" > $OUTPUT_PATH2/scylla-yaml.txt
if [ "$IS_FEDORA" == "0" ]; then
cat /etc/sysconfig/scylla-server | grep -v "^[[:space:]]*$" > $OUTPUT_PATH2/scylla-server.txt
fi
if [ "$IS_DEBIAN" == "0" ]; then
cat /etc/default/scylla-server | grep -v "^[[:space:]]*$" > $OUTPUT_PATH2/scylla-server.txt
fi
cat /etc/scylla/cassandra-rackdc.properties | grep -v "#" |grep -v "^[[:space:]]*$" > $OUTPUT_PATH2/multi-DC.txt
ls -ltrh /var/lib/scylla/coredump/ > $OUTPUT_PATH2/coredump-folder.txt
#Scylla Logs#
echo "--------------------------------------------------"
echo "Collecting Logs"
echo "--------------------------------------------------"
@@ -256,7 +251,7 @@ for i in `ls -I lo /sys/class/net/`; do echo "--$i"; cat /sys/class/net/$i/queue
for i in `ls -I lo /sys/class/net/`; do echo "--$i"; cat /sys/class/net/$i/queues/rx-*/rps_flow_cnt; echo ""; done > $OUTPUT_PATH5/rfs-conf.txt
ps -elf | grep irqbalance > $OUTPUT_PATH5/irqbalance-conf.txt
sudo sysctl -a > $OUTPUT_PATH5/sysctl.txt 2>&1
sudo iptables -L > $OUTPUT_PATH5/iptables.txt
sudo iptables -L -v > $OUTPUT_PATH5/iptables.txt
netstat -an | grep tcp > $OUTPUT_PATH5/netstat-tcp.txt
@@ -297,7 +292,7 @@ echo "" >> $REPORT
echo "Host Operating System" >> $REPORT
echo "---------------------" >> $REPORT
cat $OUTPUT_PATH1/os-release.txt >> $REPORT
cat $OUTPUT_PATH1/os-release >> $REPORT
echo "" >> $REPORT
echo "" >> $REPORT
@@ -327,7 +322,7 @@ echo "" >> $REPORT
echo "RAID Configuration" >> $REPORT
echo "------------------" >> $REPORT
cat $OUTPUT_PATH1/raid-conf.txt >> $REPORT
cat $OUTPUT_PATH1/mdstat >> $REPORT
echo "" >> $REPORT
echo "" >> $REPORT
@@ -354,7 +349,7 @@ echo "" >> $REPORT
echo "Configuration files" >> $REPORT
echo "-------------------" >> $REPORT
echo "## /etc/scylla/scylla.yaml ##" >> $REPORT
cat $OUTPUT_PATH2/scylla-yaml.txt >> $REPORT
cat $OUTPUT_PATH2/scylla.yaml | grep -v "#" | grep -v "^[[:space:]]*$" >> $REPORT
echo "" >> $REPORT
echo "" >> $REPORT
@@ -366,11 +361,11 @@ if [ "$IS_DEBIAN" == "0" ]; then
echo "## /etc/default/scylla-server ##" >> $REPORT
fi
cat $OUTPUT_PATH2/scylla-server.txt >> $REPORT
cat $OUTPUT_PATH2/scylla-server | grep -v "^[[:space:]]*$" >> $REPORT
echo "" >> $REPORT
echo "" >> $REPORT
echo "## /etc/scylla/cassandra-rackdc.properties ##" >> $REPORT
cat $OUTPUT_PATH2/multi-DC.txt >> $REPORT
cat $OUTPUT_PATH2/cassandra-rackdc.properties | grep -v "#" |grep -v "^[[:space:]]*$" >> $REPORT
echo "" >> $REPORT
echo "" >> $REPORT


@@ -4,6 +4,10 @@
. /usr/lib/scylla/scylla_lib.sh
if [ ! -f /sys/devices/system/cpu/cpufreq/policy0/scaling_governor ]; then
echo "This computer doesn't support CPU scaling configuration."
exit 0
fi
if is_debian_variant; then
apt-get install -y cpufrequtils
service cpufrequtils stop


@@ -2,6 +2,8 @@
#
# Copyright (C) 2016 ScyllaDB
. /usr/lib/scylla/scylla_lib.sh
print_usage() {
echo "scylla_cpuset_setup --cpuset 1-7 --smp 7"
echo " --cpuset CPUs to use (in cpuset(7) format; default: all))"
@@ -38,5 +40,6 @@ fi
if [ "$SMP" != "" ]; then
OUT="$OUT--smp $SMP "
fi
rm -f /etc/scylla.d/perftune.yaml
OUT="$OUT\""
echo $OUT > /etc/scylla.d/cpuset.conf


@@ -38,6 +38,51 @@ ec2_is_supported_instance_type() {
esac
}
#
# get_tune_mode <NIC name>
#
get_tune_mode() {
local nic=$1
# if cpuset.conf doesn't exist use the default mode
[[ ! -e '/etc/scylla.d/cpuset.conf' ]] && return
local cur_cpuset=`cat /etc/scylla.d/cpuset.conf | cut -d "\"" -f2- | cut -d" " -f2`
local mq_cpuset=`/usr/lib/scylla/perftune.py --tune net --nic "$nic" --mode mq --get-cpu-mask | /usr/lib/scylla/hex2list.py`
local sq_cpuset=`/usr/lib/scylla/perftune.py --tune net --nic "$nic" --mode sq --get-cpu-mask | /usr/lib/scylla/hex2list.py`
local sq_split_cpuset=`/usr/lib/scylla/perftune.py --tune net --nic "$nic" --mode sq_split --get-cpu-mask | /usr/lib/scylla/hex2list.py`
local tune_mode=""
case "$cur_cpuset" in
"$mq_cpuset")
tune_mode="--mode mq"
;;
"$sq_cpuset")
tune_mode="--mode sq"
;;
"$sq_split_cpuset")
tune_mode="--mode sq_split"
;;
esac
# if cpuset is something different from what we expect - use the default mode
echo "$tune_mode"
}
#
# create_perftune_conf [<NIC name>]
#
create_perftune_conf() {
local nic=$1
[[ -z "$nic" ]] && nic='eth0'
# if exists - do nothing
[[ -e '/etc/scylla.d/perftune.yaml' ]] && return
local mode=`get_tune_mode "$nic"`
/usr/lib/scylla/perftune.py --tune net --nic "$nic" $mode --dump-options-file > /etc/scylla.d/perftune.yaml
}
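`get_tune_mode` above recovers the perftune mode by comparing the current cpuset against the mask each mode would produce. The matching step can be sketched in isolation like this (the CPU lists used below are made-up placeholders, not real masks):

```shell
# Given the currently configured CPU list and the lists that the mq, sq and
# sq_split modes would produce, return the matching --mode flag, or nothing
# when the layout is unrecognized (perftune.py then uses its default).
match_tune_mode() {
    local cur="$1" mq="$2" sq="$3" sq_split="$4"
    case "$cur" in
        "$mq")        echo "--mode mq" ;;
        "$sq")        echo "--mode sq" ;;
        "$sq_split")  echo "--mode sq_split" ;;
        *)            echo "" ;;
    esac
}

match_tune_mode "1-7" "1-7" "7" "6-7"   # prints --mode mq
```

Quoting the variables in the `case` patterns forces literal comparison, so a CPU list like `1-7` is never treated as a glob.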
. /etc/os-release
if is_debian_variant || is_gentoo_variant; then
SYSCONFIG=/etc/default


@@ -22,7 +22,8 @@ elif [ "$NETWORK_MODE" = "dpdk" ]; then
done
else # NETWORK_MODE = posix
if [ "$SET_NIC" = "yes" ]; then
/usr/lib/scylla/posix_net_conf.sh $IFNAME
create_perftune_conf "$IFNAME"
/usr/lib/scylla/posix_net_conf.sh $IFNAME --options-file /etc/scylla.d/perftune.yaml
fi
fi
if [ "$ID" = "ubuntu" ]; then


@@ -104,7 +104,11 @@ else
mdadm --create --verbose --force --run $RAID --level=0 -c1024 --raid-devices=$NR_DISK $DISKS
mkfs.xfs $RAID -f -K
fi
mdadm --detail --scan > /etc/mdadm.conf
if is_debian_variant; then
mdadm --detail --scan > /etc/mdadm/mdadm.conf
else
mdadm --detail --scan > /etc/mdadm.conf
fi
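The hunk above writes the RAID scan output to the distro-appropriate location; the path choice alone can be sketched as (helper name is illustrative):

```shell
# mdadm looks for its config at /etc/mdadm/mdadm.conf on Debian derivatives
# and at /etc/mdadm.conf on Red Hat derivatives; pick the path accordingly.
mdadm_conf_path() {
    # $1: "debian" for Debian/Ubuntu variants, anything else otherwise
    if [ "$1" = "debian" ]; then
        echo "/etc/mdadm/mdadm.conf"
    else
        echo "/etc/mdadm.conf"
    fi
}
```
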
mkdir -p "$MOUNT_AT"
mount -t xfs -o noatime $RAID "$MOUNT_AT"
@@ -122,3 +126,7 @@ if [ $FSTAB -ne 0 ]; then
UUID=`blkid $RAID | awk '{print $2}'`
echo "$UUID $MOUNT_AT xfs noatime 0 0" >> /etc/fstab
fi
if is_debian_variant; then
update-initramfs -u
fi


@@ -75,7 +75,7 @@ verify_package() {
if is_debian_variant; then
dpkg -s $1 > /dev/null 2>&1 &&:
elif is_gentoo_variant; then
find /var/db/pkg/dev-db -type d -name "${1}-*" | egrep -q ".*"
find /var/db/pkg/app-admin -type d -name "${1}-*" | egrep -q ".*"
else
rpm -q $1 > /dev/null 2>&1 &&:
fi


@@ -1,6 +1,6 @@
[Unit]
Description=Scylla Server
After=network.target
After=network-online.target
Wants=scylla-jmx.service
Wants=scylla-housekeeping-restart.timer
Wants=scylla-housekeeping-daily.timer


@@ -129,9 +129,11 @@ sed -i -e "s/@@CODENAME@@/$TARGET/g" debian/changelog
cp dist/debian/rules.in debian/rules
cp dist/debian/control.in debian/control
cp dist/debian/scylla-server.install.in debian/scylla-server.install
cp dist/debian/scylla-conf.preinst.in debian/scylla-conf.preinst
sed -i -e "s/@@VERSION@@/$SCYLLA_VERSION/g" debian/scylla-conf.preinst
if [ "$TARGET" = "jessie" ]; then
cp dist/debian/scylla-server.cron.d debian/
sed -i -e "s/@@REVISION@@/1/g" debian/changelog
sed -i -e "s/@@REVISION@@/1~$TARGET/g" debian/changelog
sed -i -e "s/@@DH_INSTALLINIT@@//g" debian/rules
sed -i -e "s/@@COMPILER@@/g++-5/g" debian/rules
sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++-5, libunwind-dev/g" debian/control
@@ -145,7 +147,7 @@ if [ "$TARGET" = "jessie" ]; then
sed -i -e "s#@@SCRIPTS_DELAY_FSTRIM@@#dist/debian/scripts/scylla_delay_fstrim usr/lib/scylla#g" debian/scylla-server.install
elif [ "$TARGET" = "stretch" ] || [ "$TARGET" = "buster" ] || [ "$TARGET" = "sid" ]; then
cp dist/debian/scylla-server.cron.d debian/
sed -i -e "s/@@REVISION@@/1/g" debian/changelog
sed -i -e "s/@@REVISION@@/1~$TARGET/g" debian/changelog
sed -i -e "s/@@DH_INSTALLINIT@@//g" debian/rules
sed -i -e "s/@@COMPILER@@/g++/g" debian/rules
sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++, libunwind8-dev/g" debian/control
@@ -159,7 +161,7 @@ elif [ "$TARGET" = "stretch" ] || [ "$TARGET" = "buster" ] || [ "$TARGET" = "sid
sed -i -e "s#@@SCRIPTS_DELAY_FSTRIM@@#dist/debian/scripts/scylla_delay_fstrim usr/lib/scylla#g" debian/scylla-server.install
elif [ "$TARGET" = "trusty" ]; then
cp dist/debian/scylla-server.cron.d debian/
sed -i -e "s/@@REVISION@@/0ubuntu1/g" debian/changelog
sed -i -e "s/@@REVISION@@/0ubuntu1~$TARGET/g" debian/changelog
sed -i -e "s/@@DH_INSTALLINIT@@/--upstart-only/g" debian/rules
sed -i -e "s/@@COMPILER@@/g++-5/g" debian/rules
sed -i -e "s/@@BUILD_DEPENDS@@/g++-5, libunwind8-dev/g" debian/control
@@ -172,7 +174,7 @@ elif [ "$TARGET" = "trusty" ]; then
sed -i -e "s#@@SCRIPTS_FSTRIM@@#dist/debian/scripts/scylla_fstrim usr/lib/scylla#g" debian/scylla-server.install
sed -i -e "s#@@SCRIPTS_DELAY_FSTRIM@@#dist/debian/scripts/scylla_delay_fstrim usr/lib/scylla#g" debian/scylla-server.install
elif [ "$TARGET" = "xenial" ] || [ "$TARGET" = "yakkety" ] || [ "$TARGET" = "zesty" ] || [ "$TARGET" = "artful" ]; then
sed -i -e "s/@@REVISION@@/0ubuntu1/g" debian/changelog
sed -i -e "s/@@REVISION@@/0ubuntu1~$TARGET/g" debian/changelog
sed -i -e "s/@@DH_INSTALLINIT@@//g" debian/rules
sed -i -e "s/@@COMPILER@@/g++/g" debian/rules
sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++, libunwind-dev/g" debian/control


@@ -7,7 +7,8 @@ KVER=$(uname -r)
if [[ $KVER =~ 3\.13\.0\-([0-9]+)-generic ]]; then
echo "kernel $KVER detected, skip running sysctl..."
else
sysctl -p/etc/sysctl.d/99-scylla-sched.conf
# expect failures in virtualized environments
sysctl -p/etc/sysctl.d/99-scylla-sched.conf || :
fi
#DEBHELPER#


@@ -3,12 +3,22 @@
set -e
if [ "$1" = configure ]; then
adduser --system \
--quiet \
--home /var/lib/scylla \
--no-create-home \
--disabled-password \
--group scylla
getent passwd scylla || NOUSR=1
getent group scylla || NOGRP=1
# handles both cases: the group doesn't exist || the group already exists
if [ $NOUSR ]; then
adduser --system \
--quiet \
--home /var/lib/scylla \
--no-create-home \
--disabled-password \
--group scylla
# only the group is missing: create it and add the user to it
elif [ $NOGRP ]; then
addgroup --system scylla
adduser scylla scylla
fi
chown -R scylla:scylla /var/lib/scylla
chown -R scylla:scylla /var/lib/scylla-housekeeping
fi
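The postinst hunk above replaces an unconditional `adduser` with a `getent`-guarded one, so re-installation no longer fails when the user already exists. The branch logic can be sketched as a pure decision function (names and return strings are illustrative):

```shell
# Decide what the maintainer script must create, given whether the user
# and group already exist (1 = missing, mirroring the NOUSR/NOGRP flags
# that getent sets in the patch).
user_action() {
    local nousr="$1" nogrp="$2"
    if [ "$nousr" = "1" ]; then
        echo "create-user-and-group"   # adduser --system --group makes both
    elif [ "$nogrp" = "1" ]; then
        echo "create-group-and-join"   # addgroup, then adduser user group
    else
        echo "nothing-to-do"
    fi
}
```
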

dist/debian/scylla-conf.preinst.in vendored Normal file

@@ -0,0 +1,28 @@
#!/bin/bash
ver=$(dpkg -l|grep scylla-server|awk '{print $3}'|sed -e "s/-.*$//")
if [ -n "$ver" ]; then
ver_fmt=$(echo $ver | awk -F. '{printf "%d%02d%02d", $1,$2,$3}')
if [ $ver_fmt -lt 10703 ]; then
# for <scylla-1.2
if [ ! -f /usr/lib/scylla/scylla_config_get.py ]; then
echo
echo "Error: Upgrading from scylla-$ver to scylla-@@VERSION@@ is not supported."
echo "Please upgrade to scylla-1.7.3 or later, before upgrading to @@VERSION@@."
echo
exit 1
fi
commitlog_directory=$(/usr/lib/scylla/scylla_config_get.py -g commitlog_directory)
commitlog_files=$(ls $commitlog_directory | wc -l)
if [ $commitlog_files -ne 0 ]; then
echo
echo "Error: Upgrading from scylla-$ver to scylla-@@VERSION@@ is not supported when commitlog is not clean."
echo "Please upgrade to scylla-1.7.3 or later, before upgrading to @@VERSION@@."
echo "Also make sure $commitlog_directory is empty."
echo
exit 1
fi
fi
fi
#DEBHELPER#
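The `ver_fmt` conversion in the preinst above packs a dotted version into a comparable integer; the threshold `10703` corresponds to scylla-1.7.3. A minimal sketch of the scheme:

```shell
# Pack "major.minor.patch" into an integer: 1.7.3 -> 10703, 2.0.0 -> 20000.
# Two digits per component, so numeric comparisons like -lt order versions
# correctly as long as minor/patch stay below 100.
ver_to_int() {
    echo "$1" | awk -F. '{printf "%d%02d%02d", $1, $2, $3}'
}

old=$(ver_to_int 1.7.3)   # 10703
new=$(ver_to_int 2.0.0)   # 20000
if [ "$old" -lt "$new" ]; then
    echo "upgrade path supported"
fi
```
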


@@ -7,7 +7,7 @@ ENV container docker
VOLUME [ "/sys/fs/cgroup" ]
#install scylla
RUN curl http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo -o /etc/yum.repos.d/scylla.repo
RUN curl http://downloads.scylladb.com/rpm/centos/scylla-2.0.repo -o /etc/yum.repos.d/scylla.repo
RUN yum -y install epel-release
RUN yum -y clean expire-cache
RUN yum -y update


@@ -70,5 +70,7 @@ class ScyllaSetup:
if self._experimental == "1":
args += [ "--experimental=on" ]
args += ["--blocked-reactor-notify-ms 999999999"]
with open("/etc/scylla.d/docker.conf", "w") as cqlshrc:
cqlshrc.write("SCYLLA_DOCKER_ARGS=\"%s\"\n" % " ".join(args))
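The ScyllaSetup hunk above collects server flags and serializes them into `/etc/scylla.d/docker.conf` (the file handle is still named `cqlshrc`, a leftover name). A hedged sketch of that serialization step:

```python
def write_docker_conf(path, args):
    # Join the collected scylla arguments into the single
    # SCYLLA_DOCKER_ARGS line that the container start script sources.
    line = 'SCYLLA_DOCKER_ARGS="%s"\n' % " ".join(args)
    with open(path, "w") as f:
        f.write(line)
    return line
```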


@@ -7,7 +7,7 @@ Group: Applications/Databases
License: AGPLv3
URL: http://www.scylladb.com/
Source0: %{name}-@@VERSION@@-@@RELEASE@@.tar
Requires: scylla-server scylla-jmx scylla-tools scylla-kernel-conf
Requires: scylla-server = @@VERSION@@ scylla-jmx = @@VERSION@@ scylla-tools = @@VERSION@@ scylla-kernel-conf = @@VERSION@@
Obsoletes: scylla-server < 1.1
%description


@@ -62,6 +62,7 @@ static const std::map<application_state, sstring> application_state_names = {
{application_state::TOKENS, "TOKENS"},
{application_state::SUPPORTED_FEATURES, "SUPPORTED_FEATURES"},
{application_state::CACHE_HITRATES, "CACHE_HITRATES"},
{application_state::SCHEMA_TABLES_VERSION, "SCHEMA_TABLES_VERSION"},
};
std::ostream& operator<<(std::ostream& os, const application_state& m) {


@@ -59,8 +59,8 @@ enum class application_state {
TOKENS,
SUPPORTED_FEATURES,
CACHE_HITRATES,
SCHEMA_TABLES_VERSION,
// pad to allow adding new states to existing cluster
X3,
X4,
X5,
X6,


@@ -43,6 +43,7 @@
#include "gms/endpoint_state.hh"
#include "gms/application_state.hh"
#include "gms/inet_address.hh"
#include "service/storage_service.hh"
#include "log.hh"
#include <iostream>
#include <chrono>
@@ -56,37 +57,13 @@ constexpr std::chrono::milliseconds failure_detector::DEFAULT_MAX_PAUSE;
using clk = arrival_window::clk;
static clk::duration get_initial_value() {
#if 0
String newvalue = System.getProperty("cassandra.fd_initial_value_ms");
if (newvalue == null)
{
return Gossiper.intervalInMillis * 2;
}
else
{
logger.info("Overriding FD INITIAL_VALUE to {}ms", newvalue);
return Integer.parseInt(newvalue);
}
#endif
warn(unimplemented::cause::GOSSIP);
return std::chrono::seconds(2);
auto& cfg = service::get_local_storage_service().db().local().get_config();
return std::chrono::milliseconds(cfg.fd_initial_value_ms());
}
clk::duration arrival_window::get_max_interval() {
#if 0
sstring newvalue = System.getProperty("cassandra.fd_max_interval_ms");
if (newvalue == null)
{
return failure_detector.INITIAL_VALUE_NANOS;
}
else
{
logger.info("Overriding FD MAX_INTERVAL to {}ms", newvalue);
return TimeUnit.NANOSECONDS.convert(Integer.parseInt(newvalue), TimeUnit.MILLISECONDS);
}
#endif
warn(unimplemented::cause::GOSSIP);
return get_initial_value();
auto& cfg = service::get_local_storage_service().db().local().get_config();
return std::chrono::milliseconds(cfg.fd_max_interval_ms());
}
void arrival_window::add(clk::time_point value, const gms::inet_address& ep) {
@@ -95,7 +72,7 @@ void arrival_window::add(clk::time_point value, const gms::inet_address& ep) {
if (inter_arrival_time <= get_max_interval()) {
_arrival_intervals.add(inter_arrival_time.count());
} else {
logger.debug("failure_detector: Ignoring interval time of {} for {}", inter_arrival_time.count(), ep);
logger.debug("failure_detector: Ignoring interval time of {} for {}, mean={}, size={}", inter_arrival_time.count(), ep, mean(), size());
}
} else {
// We use a very large initial interval since the "right" average depends on the cluster size


@@ -87,6 +87,8 @@ public:
// see CASSANDRA-2597 for an explanation of the math at work here.
double phi(clk::time_point tnow);
size_t size() { return _arrival_intervals.size(); }
friend std::ostream& operator<<(std::ostream& os, const arrival_window& w);
};


@@ -21,6 +21,8 @@
#pragma once
#include <seastar/core/shared_future.hh>
namespace gms {
/**
@@ -31,19 +33,16 @@ namespace gms {
*/
class feature final {
sstring _name;
bool _enabled;
bool _enabled = false;
mutable shared_promise<> _pr;
friend class gossiper;
public:
explicit feature(sstring name, bool enabled = false);
feature() = default;
~feature();
feature()
: _enabled(false)
{ }
feature(const feature& other)
: feature(other._name, other._enabled)
{ }
feature(const feature& other) = delete;
void enable();
feature& operator=(feature other);
feature& operator=(feature&& other);
const sstring& name() const {
return _name;
}
@@ -53,6 +52,7 @@ public:
friend inline std::ostream& operator<<(std::ostream& os, const feature& f) {
return os << "{ gossip feature = " << f._name << " }";
}
future<> when_enabled() const { return _pr.get_shared_future(); }
};
} // namespace gms
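The new `when_enabled()` above returns a `shared_future` that resolves once the feature turns on, which is what lets abandoned schema pulls be retried when the feature is finally enabled. A minimal sketch of the same gate, using `threading.Event` in place of seastar's `shared_promise` (class and method names mirror the diff but the Python model is illustrative):

```python
import threading

class Feature:
    """A named cluster feature that observers can wait on."""

    def __init__(self, name, enabled=False):
        self.name = name
        self._event = threading.Event()
        if enabled:
            # Mirrors the constructor setting the promise immediately
            # when the feature starts out enabled.
            self._event.set()

    @property
    def enabled(self):
        return self._event.is_set()

    def enable(self):
        # Setting an already-set event is a no-op, like the
        # `if (!_enabled)` guard added to feature::enable().
        self._event.set()

    def when_enabled(self, timeout=None):
        # Blocks until the feature is enabled; cf. _pr.get_shared_future().
        return self._event.wait(timeout)
```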


@@ -68,7 +68,11 @@ public:
return _digests;
}
std::map<inet_address, endpoint_state> get_endpoint_state_map() const {
std::map<inet_address, endpoint_state>& get_endpoint_state_map() {
return _map;
}
const std::map<inet_address, endpoint_state>& get_endpoint_state_map() const {
return _map;
}


@@ -59,6 +59,7 @@
#include <chrono>
#include "dht/i_partitioner.hh"
#include <boost/range/algorithm/set_algorithm.hpp>
#include <boost/range/adaptors.hpp>
namespace gms {
@@ -222,13 +223,13 @@ future<> gossiper::handle_ack_msg(msg_addr id, gossip_digest_ack ack_msg) {
}
auto g_digest_list = ack_msg.get_gossip_digest_list();
auto ep_state_map = ack_msg.get_endpoint_state_map();
auto& ep_state_map = ack_msg.get_endpoint_state_map();
auto f = make_ready_future<>();
if (ep_state_map.size() > 0) {
/* Notify the Failure Detector */
this->notify_failure_detector(ep_state_map);
f = this->apply_state_locally(ep_state_map);
f = this->apply_state_locally(std::move(ep_state_map));
}
return f.then([id, g_digest_list = std::move(g_digest_list), this] {
@@ -268,7 +269,7 @@ future<> gossiper::handle_ack2_msg(gossip_digest_ack2 msg) {
auto& remote_ep_state_map = msg.get_endpoint_state_map();
/* Notify the Failure Detector */
notify_failure_detector(remote_ep_state_map);
return apply_state_locally(remote_ep_state_map);
return apply_state_locally(std::move(remote_ep_state_map));
}
future<> gossiper::handle_echo_msg() {
@@ -370,7 +371,7 @@ future<> gossiper::send_gossip(gossip_digest_syn message, std::set<inet_address>
}
void gossiper::notify_failure_detector(inet_address endpoint, endpoint_state remote_endpoint_state) {
void gossiper::notify_failure_detector(inet_address endpoint, const endpoint_state& remote_endpoint_state) {
/*
* If the local endpoint state exists then report to the FD only
* if the versions workout.
@@ -405,59 +406,65 @@ void gossiper::notify_failure_detector(inet_address endpoint, endpoint_state rem
}
}
future<> gossiper::apply_state_locally(const std::map<inet_address, endpoint_state>& map) {
return seastar::async([this, g = this->shared_from_this(), map] () mutable {
for (auto& entry : map) {
const auto& ep = entry.first;
if (ep == get_broadcast_address() && !is_in_shadow_round()) {
continue;
}
if (_just_removed_endpoints.count(ep)) {
logger.trace("Ignoring gossip for {} because it is quarantined", ep);
continue;
}
/*
If state does not exist just add it. If it does then add it if the remote generation is greater.
If there is a generation tie, attempt to break it by heartbeat version.
*/
const endpoint_state& remote_state = entry.second;
auto it = endpoint_state_map.find(ep);
if (it != endpoint_state_map.end()) {
endpoint_state& local_ep_state_ptr = it->second;
int local_generation = local_ep_state_ptr.get_heart_beat_state().get_generation();
int remote_generation = remote_state.get_heart_beat_state().get_generation();
logger.trace("{} local generation {}, remote generation {}", ep, local_generation, remote_generation);
// }
if (local_generation != 0 && remote_generation > local_generation + MAX_GENERATION_DIFFERENCE) {
// assume some peer has corrupted memory and is broadcasting an unbelievable generation about another peer (or itself)
logger.warn("received an invalid gossip generation for peer {}; local generation = {}, received generation = {}",
ep, local_generation, remote_generation);
} else if (remote_generation > local_generation) {
logger.trace("Updating heartbeat state generation to {} from {} for {}", remote_generation, local_generation, ep);
// major state change will handle the update by inserting the remote state directly
handle_major_state_change(ep, remote_state);
} else if (remote_generation == local_generation) { //generation has not changed, apply new states
/* find maximum state */
int local_max_version = get_max_endpoint_state_version(local_ep_state_ptr);
int remote_max_version = get_max_endpoint_state_version(remote_state);
if (remote_max_version > local_max_version) {
// apply states, but do not notify since there is no major change
apply_new_states(ep, local_ep_state_ptr, remote_state);
future<> gossiper::apply_state_locally(std::map<inet_address, endpoint_state> map) {
return seastar::with_semaphore(_apply_state_locally_semaphore, 1, [this, g = this->shared_from_this(), map = std::move(map)] {
return seastar::async([this, g, map = std::move(map)] () mutable {
auto endpoints = boost::copy_range<std::vector<inet_address>>(map | boost::adaptors::map_keys);
std::shuffle(endpoints.begin(), endpoints.end(), _random_engine);
auto node_is_seed = [this] (gms::inet_address ip) { return is_seed(ip); };
boost::partition(endpoints, node_is_seed);
logger.debug("apply_state_locally_endpoints={}", endpoints);
for (auto& ep : endpoints) {
if (ep == get_broadcast_address() && !is_in_shadow_round()) {
continue;
}
if (_just_removed_endpoints.count(ep)) {
logger.trace("Ignoring gossip for {} because it is quarantined", ep);
continue;
}
/*
If state does not exist just add it. If it does then add it if the remote generation is greater.
If there is a generation tie, attempt to break it by heartbeat version.
*/
const endpoint_state& remote_state = map[ep];
auto it = endpoint_state_map.find(ep);
if (it != endpoint_state_map.end()) {
endpoint_state& local_ep_state_ptr = it->second;
int local_generation = local_ep_state_ptr.get_heart_beat_state().get_generation();
int remote_generation = remote_state.get_heart_beat_state().get_generation();
logger.trace("{} local generation {}, remote generation {}", ep, local_generation, remote_generation);
if (local_generation != 0 && remote_generation > local_generation + MAX_GENERATION_DIFFERENCE) {
// assume some peer has corrupted memory and is broadcasting an unbelievable generation about another peer (or itself)
logger.warn("received an invalid gossip generation for peer {}; local generation = {}, received generation = {}",
ep, local_generation, remote_generation);
} else if (remote_generation > local_generation) {
logger.trace("Updating heartbeat state generation to {} from {} for {}", remote_generation, local_generation, ep);
// major state change will handle the update by inserting the remote state directly
handle_major_state_change(ep, remote_state);
} else if (remote_generation == local_generation) { //generation has not changed, apply new states
/* find maximum state */
int local_max_version = get_max_endpoint_state_version(local_ep_state_ptr);
int remote_max_version = get_max_endpoint_state_version(remote_state);
if (remote_max_version > local_max_version) {
// apply states, but do not notify since there is no major change
apply_new_states(ep, local_ep_state_ptr, remote_state);
} else {
logger.trace("Ignoring remote version {} <= {} for {}", remote_max_version, local_max_version, ep);
}
if (!local_ep_state_ptr.is_alive() && !is_dead_state(local_ep_state_ptr)) { // unless of course, it was dead
mark_alive(ep, local_ep_state_ptr);
}
} else {
logger.trace("Ignoring remote version {} <= {} for {}", remote_max_version, local_max_version, ep);
}
if (!local_ep_state_ptr.is_alive() && !is_dead_state(local_ep_state_ptr)) { // unless of course, it was dead
mark_alive(ep, local_ep_state_ptr);
logger.trace("Ignoring remote generation {} < {}", remote_generation, local_generation);
}
} else {
logger.trace("Ignoring remote generation {} < {}", remote_generation, local_generation);
// this is a new node, report it to the FD in case it is the first time we are seeing it AND it's not alive
get_local_failure_detector().report(ep);
handle_major_state_change(ep, remote_state);
}
} else {
// this is a new node, report it to the FD in case it is the first time we are seeing it AND it's not alive
get_local_failure_detector().report(ep);
handle_major_state_change(ep, remote_state);
}
}
});
});
}
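The reworked `apply_state_locally` serializes concurrent invocations behind a semaphore and processes endpoints in randomized order with seed nodes first. The ordering step can be sketched as follows (a stable sort stands in for `boost::partition`, which does not itself guarantee stability):

```python
import random

def ordered_endpoints(endpoints, seeds, seed=None):
    # Randomize to avoid always applying state in the same order,
    # then move seed nodes to the front so they are processed first.
    rng = random.Random(seed)
    eps = list(endpoints)
    rng.shuffle(eps)
    eps.sort(key=lambda ep: ep not in seeds)  # stable: seeds first
    return eps
```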
@@ -590,8 +597,11 @@ void gossiper::run() {
/* Gossip to some random live members */
// TODO: For now, we choose 10th of all the nodes in the cluster.
auto nr_live_nodes = std::max(size_t(1), endpoint_state_map.size() / 10);
nr_live_nodes = std::min(nr_live_nodes, _live_endpoints.size());
std::unordered_set<gms::inet_address> live_nodes;
while (live_nodes.size() < nr_live_nodes && !_live_endpoints.empty()) {
logger.debug("nr_live_nodes={}, endpoint_state_map.size()={}, live_endpoints.size={}",
nr_live_nodes, endpoint_state_map.size(), _live_endpoints.size());
while (live_nodes.size() < nr_live_nodes && nr_live_nodes <= _live_endpoints.size()) {
if (!_live_endpoints_just_added.empty()) {
auto ep = _live_endpoints_just_added.front();
_live_endpoints_just_added.pop_front();
@@ -723,6 +733,10 @@ bool gossiper::seen_any_seed() {
return false;
}
bool gossiper::is_seed(const gms::inet_address& endpoint) const {
return _seeds.count(endpoint);
}
void gossiper::register_(shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
_subscribers.push_back(subscriber);
}
@@ -1149,7 +1163,7 @@ int gossiper::compare_endpoint_startup(inet_address addr1, inet_address addr2) {
return ep1->get_heart_beat_state().get_generation() - ep2->get_heart_beat_state().get_generation();
}
void gossiper::notify_failure_detector(std::map<inet_address, endpoint_state> remoteEpStateMap) {
void gossiper::notify_failure_detector(const std::map<inet_address, endpoint_state>& remoteEpStateMap) {
for (auto& entry : remoteEpStateMap) {
notify_failure_detector(entry.first, entry.second);
}
@@ -1174,26 +1188,27 @@ void gossiper::mark_alive(inet_address addr, endpoint_state& local_state) {
local_state.mark_dead();
msg_addr id = get_msg_addr(addr);
logger.trace("Sending a EchoMessage to {}", id);
try {
ms().send_gossip_echo(id).get();
ms().send_gossip_echo(id).then([this, addr] {
logger.trace("Got EchoMessage Reply");
set_last_processed_message_at();
// After sending echo message, the node might not be in the
// endpoint_state_map anymore; using the reference to local_state
// might cause use-after-free
auto it = endpoint_state_map.find(addr);
if (it == endpoint_state_map.end()) {
logger.info("Node {} is not in endpoint_state_map anymore", addr);
} else {
endpoint_state& state = it->second;
logger.debug("Mark Node {} alive after EchoMessage", addr);
real_mark_alive(addr, state);
}
} catch(...) {
logger.warn("Fail to send EchoMessage to {}: {}", id, std::current_exception());
}
_pending_mark_alive_endpoints.erase(addr);
return seastar::async([this, addr] {
// After sending echo message, the node might not be in the
// endpoint_state_map anymore; using the reference to local_state
// might cause use-after-free
auto it = endpoint_state_map.find(addr);
if (it == endpoint_state_map.end()) {
logger.info("Node {} is not in endpoint_state_map anymore", addr);
} else {
endpoint_state& state = it->second;
logger.debug("Mark Node {} alive after EchoMessage", addr);
real_mark_alive(addr, state);
}
});
}).finally([this, addr] {
_pending_mark_alive_endpoints.erase(addr);
}).handle_exception([addr] (auto ep) {
logger.warn("Fail to send EchoMessage to {}: {}", addr, ep);
});
}
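The `mark_alive` rework replaces a blocking `.get()` with a continuation chain, and deliberately re-looks up the endpoint state after the echo round-trip because the node may have been removed meanwhile (the use-after-free the comment warns about). A synchronous sketch of that control flow (helper and parameter names are illustrative, not the real API):

```python
def mark_alive(addr, endpoint_state_map, send_echo, pending, log=print):
    # Track the pending echo for this endpoint; after the reply,
    # re-look up the state by address instead of keeping a reference
    # that may dangle if the node was removed while the echo was in flight.
    pending.add(addr)
    try:
        send_echo(addr)
        state = endpoint_state_map.get(addr)
        if state is None:
            log("Node %s is not in endpoint_state_map anymore" % addr)
            return False
        state["alive"] = True  # stands in for real_mark_alive(addr, state)
        return True
    except Exception as e:
        log("Fail to send EchoMessage to %s: %s" % (addr, e))
        return False
    finally:
        # Mirrors the .finally() that erases from _pending_mark_alive_endpoints.
        pending.discard(addr)
```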
// Runs inside seastar::async context
@@ -1300,6 +1315,10 @@ bool gossiper::is_shutdown(const inet_address& endpoint) const {
return get_gossip_status(endpoint) == sstring(versioned_value::SHUTDOWN);
}
bool gossiper::is_normal(const inet_address& endpoint) const {
return get_gossip_status(endpoint) == sstring(versioned_value::STATUS_NORMAL);
}
bool gossiper::is_silent_shutdown_state(const endpoint_state& ep_state) const {
sstring state = get_gossip_status(ep_state);
for (auto& deadstate : SILENT_SHUTDOWN_STATES) {
@@ -1968,6 +1987,8 @@ feature::feature(sstring name, bool enabled)
, _enabled(enabled) {
if (!_enabled) {
get_local_gossiper().register_feature(this);
} else {
_pr.set_value();
}
}
@@ -1980,12 +2001,13 @@ feature::~feature() {
}
}
feature& feature::operator=(feature other) {
feature& feature::operator=(feature&& other) {
if (!_enabled) {
get_local_gossiper().unregister_feature(this);
}
_name = other._name;
_enabled = other._enabled;
_pr = std::move(other._pr);
if (!_enabled) {
get_local_gossiper().register_feature(this);
}
@@ -1996,7 +2018,10 @@ void feature::enable() {
if (engine().cpu_id() == 0) {
logger.info("Feature {} is enabled", name());
}
_enabled = true;
if (!_enabled) {
_enabled = true;
_pr.set_value();
}
}
} // namespace gms


@@ -105,6 +105,7 @@ private:
std::set<inet_address> _seeds_from_config;
sstring _cluster_name;
semaphore _callback_running{1};
semaphore _apply_state_locally_semaphore{1};
public:
future<> timer_callback_lock() { return _callback_running.wait(); }
void timer_callback_unlock() { _callback_running.signal(); }
@@ -404,10 +405,10 @@ public:
*/
int compare_endpoint_startup(inet_address addr1, inet_address addr2);
void notify_failure_detector(std::map<inet_address, endpoint_state> remoteEpStateMap);
void notify_failure_detector(const std::map<inet_address, endpoint_state>& remoteEpStateMap);
void notify_failure_detector(inet_address endpoint, endpoint_state remote_endpoint_state);
void notify_failure_detector(inet_address endpoint, const endpoint_state& remote_endpoint_state);
private:
void mark_alive(inet_address addr, endpoint_state& local_state);
@@ -428,7 +429,7 @@ public:
bool is_alive(inet_address ep);
bool is_dead_state(const endpoint_state& eps) const;
future<> apply_state_locally(const std::map<inet_address, endpoint_state>& map);
future<> apply_state_locally(std::map<inet_address, endpoint_state> map);
private:
void apply_new_states(inet_address addr, endpoint_state& local_state, const endpoint_state& remote_state);
@@ -504,7 +505,9 @@ public:
void dump_endpoint_state_map();
void debug_show();
public:
bool is_seed(const inet_address& endpoint) const;
bool is_shutdown(const inet_address& endpoint) const;
bool is_normal(const inet_address& endpoint) const;
bool is_silent_shutdown_state(const endpoint_state& ep_state) const;
void mark_as_shutdown(const inet_address& endpoint);
void force_newer_generation();


@@ -27,8 +27,9 @@ class schema_mutations {
canonical_mutation columnfamilies_canonical_mutation();
canonical_mutation columns_canonical_mutation();
bool is_view()[[version 1.6]];
std::experimental::optional<canonical_mutation> indices_canonical_mutation()[[version 1.9]];
std::experimental::optional<canonical_mutation> dropped_columns_canonical_mutation()[[version 1.9]];
std::experimental::optional<canonical_mutation> indices_canonical_mutation()[[version 2.0]];
std::experimental::optional<canonical_mutation> dropped_columns_canonical_mutation()[[version 2.0]];
std::experimental::optional<canonical_mutation> scylla_tables_canonical_mutation()[[version 2.0]];
};
class schema stub [[writable]] {


@@ -182,6 +182,9 @@ public:
static TopLevel from_exploded(const schema& s, const std::vector<bytes>& v) {
return from_exploded(v);
}
static TopLevel from_exploded_view(const std::vector<bytes_view>& v) {
return from_exploded(v);
}
// We don't allow optional values, but provide this method as an efficient adaptor
static TopLevel from_optional_exploded(const schema& s, const std::vector<bytes_opt>& v) {

main.cc

@@ -59,6 +59,8 @@ thread_local disk_error_signal_type commit_error;
thread_local disk_error_signal_type general_disk_error;
seastar::metrics::metric_groups app_metrics;
using namespace std::chrono_literals;
namespace bpo = boost::program_options;
static boost::filesystem::path relative_conf_dir(boost::filesystem::path path) {
@@ -277,7 +279,10 @@ int main(int ac, char** av) {
}
runtime::init_uptime();
std::setvbuf(stdout, nullptr, _IOLBF, 1000);
app_template app;
app_template::config app_cfg;
app_cfg.name = "Scylla";
app_cfg.default_task_quota = 500us;
app_template app(std::move(app_cfg));
auto opt_add = app.add_options();
auto cfg = make_lw_shared<db::config>();
@@ -529,12 +534,12 @@ int main(int ac, char** av) {
db::get_batchlog_manager().start(std::ref(qp)).get();
// #293 - do not stop anything
// engine().at_exit([] { return db::get_batchlog_manager().stop(); });
sstables::init_metrics();
sstables::init_metrics().get();
db::system_keyspace::minimal_setup(db, qp);
// schema migration, if needed, is also done on shard 0
db::legacy_schema_migrator::migrate(qp.local()).get();
db::legacy_schema_migrator::migrate(proxy, qp.local()).get();
supervisor::notify("loading sstables");
@@ -625,13 +630,13 @@ int main(int ac, char** av) {
lb->start_broadcasting();
service::get_local_storage_service().set_load_broadcaster(lb);
engine().at_exit([lb = std::move(lb)] () mutable { return lb->stop_broadcasting(); });
supervisor::notify("starting cf cache hit rate calculator");
cf_cache_hitrate_calculator.start(std::ref(db), std::ref(cf_cache_hitrate_calculator)).get();
engine().at_exit([&cf_cache_hitrate_calculator] { return cf_cache_hitrate_calculator.stop(); });
cf_cache_hitrate_calculator.local().run_on(engine().cpu_id());
supervisor::notify("starting native transport");
gms::get_local_gossiper().wait_for_gossip_to_settle();
gms::get_local_gossiper().wait_for_gossip_to_settle().get();
api::set_server_gossip_settle(ctx).get();
supervisor::notify("starting cf cache hit rate calculator");
supervisor::notify("starting native transport");
service::get_local_storage_service().start_native_transport().get();
if (start_thrift) {
service::get_local_storage_service().start_rpc_server().get();


@@ -29,11 +29,13 @@
#include "sstables/sstables.hh"
#include <seastar/core/future.hh>
#include <seastar/core/file.hh>
#include <seastar/core/thread.hh>
future<>
write_memtable_to_sstable(memtable& mt,
sstables::shared_sstable sst,
bool backup = false,
const io_priority_class& pc = default_priority_class(),
bool leave_unsealed = false);
bool leave_unsealed = false,
seastar::thread_scheduling_group* tsg = nullptr);


@@ -835,7 +835,7 @@ future<> messaging_service::send_definitions_update(msg_addr id, std::vector<fro
return send_message_oneway(this, messaging_verb::DEFINITIONS_UPDATE, std::move(id), std::move(fm));
}
void messaging_service::register_migration_request(std::function<future<std::vector<frozen_mutation>> ()>&& func) {
void messaging_service::register_migration_request(std::function<future<std::vector<frozen_mutation>> (const rpc::client_info&)>&& func) {
register_handler(this, netw::messaging_verb::MIGRATION_REQUEST, std::move(func));
}
void messaging_service::unregister_migration_request() {


@@ -288,7 +288,7 @@ public:
future<> send_definitions_update(msg_addr id, std::vector<frozen_mutation> fm);
// Wrapper for MIGRATION_REQUEST
void register_migration_request(std::function<future<std::vector<frozen_mutation>> ()>&& func);
void register_migration_request(std::function<future<std::vector<frozen_mutation>> (const rpc::client_info&)>&& func);
void unregister_migration_request();
future<std::vector<frozen_mutation>> send_migration_request(msg_addr id);


@@ -932,15 +932,6 @@ rows_entry::equal(const schema& s, const rows_entry& other) const {
return equal(s, other, s);
}
position_in_partition_view rows_entry::position() const {
if (_flags._last) {
return position_in_partition_view::after_all_clustered_rows();
} else {
return position_in_partition_view(
position_in_partition_view::clustering_row_tag_t(), _key);
}
}
bool
rows_entry::equal(const schema& s, const rows_entry& other, const schema& other_schema) const {
position_in_partition::equal_compare eq(s);
@@ -2119,7 +2110,7 @@ public:
mutation_partition::mutation_partition(mutation_partition::incomplete_tag, const schema& s, tombstone t)
: _tombstone(t)
, _static_row_continuous(false)
, _static_row_continuous(!s.has_static_columns())
, _rows()
, _row_tombstones(s)
{


@@ -712,7 +712,15 @@ public:
const deletable_row& row() const {
return _row;
}
position_in_partition_view position() const;
position_in_partition_view position() const {
if (_flags._last) {
return position_in_partition_view::after_all_clustered_rows();
} else {
return position_in_partition_view(
position_in_partition_view::clustering_row_tag_t(), _key);
}
}
is_continuous continuous() const { return is_continuous(_flags._continuous); }
void set_continuous(bool value) { _flags._continuous = value; }
void set_continuous(is_continuous value) { set_continuous(bool(value)); }


@@ -62,8 +62,14 @@ auto write_counter_cell(Writer&& writer, atomic_cell_view c)
counter_cell_view ccv(c);
auto shards = std::move(value).start_value_counter_cell_full()
.start_shards();
for (auto csv : ccv.shards()) {
shards.add_shards(counter_shard(csv));
if (service::get_local_storage_service().cluster_supports_correct_counter_order()) {
for (auto csv : ccv.shards()) {
shards.add_shards(counter_shard(csv));
}
} else {
for (auto& cs : ccv.shards_compatible_with_1_7_4()) {
shards.add_shards(cs);
}
}
return std::move(shards).end_shards().end_counter_cell_full();
}


@@ -73,8 +73,9 @@ atomic_cell read_atomic_cell(atomic_cell_variant cv)
// TODO: a lot of copying for something called view
counter_cell_builder ccb; // we know the final number of shards
for (auto csv : ccv.shards()) {
ccb.add_shard(counter_shard(csv));
ccb.add_maybe_unsorted_shard(counter_shard(csv));
}
ccb.sort_and_remove_duplicates();
return ccb.build(_created_at);
}
atomic_cell operator()(ser::counter_cell_update_view& ccv) const {

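The reader side above now adds shards unsorted and calls `sort_and_remove_duplicates()` before building the cell. A sketch of that normalization, assuming (an assumption, not confirmed by the diff) that duplicate shard ids are resolved in favor of the entry with the highest logical clock; shards are modeled as `(id, clock, value)` tuples:

```python
def sort_and_remove_duplicates(shards):
    # Order shards by (id, clock); for duplicate ids, the later sort
    # position has the higher clock, so it overwrites the earlier one.
    shards = sorted(shards, key=lambda s: (s[0], s[1]))
    out = []
    for s in shards:
        if out and out[-1][0] == s[0]:
            out[-1] = s  # keep the shard with the highest clock
        else:
            out.append(s)
    return out
```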

@@ -105,7 +105,7 @@ partition_slice_builder::with_regular_column(bytes name) {
throw std::runtime_error(sprint("No such column: %s", _schema.regular_column_name_type()->to_string(name)));
}
if (!def->is_regular()) {
throw std::runtime_error(sprint("Column is not regular: %s", _schema.regular_column_name_type()->to_string(name)));
throw std::runtime_error(sprint("Column is not regular: %s", _schema.column_name_type(*def)->to_string(name)));
}
_regular_columns->push_back(def->id);
return *this;


@@ -41,9 +41,17 @@ inline void maybe_merge_versions(lw_shared_ptr<partition_snapshot>& snp,
with_allocator(lsa_region.allocator(), [&snp, &lsa_region, &read_section] {
return with_linearized_managed_bytes([&snp, &lsa_region, &read_section] {
try {
read_section(lsa_region, [&snp] {
snp->merge_partition_versions();
});
// Allocating sections require the region to be reclaimable
// which means that they cannot be nested.
// It is, however, possible that if the snapshot is taken
// inside an allocating section and an exception is then thrown,
// this function will be called to clean up even though we
// are still in the context of the allocating section.
if (lsa_region.reclaiming_enabled()) {
read_section(lsa_region, [&snp] {
snp->merge_partition_versions();
});
}
} catch (...) { }
snp = {};
});


@@ -34,6 +34,8 @@
// When the cursor is invalidated, it still maintains its previous position. It can be brought
// back to validity by calling maybe_refresh(), or advance_to().
//
// Insertion of row entries after cursor's position invalidates the cursor.
//
class partition_snapshot_row_cursor final {
struct position_in_version {
mutation_partition::rows_type::iterator it;
@@ -55,6 +57,7 @@ class partition_snapshot_row_cursor final {
logalloc::region& _region;
partition_snapshot& _snp;
std::vector<position_in_version> _heap;
std::vector<mutation_partition::rows_type::iterator> _iterators;
std::vector<position_in_version> _current_row;
position_in_partition _position;
uint64_t _last_reclaim_count = 0;
@@ -78,13 +81,16 @@ public:
, _snp(snp)
, _position(position_in_partition::static_row_tag_t{})
{ }
bool has_up_to_date_row_from_latest_version() const {
return up_to_date() && _current_row[0].version_no == 0;
bool has_valid_row_from_latest_version() const {
return iterators_valid() && _current_row[0].version_no == 0;
}
mutation_partition::rows_type::iterator get_iterator_in_latest_version() const {
return _current_row[0].it;
return _iterators[0];
}
bool up_to_date() const {
// Returns true iff the iterators obtained since the cursor was last made valid
// are still valid. Note that this doesn't mean that the cursor itself is valid.
bool iterators_valid() const {
return _region.reclaim_counter() == _last_reclaim_count && _last_versions_count == _snp.version_count();
}
@@ -97,9 +103,40 @@ public:
//
// but avoids work if not necessary.
bool maybe_refresh() {
if (!up_to_date()) {
if (!iterators_valid()) {
return advance_to(_position);
}
// Refresh latest version's iterator in case there was an insertion
// before it and after cursor's position. There cannot be any
// insertions for non-latest versions, so we don't have to update them.
if (_current_row[0].version_no != 0) {
rows_entry::compare less(_schema);
position_in_partition::equal_compare eq(_schema);
position_in_version::less_compare heap_less(_schema);
auto& rows = _snp.version()->partition().clustered_rows();
auto it = _iterators[0] = rows.lower_bound(_position, less);
auto heap_i = boost::find_if(_heap, [](auto&& v) { return v.version_no == 0; });
if (it == rows.end()) {
if (heap_i != _heap.end()) {
_heap.erase(heap_i);
boost::range::make_heap(_heap, heap_less);
}
} else if (eq(_position, it->position())) {
_current_row.insert(_current_row.begin(), position_in_version{it, rows.end(), 0});
if (heap_i != _heap.end()) {
_heap.erase(heap_i);
boost::range::make_heap(_heap, heap_less);
}
} else {
if (heap_i != _heap.end()) {
heap_i->it = it;
boost::range::make_heap(_heap, heap_less);
} else {
_heap.push_back({it, rows.end(), 0});
boost::range::push_heap(_heap, heap_less);
}
}
}
return true;
}
@@ -119,11 +156,13 @@ public:
position_in_version::less_compare heap_less(_schema);
_heap.clear();
_current_row.clear();
_iterators.clear();
int version_no = 0;
for (auto&& v : _snp.versions()) {
auto& rows = v.partition().clustered_rows();
auto pos = rows.lower_bound(lower_bound, less);
auto end = rows.end();
_iterators.push_back(pos);
if (pos != end) {
_heap.push_back({pos, end, version_no});
}
@@ -142,9 +181,10 @@ public:
// Can be only called on a valid cursor pointing at a row.
bool next() {
position_in_version::less_compare heap_less(_schema);
assert(up_to_date());
assert(iterators_valid());
for (auto&& curr : _current_row) {
++curr.it;
_iterators[curr.version_no] = curr.it;
if (curr.it != curr.end) {
_heap.push_back(curr);
boost::range::push_heap(_heap, heap_less);
@@ -168,12 +208,14 @@ public:
const clustering_key& key() const { return _current_row[0].it->key(); }
// Can be called only when cursor is valid and pointing at a row.
clustering_row row() const {
clustering_row result(key());
for (auto&& v : _current_row) {
result.apply(_schema, *v.it);
mutation_fragment row() const {
auto it = _current_row.begin();
auto mf = mutation_fragment(clustering_row(*it->it));
auto& cr = mf.as_mutable_clustering_row();
for (++it; it != _current_row.end(); ++it) {
cr.apply(_schema, *it->it);
}
return result;
return mf;
}
// Can be called when cursor is pointing at a row, even when invalid.
@@ -184,6 +226,32 @@ public:
bool is_in_latest_version() const;
bool previous_row_in_latest_version_has_key(const clustering_key_prefix& key) const;
void set_continuous(bool val);
friend std::ostream& operator<<(std::ostream& out, const partition_snapshot_row_cursor& cur) {
out << "{cursor: position=" << cur._position << ", ";
if (!cur.iterators_valid()) {
return out << " iterators invalid}";
}
out << "current=[";
bool first = true;
for (auto&& v : cur._current_row) {
if (!first) {
out << ", ";
}
first = false;
out << v.version_no;
}
out << "], heap=[";
first = true;
for (auto&& v : cur._heap) {
if (!first) {
out << ", ";
}
first = false;
out << "{v=" << v.version_no << ", pos=" << v.it->position() << "}";
}
return out << "]}";
};
};
inline
@@ -198,8 +266,8 @@ bool partition_snapshot_row_cursor::previous_row_in_latest_version_has_key(const
}
auto prev_it = _current_row[0].it;
--prev_it;
clustering_key_prefix::tri_compare tri_comp(_schema);
return tri_comp(prev_it->key(), key) == 0;
clustering_key_prefix::equality eq(_schema);
return eq(prev_it->key(), key);
}
inline


@@ -352,10 +352,10 @@ public:
return *this;
}
}
template<typename Transformer, typename U = typename std::result_of<Transformer(T)>::type>
static stdx::optional<typename wrapping_range<U>::bound> transform_bound(optional<bound> b, Transformer&& transformer) {
template<typename Bound, typename Transformer, typename U = typename std::result_of<Transformer(T)>::type>
static stdx::optional<typename wrapping_range<U>::bound> transform_bound(Bound&& b, Transformer&& transformer) {
if (b) {
return { { transformer(std::move(*b).value()), b->is_inclusive() } };
return { { transformer(std::forward<Bound>(b).value().value()), b->is_inclusive() } };
};
return {};
}
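The hunk above generalizes `transform_bound` to take the optional bound by forwarding reference, so an rvalue bound can be moved from instead of copied. A self-contained sketch of the same pattern with `std::optional`; the `bound` type here is a made-up stand-in, not `wrapping_range`:

```cpp
#include <cassert>
#include <optional>
#include <string>
#include <utility>

// Hypothetical range bound: a value plus an inclusivity flag.
struct bound {
    std::string value;
    bool inclusive;
};

// Taking the optional by forwarding reference lets rvalue bounds be moved
// from, while lvalue bounds are left intact.
template <typename Bound, typename Transformer>
std::optional<bound> transform_bound(Bound&& b, Transformer&& t) {
    if (b) {
        // For rvalue b, value() yields an rvalue, so t may move from it;
        // the inclusivity flag is a bool and is read independently.
        return bound{t(std::forward<Bound>(b).value().value), b->inclusive};
    }
    return std::nullopt;
}
```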


@@ -71,7 +71,11 @@ public:
_range = std::move(*new_range);
_last_key = {};
}
if (_reader) {
++_cache._tracker._stats.underlying_recreations;
}
auto& snap = _cache.snapshot_for_phase(phase);
_reader = {}; // See issue #2644
_reader = _cache.create_underlying_reader(_read_context, snap, _range);
_reader_creation_phase = phase;
}
@@ -90,8 +94,14 @@ public:
_range = std::move(range);
_last_key = { };
_new_last_key = { };
if (_reader && _reader_creation_phase == phase) {
return _reader->fast_forward_to(_range);
if (_reader) {
if (_reader_creation_phase == phase) {
++_cache._tracker._stats.underlying_partition_skips;
return _reader->fast_forward_to(_range);
} else {
++_cache._tracker._stats.underlying_recreations;
_reader = {}; // See issue #2644
}
}
_reader = _cache.create_underlying_reader(_read_context, snapshot, _range);
_reader_creation_phase = phase;
@@ -121,6 +131,7 @@ class read_context final : public enable_lw_shared_from_this<read_context> {
mutation_reader::forwarding _fwd_mr;
bool _range_query;
autoupdating_underlying_reader _underlying;
uint64_t _underlying_created = 0;
// When reader enters a partition, it must be set up for reading that
// partition from the underlying mutation source (_sm) in one of two ways:
@@ -155,7 +166,18 @@ public:
, _fwd_mr(fwd_mr)
, _range_query(!range.is_singular() || !range.start()->value().has_key())
, _underlying(_cache, *this)
{ }
{
++_cache._tracker._stats.reads;
}
~read_context() {
++_cache._tracker._stats.reads_done;
if (_underlying_created) {
_cache._stats.reads_with_misses.mark();
++_cache._tracker._stats.reads_with_misses;
} else {
_cache._stats.reads_with_no_misses.mark();
}
}
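The constructor/destructor pair above counts started and finished reads, and the number of active reads is their difference. A minimal model of that accounting, with simplified stand-in types rather than the real `read_context`:

```cpp
#include <cassert>

// Started reads minus finished reads gives the currently active count.
struct stats {
    unsigned long reads = 0;
    unsigned long reads_done = 0;
    unsigned long active_reads() const { return reads - reads_done; }
};

// RAII accounting: construction marks a read as started, destruction as done.
struct read_context {
    stats& s;
    explicit read_context(stats& st) : s(st) { ++s.reads; }
    ~read_context() { ++s.reads_done; }
};
```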
read_context(const read_context&) = delete;
row_cache& cache() { return _cache; }
const schema_ptr& schema() const { return _schema; }
@@ -169,6 +191,7 @@ public:
autoupdating_underlying_reader& underlying() { return _underlying; }
row_cache::phase_type phase() const { return _phase; }
const dht::decorated_key& key() const { return _sm->decorated_key(); }
void on_underlying_created() { ++_underlying_created; }
private:
future<> create_sm();
future<> ensure_sm_created() {
@@ -198,9 +221,17 @@ public:
// Fast forwards the underlying streamed_mutation to given range.
future<> fast_forward_to(position_range range) {
return ensure_sm_created().then([this, range = std::move(range)] () mutable {
++_cache._tracker._stats.underlying_row_skips;
return _sm->fast_forward_to(std::move(range));
});
}
// Returns the underlying streamed_mutation.
// The caller has to ensure that the streamed mutation was already created
// (e.g. the most recent call to enter_partition(const dht::decorated_key&, ...)
// was followed by a call to fast_forward_to()).
streamed_mutation& get_streamed_mutation() noexcept {
return *_sm;
}
// Gets the next fragment from the underlying streamed_mutation
future<mutation_fragment_opt> get_next_fragment() {
return ensure_sm_created().then([this] {


@@ -41,11 +41,6 @@
static logging::logger rlogger("repair");
struct failed_range {
sstring cf;
::dht::token_range range;
};
class repair_info {
public:
seastar::sharded<database>& db;
@@ -56,7 +51,7 @@ public:
shard_id shard;
std::vector<sstring> data_centers;
std::vector<sstring> hosts;
std::vector<failed_range> failed_ranges;
size_t nr_failed_ranges = 0;
// Map of peer -> <cf, ranges>
std::unordered_map<gms::inet_address, std::unordered_map<sstring, dht::token_range_vector>> ranges_need_repair_in;
std::unordered_map<gms::inet_address, std::unordered_map<sstring, dht::token_range_vector>> ranges_need_repair_out;
@@ -132,14 +127,11 @@ public:
});
}
void check_failed_ranges() {
if (failed_ranges.empty()) {
rlogger.info("repair {} on shard {} completed successfully", id, shard);
if (nr_failed_ranges) {
rlogger.info("repair {} on shard {} failed - {} ranges failed", id, shard, nr_failed_ranges);
throw std::runtime_error(sprint("repair %d on shard %d failed to do checksum for %d sub ranges", id, shard, nr_failed_ranges));
} else {
rlogger.info("repair {} on shard {} failed - {} ranges failed", id, shard, failed_ranges.size());
for (auto& frange: failed_ranges) {
rlogger.info("repair cf {} range {} failed", frange.cf, frange.range);
}
throw std::runtime_error(sprint("repair %d on shard %d failed to do checksum for %d sub ranges", id, shard, failed_ranges.size()));
rlogger.info("repair {} on shard {} completed successfully", id, shard);
}
}
future<> request_transfer_ranges(const sstring& cf,
@@ -504,6 +496,19 @@ static future<partition_checksum> checksum_range_shard(database &db,
});
}
// It is counter-productive to allow a large number of range checksum
// operations to proceed in parallel (on the same shard), because the read
// operation can already parallelize itself as much as needed, and doing
// multiple reads in parallel just adds a lot of memory overheads.
// So checksum_parallelism_semaphore is used to limit this parallelism,
// and should be set to 1, or another small number.
//
// Note that checksum_parallelism_semaphore applies not just in the
// repair master, but also in the slave: The repair slave may receive many
// checksum requests in parallel, but will only work on one or a few
// (checksum_parallelism_semaphore) at once.
static thread_local semaphore checksum_parallelism_semaphore(2);
// Calculate the checksum of the data held on all shards of a column family,
// in the given token range.
// In practice, we only need to consider one or two shards which intersect the
@@ -526,7 +531,9 @@ future<partition_checksum> checksum_range(seastar::sharded<database> &db,
auto& prs = shard_range.second;
return db.invoke_on(shard, [keyspace, cf, prs = std::move(prs), hash_version] (database& db) mutable {
return do_with(std::move(keyspace), std::move(cf), std::move(prs), [&db, hash_version] (auto& keyspace, auto& cf, auto& prs) {
return checksum_range_shard(db, keyspace, cf, prs, hash_version);
return seastar::with_semaphore(checksum_parallelism_semaphore, 1, [&db, hash_version, &keyspace, &cf, &prs] {
return checksum_range_shard(db, keyspace, cf, prs, hash_version);
});
});
}).then([&result] (partition_checksum sum) {
result.add(sum);
@@ -537,14 +544,15 @@ future<partition_checksum> checksum_range(seastar::sharded<database> &db,
});
}
// We don't need to wait for one checksum to finish before we start the
// next, but doing too many of these operations in parallel also doesn't
// make sense, so we limit the number of concurrent ongoing checksum
// requests with a semaphore.
//
// FIXME: We shouldn't use a magic number here, but rather bind it to
// some resource. Otherwise we'll be doing too little in some machines,
// and too much in others.
// parallelism_semaphore limits the number of parallel ongoing checksum
// comparisons. This could mean, for example, that this number of checksum
// requests have been sent to other nodes and we are waiting for them to
// return so we can compare those to our own checksums. This limit can be
// set fairly high because the outstanding comparisons take only a few
// resources. In particular, we do NOT do this number of file reads in
// parallel because file reads have large memory overheads (read buffers,
// partitions, etc.) - the number of concurrent reads is further limited
// by an additional semaphore checksum_parallelism_semaphore (see above).
//
// FIXME: This would be better off in a repair service, or even a per-shard
// repair instance holding all repair state. However, since we are anyway
@@ -576,7 +584,6 @@ static future<uint64_t> estimate_partitions(seastar::sharded<database>& db, cons
static future<> repair_cf_range(repair_info& ri,
sstring cf, ::dht::token_range range,
const std::vector<gms::inet_address>& neighbors) {
ri.ranges_index++;
if (neighbors.empty()) {
// Nothing to do in this case...
return make_ready_future<>();
@@ -584,8 +591,6 @@ static future<> repair_cf_range(repair_info& ri,
return estimate_partitions(ri.db, ri.keyspace, cf, range).then([&ri, cf, range, &neighbors] (uint64_t estimated_partitions) {
range_splitter ranges(range, estimated_partitions, ri.target_partitions);
rlogger.info("Repair {} out of {} ranges, id={}, shard={}, keyspace={}, cf={}, range={}, target_partitions={}, estimated_partitions={}",
ri.ranges_index, ri.ranges.size(), ri.id, ri.shard, ri.keyspace, cf, range, ri.target_partitions, estimated_partitions);
return do_with(seastar::gate(), true, std::move(cf), std::move(ranges),
[&ri, &neighbors] (auto& completion, auto& success, const auto& cf, auto& ranges) {
return do_until([&ranges] () { return !ranges.has_next(); },
@@ -626,7 +631,7 @@ static future<> repair_cf_range(repair_info& ri,
utils::fb_utilities::get_broadcast_address()),
checksums[i].get_exception());
success = false;
ri.failed_ranges.push_back(failed_range{cf, range});
ri.nr_failed_ranges++;
// Do not break out of the loop here, so we can log
// (and discard) all the exceptions.
} else if (i > 0) {
@@ -751,7 +756,7 @@ static future<> repair_cf_range(repair_info& ri,
// any case, we need to remember that the repair failed to
// tell the caller.
success = false;
ri.failed_ranges.push_back(failed_range{cf, range});
ri.nr_failed_ranges++;
rlogger.warn("Failed sync of range {}: {}", range, eptr);
}).finally([&completion] {
parallelism_semaphore.signal(1);
@@ -997,8 +1002,22 @@ static future<> repair_ranges(repair_info ri) {
// repair all the ranges in sequence
return do_for_each(ri.ranges, [&ri] (auto&& range) {
#endif
check_in_shutdown();
return repair_range(ri, range);
ri.ranges_index++;
rlogger.info("Repair {} out of {} ranges, id={}, shard={}, keyspace={}, table={}, range={}",
ri.ranges_index, ri.ranges.size(), ri.id, ri.shard, ri.keyspace, ri.cfs, range);
return do_with(dht::selective_token_range_sharder(range, ri.shard), [&ri] (auto& sharder) {
return repeat([&ri, &sharder] () {
check_in_shutdown();
auto range_shard = sharder.next();
if (range_shard) {
return repair_range(ri, *range_shard).then([] {
return make_ready_future<stop_iteration>(stop_iteration::no);
});
} else {
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
});
});
}).then([&ri] {
// Do streaming for the remaining ranges we do not stream in
// repair_cf_range
@@ -1013,27 +1032,6 @@ static future<> repair_ranges(repair_info ri) {
});
}
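The new loop above has every shard walk the full range list and keep only the sub-ranges it owns, instead of pre-splitting and round-robin assigning ranges to shards. A toy model of that per-shard filtering; the fixed-size split and modulo ownership rule are assumptions for the example, since Scylla derives ownership from the partitioner via `dht::selective_token_range_sharder`:

```cpp
#include <algorithm>
#include <cassert>
#include <utility>
#include <vector>

// Walk a [begin, end) token range in fixed-size sub-ranges and yield only
// those owned by `shard` under a simple modulo ownership rule.
std::vector<std::pair<int, int>>
subranges_for_shard(int begin, int end, int step, int shard, int smp_count) {
    std::vector<std::pair<int, int>> out;
    int index = 0;
    for (int lo = begin; lo < end; lo += step, ++index) {
        if (index % smp_count == shard) {
            out.push_back({lo, std::min(lo + step, end)});
        }
    }
    return out;
}
```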
static void split_and_add(std::vector<::dht::token_range>& ranges,
const dht::token_range& range) {
// The use of minimum_token() here twice is not a typo - because wrap-
// around token ranges are supported by midpoint(), the beyond-maximum
// token can also be represented by minimum_token().
auto midpoint = dht::global_partitioner().midpoint(
range.start() ? range.start()->value() : dht::minimum_token(),
range.end() ? range.end()->value() : dht::minimum_token());
// This shouldn't happen, but if the range included just one token, we
// can't split further (split() may actually fail with assertion failure)
if ((range.start() && midpoint == range.start()->value()) ||
(range.end() && midpoint == range.end()->value())) {
ranges.push_back(range);
return;
}
auto halves = range.split(midpoint, dht::token_comparator());
ranges.push_back(halves.first);
ranges.push_back(halves.second);
}
// repair_start() can run on any cpu; It runs on cpu0 the function
// do_repair_start(). The benefit of always running that function on the same
// CPU is that it allows us to keep some state (like a list of ongoing
@@ -1053,6 +1051,10 @@ static int do_repair_start(seastar::sharded<database>& db, sstring keyspace,
rlogger.info("starting user-requested repair for keyspace {}, repair id {}, options {}", keyspace, id, options_map);
repair_tracker.start(id);
if (!gms::get_local_gossiper().is_normal(utils::fb_utilities::get_broadcast_address())) {
throw std::runtime_error("Node is not in NORMAL status yet!");
}
// If the "ranges" option is not explicitly specified, we repair all the
// local ranges (the token ranges for which this node holds a replica).
// Each of these ranges may have a different set of replicas, so the
@@ -1125,35 +1127,12 @@ static int do_repair_start(seastar::sharded<database>& db, sstring keyspace,
cfs = list_column_families(db.local(), keyspace);
}
// Split the ranges so that we have more number of ranges than smp::count
// Note, the split is not guaranteed when the range cannot be split any more.
dht::token_range_vector tosplit;
while (ranges.size() < smp::count) {
size_t sz = ranges.size();
tosplit.clear();
ranges.swap(tosplit);
for (const auto& range : tosplit) {
split_and_add(ranges, range);
}
if (sz == ranges.size()) {
// We can not split the ranges anymore
break;
}
}
std::map<shard_id, dht::token_range_vector> shard_ranges_map;
unsigned idx = 0;
for (auto& range : ranges) {
shard_ranges_map[idx++ % smp::count].push_back(std::move(range));
}
std::vector<future<>> repair_results;
repair_results.reserve(shard_ranges_map.size());
repair_results.reserve(smp::count);
for (auto& x : shard_ranges_map) {
shard_id shard = x.first;
auto& ranges = x.second;
auto f = db.invoke_on(shard, [keyspace, cfs, id, ranges = std::move(ranges),
for (auto shard : boost::irange(unsigned(0), smp::count)) {
auto f = db.invoke_on(shard, [keyspace, cfs, id, ranges,
data_centers = options.data_centers, hosts = options.hosts] (database& localdb) mutable {
return repair_ranges(repair_info(service::get_local_storage_service().db(),
std::move(keyspace), std::move(ranges), std::move(cfs),


@@ -46,6 +46,7 @@ thread_local seastar::thread_scheduling_group row_cache::_update_thread_scheduli
mutation_reader
row_cache::create_underlying_reader(read_context& ctx, mutation_source& src, const dht::partition_range& pr) {
ctx.on_underlying_created();
return src(_schema, pr, ctx.slice(), ctx.pc(), ctx.trace_state(), streamed_mutation::forwarding::yes);
}
@@ -74,7 +75,7 @@ cache_tracker::cache_tracker() {
}
evict_last(_lru);
--_stats.partitions;
++_stats.evictions;
++_stats.partition_evictions;
++_stats.modification_count;
return memory::reclaiming_result::reclaimed_something;
} catch (std::bad_alloc&) {
@@ -98,15 +99,24 @@ cache_tracker::setup_metrics() {
_metrics.add_group("cache", {
sm::make_gauge("bytes_used", sm::description("current bytes used by the cache out of the total size of memory"), [this] { return _region.occupancy().used_space(); }),
sm::make_gauge("bytes_total", sm::description("total size of memory for the cache"), [this] { return _region.occupancy().total_space(); }),
sm::make_derive("total_operations_hits", sm::description("total number of operation hits"), _stats.hits),
sm::make_derive("total_operations_misses", sm::description("total number of operation misses"), _stats.misses),
sm::make_derive("total_operations_insertions", sm::description("total number of operation insert"), _stats.insertions),
sm::make_derive("total_operations_concurrent_misses_same_key", sm::description("total number of operation with misses same key"), _stats.concurrent_misses_same_key),
sm::make_derive("total_operations_merges", sm::description("total number of operation merged"), _stats.merges),
sm::make_derive("total_operations_evictions", sm::description("total number of operation eviction"), _stats.evictions),
sm::make_derive("total_operations_removals", sm::description("total number of operation removals"), _stats.removals),
sm::make_derive("total_operations_mispopulations", sm::description("number of entries not inserted by reads"), _stats.mispopulations),
sm::make_gauge("objects_partitions", sm::description("total number of partition objects"), _stats.partitions)
sm::make_derive("partition_hits", sm::description("number of partitions needed by reads and found in cache"), _stats.partition_hits),
sm::make_derive("partition_misses", sm::description("number of partitions needed by reads and missing in cache"), _stats.partition_misses),
sm::make_derive("partition_insertions", sm::description("total number of partitions added to cache"), _stats.partition_insertions),
sm::make_derive("row_hits", sm::description("total number of rows needed by reads and found in cache"), _stats.row_hits),
sm::make_derive("row_misses", sm::description("total number of rows needed by reads and missing in cache"), _stats.row_misses),
sm::make_derive("row_insertions", sm::description("total number of rows added to cache"), _stats.row_insertions),
sm::make_derive("concurrent_misses_same_key", sm::description("total number of operations with misses on the same key"), _stats.concurrent_misses_same_key),
sm::make_derive("partition_merges", sm::description("total number of partitions merged"), _stats.partition_merges),
sm::make_derive("partition_evictions", sm::description("total number of evicted partitions"), _stats.partition_evictions),
sm::make_derive("partition_removals", sm::description("total number of invalidated partitions"), _stats.partition_removals),
sm::make_derive("mispopulations", sm::description("number of entries not inserted by reads"), _stats.mispopulations),
sm::make_gauge("partitions", sm::description("total number of cached partitions"), _stats.partitions),
sm::make_derive("reads", sm::description("number of started reads"), _stats.reads),
sm::make_derive("reads_with_misses", sm::description("number of reads which had to read from sstables"), _stats.reads_with_misses),
sm::make_gauge("active_reads", sm::description("number of currently active reads"), [this] { return _stats.active_reads(); }),
sm::make_derive("sstable_reader_recreations", sm::description("number of times sstable reader was recreated due to memtable flush"), _stats.underlying_recreations),
sm::make_derive("sstable_partition_skips", sm::description("number of times sstable reader was fast forwarded across partitions"), _stats.underlying_partition_skips),
sm::make_derive("sstable_row_skips", sm::description("number of times sstable reader was fast forwarded within a partition"), _stats.underlying_row_skips),
});
}
@@ -127,7 +137,7 @@ void cache_tracker::clear() {
};
clear(_lru);
});
_stats.removals += _stats.partitions;
_stats.partition_removals += _stats.partitions;
_stats.partitions = 0;
++_stats.modification_count;
}
@@ -141,7 +151,7 @@ void cache_tracker::touch(cache_entry& e) {
}
void cache_tracker::insert(cache_entry& entry) {
++_stats.insertions;
++_stats.partition_insertions;
++_stats.partitions;
++_stats.modification_count;
_lru.push_front(entry);
@@ -149,20 +159,28 @@ void cache_tracker::insert(cache_entry& entry) {
void cache_tracker::on_erase() {
--_stats.partitions;
++_stats.removals;
++_stats.partition_removals;
++_stats.modification_count;
}
void cache_tracker::on_merge() {
++_stats.merges;
++_stats.partition_merges;
}
void cache_tracker::on_hit() {
++_stats.hits;
void cache_tracker::on_partition_hit() {
++_stats.partition_hits;
}
void cache_tracker::on_miss() {
++_stats.misses;
void cache_tracker::on_partition_miss() {
++_stats.partition_misses;
}
void cache_tracker::on_row_hit() {
++_stats.row_hits;
}
void cache_tracker::on_row_miss() {
++_stats.row_misses;
}
void cache_tracker::on_mispopulate() {
@@ -348,14 +366,30 @@ void cache_tracker::clear_continuity(cache_entry& ce) {
ce.set_continuous(false);
}
void row_cache::on_hit() {
_stats.hits.mark();
_tracker.on_hit();
void row_cache::on_partition_hit() {
_tracker.on_partition_hit();
}
void row_cache::on_miss() {
void row_cache::on_partition_miss() {
_tracker.on_partition_miss();
}
void row_cache::on_row_hit() {
_stats.hits.mark();
_tracker.on_row_hit();
}
void row_cache::on_mispopulate() {
_tracker.on_mispopulate();
}
void row_cache::on_row_miss() {
_stats.misses.mark();
_tracker.on_miss();
_tracker.on_row_miss();
}
void row_cache::on_row_insert() {
++_tracker._stats.row_insertions;
}
class range_populating_reader {
@@ -369,6 +403,7 @@ private:
}
void handle_end_of_stream() {
if (!can_set_continuity()) {
_cache.on_mispopulate();
return;
}
if (!_reader.range().end() || !_reader.range().end()->is_inclusive()) {
@@ -379,11 +414,15 @@ private:
if (it == _cache._partitions.begin()) {
if (!_last_key->_key) {
it->set_continuous(true);
} else {
_cache.on_mispopulate();
}
} else {
auto prev = std::prev(it);
if (prev->key().equal(*_cache._schema, *_last_key->_key)) {
it->set_continuous(true);
} else {
_cache.on_mispopulate();
}
}
}
@@ -403,17 +442,17 @@ public:
handle_end_of_stream();
return std::move(smopt);
}
_cache.on_miss();
_cache.on_partition_miss();
if (_reader.creation_phase() == _cache.phase_of(smopt->decorated_key())) {
return _cache._read_section(_cache._tracker.region(), [&] {
cache_entry& e = _cache.find_or_create(smopt->decorated_key(), smopt->partition_tombstone(), _reader.creation_phase(),
can_set_continuity() ? &*_last_key : nullptr);
_last_key = smopt->decorated_key();
_last_key = row_cache::previous_entry_pointer(smopt->decorated_key());
return e.read(_cache, _read_context, std::move(*smopt), _reader.creation_phase());
});
} else {
_cache._tracker.on_mispopulate();
_last_key = smopt->decorated_key();
_last_key = row_cache::previous_entry_pointer(smopt->decorated_key());
return read_directly_from_underlying(std::move(*smopt), _read_context);
}
}
@@ -424,7 +463,7 @@ public:
if (!pr.start()) {
_last_key = row_cache::previous_entry_pointer();
} else if (!pr.start()->is_inclusive() && pr.start()->value().has_key()) {
_last_key = pr.start()->value().as_decorated_key();
_last_key = row_cache::previous_entry_pointer(pr.start()->value().as_decorated_key());
} else {
// Inclusive start bound, cannot set continuity flag.
_last_key = {};
@@ -448,7 +487,7 @@ private:
streamed_mutation read_from_entry(cache_entry& ce) {
_cache.upgrade_entry(ce);
_cache._tracker.touch(ce);
_cache.on_hit();
_cache.on_partition_hit();
return ce.read(_cache, *_read_context);
}
@@ -469,7 +508,7 @@ private:
}
cache_entry& e = _primary.entry();
auto sm = read_from_entry(e);
_lower_bound = {e.key(), false};
_lower_bound = dht::partition_range::bound{e.key(), false};
// Delay the call to next() so that we don't see stale continuity on next invocation.
_advance_primary = true;
return streamed_mutation_opt(std::move(sm));
@@ -478,7 +517,7 @@ private:
cache_entry& e = _primary.entry();
_secondary_range = dht::partition_range(_lower_bound ? std::move(_lower_bound) : _pr->start(),
dht::partition_range::bound{e.key(), false});
_lower_bound = {e.key(), true};
_lower_bound = dht::partition_range::bound{e.key(), true};
_secondary_in_progress = true;
return stdx::nullopt;
} else {
@@ -487,7 +526,7 @@ private:
if (!range) {
return stdx::nullopt;
}
_lower_bound = {dht::ring_position::max()};
_lower_bound = dht::partition_range::bound{dht::ring_position::max()};
_secondary_range = std::move(*range);
_secondary_in_progress = true;
return stdx::nullopt;
@@ -570,10 +609,10 @@ row_cache::make_reader(schema_ptr s,
cache_entry& e = *i;
_tracker.touch(e);
upgrade_entry(e);
on_hit();
on_partition_hit();
return make_reader_returning(e.read(*this, *ctx));
} else {
on_miss();
on_partition_miss();
return make_mutation_reader<single_partition_populating_reader>(*this, std::move(ctx));
}
});
@@ -629,6 +668,8 @@ cache_entry& row_cache::do_find_or_create_entry(const dht::decorated_key& key,
|| (previous->_key && i != _partitions.begin()
&& std::prev(i)->key().equal(*_schema, *previous->_key))) {
i->set_continuous(true);
} else {
on_mispopulate();
}
return *i;
@@ -642,6 +683,7 @@ cache_entry& row_cache::find_or_create(const dht::decorated_key& key, tombstone
_tracker.insert(*entry);
return _partitions.insert(i, *entry);
}, [&] (auto i) { // visit
_tracker.on_miss_already_populated();
cache_entry& e = *i;
e.partition().open_version(*e.schema(), phase).partition().apply(t);
_tracker.touch(e);
@@ -760,7 +802,7 @@ future<> row_cache::do_update(memtable& m, Updater updater) {
if (m.partitions.empty()) {
_prev_snapshot_pos = {};
} else {
_prev_snapshot_pos = m.partitions.begin()->key();
_prev_snapshot_pos = dht::ring_position(m.partitions.begin()->key());
}
});
STAP_PROBE1(scylla, row_cache_update_one_batch_end, quota_before - quota);
@@ -790,13 +832,12 @@ future<> row_cache::update(memtable& m, partition_presence_checker is_present) {
entry.partition().apply_to_incomplete(*_schema, std::move(mem_e.partition()), *mem_e.schema());
_tracker.touch(entry);
_tracker.on_merge();
} else if (is_present(mem_e.key()) == partition_presence_checker_result::definitely_doesnt_exist) {
} else if (cache_i->continuous() || is_present(mem_e.key()) == partition_presence_checker_result::definitely_doesnt_exist) {
cache_entry* entry = current_allocator().construct<cache_entry>(
mem_e.schema(), std::move(mem_e.key()), std::move(mem_e.partition()));
entry->set_continuous(cache_i->continuous());
_tracker.insert(*entry);
_partitions.insert(cache_i, *entry);
} else {
_tracker.clear_continuity(*cache_i);
}
});
}
@@ -815,6 +856,10 @@ future<> row_cache::update_invalidating(memtable& m) {
});
}
void row_cache::refresh_snapshot() {
_underlying = _snapshot_source();
}
void row_cache::touch(const dht::decorated_key& dk) {
_read_section(_tracker.region(), [&] {
with_linearized_managed_bytes([&] {


@@ -185,23 +185,35 @@ public:
using lru_type = bi::list<cache_entry,
bi::member_hook<cache_entry, cache_entry::lru_link_type, &cache_entry::_lru_link>,
bi::constant_time_size<false>>; // we need this to have bi::auto_unlink on hooks.
private:
// We will try to evict large partition after that many normal evictions
const uint32_t _normal_large_eviction_ratio = 1000;
// Number of normal evictions to perform before we try to evict large partition
uint32_t _normal_eviction_count = _normal_large_eviction_ratio;
public:
friend class row_cache;
friend class cache::read_context;
friend class cache::autoupdating_underlying_reader;
friend class cache::cache_streamed_mutation;
struct stats {
uint64_t hits;
uint64_t misses;
uint64_t insertions;
uint64_t partition_hits;
uint64_t partition_misses;
uint64_t row_hits;
uint64_t row_misses;
uint64_t partition_insertions;
uint64_t row_insertions;
uint64_t concurrent_misses_same_key;
uint64_t merges;
uint64_t evictions;
uint64_t removals;
uint64_t partition_merges;
uint64_t partition_evictions;
uint64_t partition_removals;
uint64_t partitions;
uint64_t modification_count;
uint64_t mispopulations;
uint64_t underlying_recreations;
uint64_t underlying_partition_skips;
uint64_t underlying_row_skips;
uint64_t reads;
uint64_t reads_with_misses;
uint64_t reads_done;
uint64_t active_reads() const {
return reads - reads_done;
}
};
private:
stats _stats{};
@@ -219,8 +231,10 @@ public:
void clear_continuity(cache_entry& ce);
void on_erase();
void on_merge();
void on_hit();
void on_miss();
void on_partition_hit();
void on_partition_miss();
void on_row_hit();
void on_row_miss();
void on_miss_already_populated();
void on_mispopulate();
allocation_strategy& allocator();
@@ -263,6 +277,8 @@ public:
struct stats {
utils::timed_rate_moving_average hits;
utils::timed_rate_moving_average misses;
utils::timed_rate_moving_average reads_with_misses;
utils::timed_rate_moving_average reads_with_no_misses;
};
private:
cache_tracker& _tracker;
@@ -313,8 +329,12 @@ private:
logalloc::allocating_section _read_section;
mutation_reader create_underlying_reader(cache::read_context&, mutation_source&, const dht::partition_range&);
mutation_reader make_scanning_reader(const dht::partition_range&, lw_shared_ptr<cache::read_context>);
void on_hit();
void on_miss();
void on_partition_hit();
void on_partition_miss();
void on_row_hit();
void on_row_miss();
void on_row_insert();
void on_mispopulate();
void upgrade_entry(cache_entry&);
void invalidate_locked(const dht::decorated_key&);
void invalidate_unwrapped(const dht::partition_range&);
@@ -422,6 +442,10 @@ public:
// as few elements as possible.
future<> update_invalidating(memtable&);
// Refreshes snapshot. Must only be used if logical state in the underlying data
// source hasn't changed.
void refresh_snapshot();
// Moves given partition to the front of LRU if present in cache.
void touch(const dht::decorated_key&);
@@ -449,7 +473,7 @@ public:
// If it did, use invalidate() instead.
void evict(const dht::partition_range& = query::full_partition_range);
auto num_entries() const {
size_t partitions() const {
return _partitions.size();
}
const cache_tracker& get_cache_tracker() const {

schema.cc

@@ -105,6 +105,97 @@ schema::make_column_specification(const column_definition& def) {
return ::make_shared<cql3::column_specification>(_raw._ks_name, _raw._cf_name, std::move(id), def.type);
}
v3_columns::v3_columns(std::vector<column_definition> cols, bool is_dense, bool is_compound)
: _is_dense(is_dense)
, _is_compound(is_compound)
, _columns(std::move(cols))
{
for (column_definition& def : _columns) {
_columns_by_name[def.name()] = &def;
}
}
v3_columns v3_columns::from_v2_schema(const schema& s) {
data_type static_column_name_type = utf8_type;
std::vector<column_definition> cols;
if (s.is_static_compact_table()) {
if (s.has_static_columns()) {
throw std::runtime_error(
sprint("v2 static compact table should not have static columns: %s.%s", s.ks_name(), s.cf_name()));
}
if (s.clustering_key_size()) {
throw std::runtime_error(
sprint("v2 static compact table should not have clustering columns: %s.%s", s.ks_name(), s.cf_name()));
}
static_column_name_type = s.regular_column_name_type();
for (auto& c : s.all_columns()) {
// Note that for "static" no-clustering compact storage we use static for the defined columns
if (c.kind == column_kind::regular_column) {
auto new_def = c;
new_def.kind = column_kind::static_column;
cols.push_back(new_def);
} else {
cols.push_back(c);
}
}
schema_builder::default_names names(s._raw);
cols.emplace_back(to_bytes(names.clustering_name()), static_column_name_type, column_kind::clustering_key, 0);
cols.emplace_back(to_bytes(names.compact_value_name()), s.make_legacy_default_validator(), column_kind::regular_column, 0);
} else {
cols = s.all_columns();
}
for (column_definition& def : cols) {
data_type name_type = def.is_static() ? static_column_name_type : utf8_type;
auto id = ::make_shared<cql3::column_identifier>(def.name(), name_type);
def.column_specification = ::make_shared<cql3::column_specification>(s.ks_name(), s.cf_name(), std::move(id), def.type);
}
return v3_columns(std::move(cols), s.is_dense(), s.is_compound());
}
void v3_columns::apply_to(schema_builder& builder) const {
if (is_static_compact()) {
for (auto& c : _columns) {
if (c.kind == column_kind::regular_column) {
builder.set_default_validation_class(c.type);
} else if (c.kind == column_kind::static_column) {
auto new_def = c;
new_def.kind = column_kind::regular_column;
builder.with_column(new_def);
} else if (c.kind == column_kind::clustering_key) {
builder.set_regular_column_name_type(c.type);
} else {
builder.with_column(c);
}
}
} else {
for (auto& c : _columns) {
if (is_compact() && c.kind == column_kind::regular_column) {
builder.set_default_validation_class(c.type);
}
builder.with_column(c);
}
}
}
bool v3_columns::is_static_compact() const {
return !_is_dense && !_is_compound;
}
bool v3_columns::is_compact() const {
return _is_dense || !_is_compound;
}
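The two predicates above encode the dense/compound flag combinations: a table is "static compact" when it is neither dense nor compound, and "compact" when it is dense or non-compound. A minimal restatement of just that truth table, as free functions rather than `v3_columns` members:

```cpp
#include <cassert>

// Static compact storage: neither dense nor compound.
constexpr bool is_static_compact(bool dense, bool compound) {
    return !dense && !compound;
}

// Compact storage: dense, or not compound.
constexpr bool is_compact(bool dense, bool compound) {
    return dense || !compound;
}
```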
const std::unordered_map<bytes, const column_definition*>& v3_columns::columns_by_name() const {
return _columns_by_name;
}
const std::vector<column_definition>& v3_columns::all_columns() const {
return _columns;
}
void schema::rebuild() {
_partition_key_type = make_lw_shared<compound_type<>>(get_column_types(partition_key_columns()));
_clustering_key_type = make_lw_shared<compound_prefix>(get_column_types(clustering_key_columns()));
@@ -117,10 +208,10 @@ void schema::rebuild() {
}
static_assert(row_column_ids_are_ordered_by_name::value, "row columns must be ordered by name");
if (!std::is_sorted(regular_columns().begin(), regular_columns().end(), column_definition::name_comparator())) {
if (!std::is_sorted(regular_columns().begin(), regular_columns().end(), column_definition::name_comparator(regular_column_name_type()))) {
throw std::runtime_error("Regular columns should be sorted by name");
}
if (!std::is_sorted(static_columns().begin(), static_columns().end(), column_definition::name_comparator())) {
if (!std::is_sorted(static_columns().begin(), static_columns().end(), column_definition::name_comparator(static_column_name_type()))) {
throw std::runtime_error("Static columns should be sorted by name");
}
@@ -137,7 +228,7 @@ void schema::rebuild() {
}
thrift()._compound = is_compound();
thrift()._is_dynamic = static_columns_count() == 0;
thrift()._is_dynamic = clustering_key_size() > 0;
if (is_counter()) {
for (auto&& cdef : boost::range::join(static_columns(), regular_columns())) {
@@ -152,6 +243,8 @@ void schema::rebuild() {
}
}
}
_v3_columns = v3_columns::from_v2_schema(*this);
}
const column_mapping& schema::get_column_mapping() const {
@@ -189,24 +282,15 @@ schema::schema(const raw_schema& raw, stdx::optional<raw_view_info> raw_view_inf
}())
, _regular_columns_by_name(serialized_compare(_raw._regular_column_name_type))
{
struct name_compare {
data_type type;
name_compare(data_type type) : type(type) {}
bool operator()(const column_definition& cd1, const column_definition& cd2) const {
return type->less(cd1.name(), cd2.name());
}
};
std::sort(
_raw._columns.begin() + column_offset(column_kind::static_column),
_raw._columns.begin()
+ column_offset(column_kind::regular_column),
name_compare(utf8_type));
column_definition::name_comparator(static_column_name_type()));
std::sort(
_raw._columns.begin()
+ column_offset(column_kind::regular_column),
_raw._columns.end(), name_compare(regular_column_name_type()));
_raw._columns.end(), column_definition::name_comparator(regular_column_name_type()));
std::sort(_raw._columns.begin(),
_raw._columns.begin() + column_offset(column_kind::clustering_key),
@@ -360,6 +444,7 @@ bool operator==(const schema& x, const schema& y)
&& x._raw._speculative_retry == y._raw._speculative_retry
&& x._raw._compaction_strategy == y._raw._compaction_strategy
&& x._raw._compaction_strategy_options == y._raw._compaction_strategy_options
&& x._raw._compaction_enabled == y._raw._compaction_enabled
&& x._raw._caching_options == y._raw._caching_options
&& x._raw._dropped_columns == y._raw._dropped_columns
&& x._raw._collections == y._raw._collections
@@ -478,11 +563,10 @@ std::ostream& operator<<(std::ostream& os, const schema& s) {
os << ",compactionStrategyOptions={";
n = 0;
for (auto& p : s._raw._compaction_strategy_options) {
if (n++ != 0) {
os << ", ";
}
os << p.first << "=" << p.second;
os << ", ";
}
os << "enabled=" << std::boolalpha << s._raw._compaction_enabled;
os << "}";
os << ",compressionParameters={";
n = 0;
@@ -500,7 +584,6 @@ std::ostream& operator<<(std::ostream& os, const schema& s) {
os << ",minIndexInterval=" << s._raw._min_index_interval;
os << ",maxIndexInterval=" << s._raw._max_index_interval;
os << ",speculativeRetry=" << s._raw._speculative_retry.to_sstring();
os << ",droppedColumns={}";
os << ",triggers=[]";
os << ",isDense=" << std::boolalpha << s._raw._is_dense;
os << ",version=" << s.version();
@@ -642,11 +725,7 @@ schema_builder& schema_builder::without_column(bytes name)
return column.name() == name;
});
assert(it != _raw._columns.end());
auto now = api::new_timestamp();
auto ret = _raw._dropped_columns.emplace(it->name_as_text(), schema::dropped_column{it->type, now});
if (!ret.second) {
ret.first->second.timestamp = std::max(ret.first->second.timestamp, now);
}
without_column(it->name_as_text(), it->type, api::new_timestamp());
_raw._columns.erase(it);
return *this;
}
@@ -658,8 +737,9 @@ schema_builder& schema_builder::without_column(sstring name, api::timestamp_type
schema_builder& schema_builder::without_column(sstring name, data_type type, api::timestamp_type timestamp)
{
auto ret = _raw._dropped_columns.emplace(name, schema::dropped_column{type, timestamp});
if (!ret.second) {
ret.first->second.timestamp = std::max(ret.first->second.timestamp, timestamp);
if (!ret.second && ret.first->second.timestamp < timestamp) {
ret.first->second.type = type;
ret.first->second.timestamp = timestamp;
}
return *this;
}
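The merge rule in the reworked `without_column()` above can be sketched as follows. This is a simplified model: `dropped_rec` and `record_drop` are illustrative stand-ins for `schema::dropped_column` and the `_raw._dropped_columns.emplace` logic.

```cpp
#include <string>
#include <unordered_map>
#include <utility>

// Sketch of the merge rule above: when a column is dropped again, the
// recorded entry is replaced only if the new drop is strictly newer,
// so both the type and the timestamp always describe the latest drop.
struct dropped_rec { std::string type; long timestamp; };

void record_drop(std::unordered_map<std::string, dropped_rec>& dropped,
                 const std::string& name, std::string type, long timestamp) {
    auto ret = dropped.emplace(name, dropped_rec{type, timestamp});
    if (!ret.second && ret.first->second.timestamp < timestamp) {
        ret.first->second.type = std::move(type);   // newer drop wins wholesale
        ret.first->second.timestamp = timestamp;
    }
}
```

Note that the old code updated only the timestamp on re-drop; the new code replaces the type together with it, keeping the record internally consistent.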
@@ -751,10 +831,8 @@ sstring schema_builder::default_names::compact_value_name() {
void schema_builder::prepare_dense_schema(schema::raw_schema& raw) {
auto is_dense = raw._is_dense;
auto is_compound = raw._is_compound;
auto is_static_compact = !is_dense && !is_compound;
auto is_compact_table = is_dense || !is_compound;
if (is_compact_table) {
auto count_kind = [&raw](column_kind kind) {
return std::count_if(raw._columns.begin(), raw._columns.end(), [kind](const column_definition& c) {
@@ -764,37 +842,7 @@ void schema_builder::prepare_dense_schema(schema::raw_schema& raw) {
default_names names(raw);
if (is_static_compact) {
/**
* In origin v3 the general cql-ification of the "storage engine" means
* that "static compact" tables are expressed as all defined columns static,
* but with synthetic clustering + regular columns.
* We unfortunately need to play along with this, both because we want
* schema tables on disk to be compatible (and they are explicit).
* More to the point, we are, at least until we upgrade to version "m"
* sstables, stuck with having origins java tools reading our schema tables
* for table schemas (this btw applies to db drivers too, though maybe a little
* less), and it asserts badly if we don't uphold the origin table tweaks.
*
* So transform away...
*
*/
if (!count_kind(column_kind::static_column)) {
assert(!count_kind(column_kind::clustering_key));
for (auto& c : raw._columns) {
// Note that for "static" no-clustering compact storage we use static for the defined columns
if (c.kind == column_kind::regular_column) {
c.kind = column_kind::static_column;
}
}
// Compact tables always have a clustering and a single regular value.
raw._columns.emplace_back(to_bytes(names.clustering_name()),
utf8_type, column_kind::clustering_key, 0);
raw._columns.emplace_back(to_bytes(names.compact_value_name()),
raw._is_counter ? counter_type : bytes_type,
column_kind::regular_column, 0);
}
} else if (is_dense) {
if (is_dense) {
auto regular_cols = count_kind(column_kind::regular_column);
// In Origin, dense CFs always have at least one regular column
if (regular_cols == 0) {
@@ -838,6 +886,10 @@ schema_ptr schema_builder::build() {
new_raw._version = utils::UUID_gen::get_time_UUID();
}
if (new_raw._is_counter) {
new_raw._default_validation_class = counter_type;
}
if (_compact_storage) {
// Dense means that no part of the comparator stores a CQL column name. This means
// COMPACT STORAGE with at least one columnAliases (otherwise it's a thrift "static" CF).
@@ -1032,7 +1084,10 @@ schema::static_upper_bound(const bytes& name) const {
}
data_type
schema::column_name_type(const column_definition& def) const {
return def.kind == column_kind::regular_column ? _raw._regular_column_name_type : utf8_type;
if (def.kind == column_kind::regular_column) {
return _raw._regular_column_name_type;
}
return utf8_type;
}
const column_definition&
@@ -1043,6 +1098,14 @@ schema::regular_column_at(column_id id) const {
return _raw._columns.at(column_offset(column_kind::regular_column) + id);
}
const column_definition&
schema::clustering_column_at(column_id id) const {
if (id >= clustering_key_size()) {
throw std::out_of_range(sprint("clustering column id %d >= %d", id, clustering_key_size()));
}
return _raw._columns.at(column_offset(column_kind::clustering_key) + id);
}
const column_definition&
schema::static_column_at(column_id id) const {
if (id >= static_columns_count()) {
@@ -1119,12 +1182,8 @@ schema::select_order_range schema::all_columns_in_select_order() const {
@@ -1119,12 +1182,8 @@ schema::select_order_range schema::all_columns_in_select_order() const {
_raw._columns.begin() + (is_static_compact_table ?
column_offset(column_kind::clustering_key) :
column_offset(column_kind::static_column)));
auto ck_v_range =
(is_static_compact_table || no_non_pk_columns) ?
static_columns() :
const_iterator_range_type(
static_columns().begin(),
all_columns().end());
auto ck_v_range = no_non_pk_columns ? static_columns()
: const_iterator_range_type(static_columns().begin(), all_columns().end());
return boost::range::join(pk_range, ck_v_range);
}
@@ -1163,23 +1222,7 @@ std::vector<sstring> schema::index_names() const {
}
data_type schema::make_legacy_default_validator() const {
if (is_counter()) {
return counter_type;
}
if (is_compact_table()) {
// See CFMetaData.
if (is_super()) {
for (auto& c : regular_columns()) {
if (c.name().empty()) {
return c.type;
}
}
assert("Invalid super column table definition, no 'dynamic' map column");
} else {
return regular_columns().begin()->type;
}
}
return bytes_type;
return _raw._default_validation_class;
}
bool schema::is_synced() const {


@@ -193,8 +193,10 @@ public:
class column_definition final {
public:
struct name_comparator {
bool operator()(const column_definition& d1, const column_definition& d2) const {
return d1.name() < d2.name();
data_type type;
name_comparator(data_type type) : type(type) {}
bool operator()(const column_definition& cd1, const column_definition& cd2) const {
return type->less(cd1.name(), cd2.name());
}
};
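The reason the comparator above now carries a type: raw lexicographic byte order and the name type's semantic order can disagree, so sorting must go through `type->less()`. Big-endian `int32` names are used below as an illustrative stand-in for a non-UTF8 name comparator; the helpers are sketches, not the real serialization code.

```cpp
#include <array>
#include <cstdint>

// -1 serializes to 0xFFFFFFFF, which raw byte order puts *after* 1
// (0x00000001), while the int32 type's order puts it before.
using name_bytes = std::array<uint8_t, 4>;

name_bytes serialize_int32(int32_t v) {          // big-endian, as in CQL
    auto u = static_cast<uint32_t>(v);
    return { uint8_t(u >> 24), uint8_t(u >> 16), uint8_t(u >> 8), uint8_t(u) };
}

bool bytes_less(const name_bytes& a, const name_bytes& b) {
    return a < b;                                // what the old comparator did
}

bool int32_type_less(const name_bytes& a, const name_bytes& b) {
    auto decode = [](const name_bytes& n) {
        return static_cast<int32_t>((uint32_t(n[0]) << 24) | (uint32_t(n[1]) << 16)
                                  | (uint32_t(n[2]) << 8) | uint32_t(n[3]));
    };
    return decode(a) < decode(b);                // what type->less() does
}
```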
private:
@@ -234,6 +236,7 @@ public:
bool is_clustering_key() const { return kind == column_kind::clustering_key; }
bool is_primary_key() const { return kind == column_kind::partition_key || kind == column_kind::clustering_key; }
bool is_atomic() const { return _is_atomic; }
bool is_multi_cell() const { return !_is_atomic; }
bool is_counter() const { return _is_counter; }
const sstring& name_as_text() const;
const bytes& name() const;
@@ -378,12 +381,40 @@ std::ostream& operator<<(std::ostream& os, const raw_view_info& view);
class view_info;
// Represents a column set which is compatible with Cassandra 3.x.
//
// This layout differs from the layout Scylla uses in schema/schema_builder for static compact tables.
// For such tables, Scylla expects all columns to be of regular type and no clustering columns,
// whereas in v3 those columns are static and there is a clustering column with type matching the
// cell name comparator and a regular column with type matching the default validator.
// See issues #2555 and #1474.
class v3_columns {
bool _is_dense = false;
bool _is_compound = false;
std::vector<column_definition> _columns;
std::unordered_map<bytes, const column_definition*> _columns_by_name;
public:
v3_columns(std::vector<column_definition> columns, bool is_dense, bool is_compound);
v3_columns() = default;
v3_columns(v3_columns&&) = default;
v3_columns& operator=(v3_columns&&) = default;
v3_columns(const v3_columns&) = delete;
static v3_columns from_v2_schema(const schema&);
public:
const std::vector<column_definition>& all_columns() const;
const std::unordered_map<bytes, const column_definition*>& columns_by_name() const;
bool is_static_compact() const;
bool is_compact() const;
void apply_to(schema_builder&) const;
};
/*
* Effectively immutable.
* Not safe to access across cores because of shared_ptr's.
* Use global_schema_ptr for safe across-shard access.
*/
class schema final : public enable_lw_shared_from_this<schema> {
friend class v3_columns;
public:
struct dropped_column {
data_type type;
@@ -406,6 +437,7 @@ private:
sstring _comment;
gc_clock::duration _default_time_to_live = gc_clock::duration::zero();
data_type _regular_column_name_type;
data_type _default_validation_class = bytes_type;
double _bloom_filter_fp_chance = 0.01;
compression_parameters _compressor_params;
bool _is_dense = false;
@@ -426,6 +458,7 @@ private:
// we will use by default - when we have the choice.
sstables::compaction_strategy_type _compaction_strategy = sstables::compaction_strategy_type::size_tiered;
std::map<sstring, sstring> _compaction_strategy_options;
bool _compaction_enabled = true;
caching_options _caching_options;
table_schema_version _version;
std::unordered_map<sstring, dropped_column> _dropped_columns;
@@ -434,6 +467,7 @@ private:
};
raw_schema _raw;
thrift_schema _thrift;
v3_columns _v3_columns;
mutable schema_registry_entry* _registry_entry = nullptr;
std::unique_ptr<::view_info> _view_info;
@@ -570,14 +604,22 @@ public:
return _raw._memtable_flush_period;
}
sstables::compaction_strategy_type compaction_strategy() const {
sstables::compaction_strategy_type configured_compaction_strategy() const {
return _raw._compaction_strategy;
}
sstables::compaction_strategy_type compaction_strategy() const {
return _raw._compaction_enabled ? _raw._compaction_strategy : sstables::compaction_strategy_type::null;
}
const std::map<sstring, sstring>& compaction_strategy_options() const {
return _raw._compaction_strategy_options;
}
bool compaction_enabled() const {
return _raw._compaction_enabled;
}
const ::speculative_retry& speculative_retry() const {
return _raw._speculative_retry;
}
@@ -597,6 +639,7 @@ public:
const_iterator static_lower_bound(const bytes& name) const;
const_iterator static_upper_bound(const bytes& name) const;
data_type column_name_type(const column_definition& def) const;
const column_definition& clustering_column_at(column_id id) const;
const column_definition& regular_column_at(column_id id) const;
const column_definition& static_column_at(column_id id) const;
bool is_last_partition_key(const column_definition& def) const;
@@ -662,6 +705,9 @@ public:
const data_type& regular_column_name_type() const {
return _raw._regular_column_name_type;
}
const data_type& static_column_name_type() const {
return utf8_type;
}
const std::unique_ptr<::view_info>& view_info() const {
return _view_info;
}
@@ -689,6 +735,10 @@ public:
// recent as this version.
bool is_synced() const;
bool equal_columns(const schema&) const;
public:
const v3_columns& v3() const {
return _v3_columns;
}
};
bool operator==(const schema&, const schema&);


@@ -50,6 +50,10 @@ public:
_raw._regular_column_name_type = t;
return *this;
}
schema_builder& set_default_validation_class(const data_type& t) {
_raw._default_validation_class = t;
return *this;
}
const data_type& regular_column_name_type() const {
return _raw._regular_column_name_type;
}
@@ -128,6 +132,15 @@ public:
return _raw._max_compaction_threshold;
}
schema_builder& set_compaction_enabled(bool enabled) {
_raw._compaction_enabled = enabled;
return *this;
}
bool compaction_enabled() const {
return _raw._compaction_enabled;
}
schema_builder& set_min_index_interval(int32_t t) {
_raw._min_index_interval = t;
return *this;
@@ -246,6 +259,10 @@ public:
schema_builder& with_index(const index_metadata& im);
schema_builder& without_index(const sstring& name);
default_names get_default_names() const {
return default_names(_raw);
}
// Equivalent to with(cp).build()
schema_ptr build(compact_storage cp);


@@ -28,42 +28,75 @@ schema_mutations::schema_mutations(canonical_mutation columnfamilies,
canonical_mutation columns,
bool is_view,
stdx::optional<canonical_mutation> indices,
stdx::optional<canonical_mutation> dropped_columns)
stdx::optional<canonical_mutation> dropped_columns,
stdx::optional<canonical_mutation> scylla_tables)
: _columnfamilies(columnfamilies.to_mutation(is_view ? db::schema_tables::views() : db::schema_tables::tables()))
, _columns(columns.to_mutation(db::schema_tables::columns()))
, _indices(indices ? stdx::optional<mutation>{indices.value().to_mutation(db::schema_tables::indexes())} : stdx::nullopt)
, _dropped_columns(dropped_columns ? stdx::optional<mutation>{dropped_columns.value().to_mutation(db::schema_tables::dropped_columns())} : stdx::nullopt)
, _indices(indices ? mutation_opt{indices.value().to_mutation(db::schema_tables::indexes())} : stdx::nullopt)
, _dropped_columns(dropped_columns ? mutation_opt{dropped_columns.value().to_mutation(db::schema_tables::dropped_columns())} : stdx::nullopt)
, _scylla_tables(scylla_tables ? mutation_opt{scylla_tables.value().to_mutation(db::schema_tables::scylla_tables())} : stdx::nullopt)
{}
void schema_mutations::copy_to(std::vector<mutation>& dst) const {
dst.push_back(_columnfamilies);
dst.push_back(_columns);
if (_indices) {
dst.push_back(_indices.value());
dst.push_back(*_indices);
}
if (_dropped_columns) {
dst.push_back(_dropped_columns.value());
dst.push_back(*_dropped_columns);
}
if (_scylla_tables) {
dst.push_back(*_scylla_tables);
}
}
table_schema_version schema_mutations::digest() const {
if (_scylla_tables) {
auto rs = query::result_set(*_scylla_tables);
if (!rs.empty()) {
auto&& row = rs.row(0);
if (row.has("version")) {
auto val = row.get<utils::UUID>("version");
if (val) {
return *val;
}
}
}
}
md5_hasher h;
db::schema_tables::feed_hash_for_schema_digest(h, _columnfamilies);
db::schema_tables::feed_hash_for_schema_digest(h, _columns);
if (_indices && !_indices.value().partition().empty()) {
db::schema_tables::feed_hash_for_schema_digest(h, _indices.value());
if (_indices && !_indices->partition().empty()) {
db::schema_tables::feed_hash_for_schema_digest(h, *_indices);
}
if (_dropped_columns && !_dropped_columns.value().partition().empty()) {
db::schema_tables::feed_hash_for_schema_digest(h, _dropped_columns.value());
if (_dropped_columns && !_dropped_columns->partition().empty()) {
db::schema_tables::feed_hash_for_schema_digest(h, *_dropped_columns);
}
if (_scylla_tables) {
db::schema_tables::feed_hash_for_schema_digest(h, *_scylla_tables);
}
return utils::UUID_gen::get_name_UUID(h.finalize());
}
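The `digest()` logic above can be sketched with simplified types: if the `scylla_tables` row carries an explicit version, it is used verbatim; otherwise the schema mutations are hashed. `std::string` stands in for `utils::UUID`, and `std::hash` for `md5_hasher`; both substitutions are illustrative only.

```cpp
#include <functional>
#include <optional>
#include <string>
#include <vector>

// Sketch of the digest fallback above: an explicit version short-circuits
// the hash, which keeps digests stable across nodes that stored one.
std::string schema_digest(const std::optional<std::string>& explicit_version,
                          const std::vector<std::string>& mutations) {
    if (explicit_version) {
        return *explicit_version;       // authoritative, no hashing needed
    }
    std::size_t h = 0;
    for (const auto& m : mutations) {   // order-sensitive combine
        h = h * 31 + std::hash<std::string>{}(m);
    }
    return "digest-" + std::to_string(h);
}
```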
static mutation_opt compact(const mutation_opt& m) {
if (!m) {
return m;
}
return db::schema_tables::compact_for_schema_digest(*m);
}
static mutation_opt compact(const mutation& m) {
return db::schema_tables::compact_for_schema_digest(m);
}
bool schema_mutations::operator==(const schema_mutations& other) const {
return _columnfamilies == other._columnfamilies
&& _columns == other._columns
&& _indices == other._indices
&& _dropped_columns == other._dropped_columns
return compact(_columnfamilies) == compact(other._columnfamilies)
&& compact(_columns) == compact(other._columns)
&& compact(_indices) == compact(other._indices)
&& compact(_dropped_columns) == compact(other._dropped_columns)
&& compact(_scylla_tables) == compact(other._scylla_tables)
;
}


@@ -27,23 +27,28 @@
#include "canonical_mutation.hh"
// Commutative representation of table schema
// Equality ignores tombstones.
class schema_mutations {
mutation _columnfamilies;
mutation _columns;
stdx::optional<mutation> _indices;
stdx::optional<mutation> _dropped_columns;
mutation_opt _indices;
mutation_opt _dropped_columns;
mutation_opt _scylla_tables;
public:
schema_mutations(mutation columnfamilies, mutation columns, stdx::optional<mutation> indices, stdx::optional<mutation> dropped_columns)
schema_mutations(mutation columnfamilies, mutation columns, mutation_opt indices, mutation_opt dropped_columns,
mutation_opt scylla_tables)
: _columnfamilies(std::move(columnfamilies))
, _columns(std::move(columns))
, _indices(std::move(indices))
, _dropped_columns(std::move(dropped_columns))
, _scylla_tables(std::move(scylla_tables))
{ }
schema_mutations(canonical_mutation columnfamilies,
canonical_mutation columns,
bool is_view,
stdx::optional<canonical_mutation> indices,
stdx::optional<canonical_mutation> dropped_columns);
stdx::optional<canonical_mutation> dropped_columns,
stdx::optional<canonical_mutation> scylla_tables);
schema_mutations(schema_mutations&&) = default;
schema_mutations& operator=(schema_mutations&&) = default;
@@ -60,10 +65,18 @@ public:
return _columns;
}
const stdx::optional<mutation>& indices_mutation() const {
const mutation_opt& scylla_tables() const {
return _scylla_tables;
}
mutation_opt& scylla_tables() {
return _scylla_tables;
}
const mutation_opt& indices_mutation() const {
return _indices;
}
const stdx::optional<mutation>& dropped_columns_mutation() const {
const mutation_opt& dropped_columns_mutation() const {
return _dropped_columns;
}
@@ -77,13 +90,19 @@ public:
stdx::optional<canonical_mutation> indices_canonical_mutation() const {
if (_indices) {
return canonical_mutation(_indices.value());
return canonical_mutation(*_indices);
}
return {};
}
stdx::optional<canonical_mutation> dropped_columns_canonical_mutation() const {
if (_dropped_columns) {
return canonical_mutation(_dropped_columns.value());
return canonical_mutation(*_dropped_columns);
}
return {};
}
stdx::optional<canonical_mutation> scylla_tables_canonical_mutation() const {
if (_scylla_tables) {
return canonical_mutation(*_scylla_tables);
}
return {};
}


@@ -48,7 +48,17 @@ schema_registry_entry::schema_registry_entry(table_schema_version v, schema_regi
, _version(v)
, _registry(r)
, _sync_state(sync_state::NOT_SYNCED)
{ }
{
_erase_timer.set_callback([this] {
slogger.debug("Dropping {}", _version);
assert(!_schema);
try {
_registry._entries.erase(_version);
} catch (...) {
slogger.error("Failed to erase schema version {}: {}", _version, std::current_exception());
}
});
}
schema_ptr schema_registry::learn(const schema_ptr& s) {
if (s->registry_entry()) {
@@ -173,6 +183,7 @@ schema_ptr schema_registry_entry::get_schema() {
if (s->version() != _version) {
throw std::runtime_error(sprint("Unfrozen schema version doesn't match entry version (%s): %s", _version, *s));
}
_erase_timer.cancel();
s->_registry_entry = this;
_schema = &*s;
return s;
@@ -184,12 +195,7 @@ schema_ptr schema_registry_entry::get_schema() {
void schema_registry_entry::detach_schema() noexcept {
slogger.trace("Deactivating {}", _version);
_schema = nullptr;
// TODO: keep the entry for a while (timer)
try {
_registry._entries.erase(_version);
} catch (...) {
slogger.error("Failed to erase schema version {}: {}", _version, std::current_exception());
}
_erase_timer.arm(_registry.grace_period());
}
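The deferred-eviction pattern above (arm a timer on detach, cancel it on reattach) can be modeled with a plain flag; `registry_entry_sketch` and `fire_timer()` are illustrative stand-ins for `schema_registry_entry` and the timer callback.

```cpp
// Sketch of the pattern above: detach_schema() arms a timer instead of
// erasing the entry immediately, and get_schema() cancels it, so a
// version dropped and re-learned within the grace period does not need
// to be re-fetched and re-synced.
struct registry_entry_sketch {
    bool timer_armed = false;
    bool erased = false;
    void detach_schema() { timer_armed = true; }    // _erase_timer.arm(grace_period())
    void reattach_schema() { timer_armed = false; } // _erase_timer.cancel()
    void fire_timer() { if (timer_armed) { erased = true; } }
};
```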
frozen_schema schema_registry_entry::frozen() const {
@@ -273,9 +279,9 @@ schema_ptr global_schema_ptr::get() const {
s = local_schema_registry().get_or_load(e.version(), [&e](table_schema_version) {
return e.frozen();
});
if (e.is_synced()) {
s->registry_entry()->mark_synced();
}
}
if (e.is_synced()) {
s->registry_entry()->mark_synced();
}
return s;
}


@@ -55,6 +55,8 @@ public:
// In addition to the above the entry is controlled by lw_shared_ptr<> to cope with races between loaders.
//
class schema_registry_entry : public enable_lw_shared_from_this<schema_registry_entry> {
using erase_clock = seastar::lowres_clock;
enum class state {
INITIAL, LOADING, LOADED
};
@@ -74,6 +76,7 @@ class schema_registry_entry : public enable_lw_shared_from_this<schema_registry_
enum class sync_state { NOT_SYNCED, SYNCING, SYNCED };
sync_state _sync_state;
shared_promise<> _synced_promise; // valid when _sync_state == SYNCING
timer<erase_clock> _erase_timer;
friend class schema_registry;
public:
@@ -110,6 +113,11 @@ class schema_registry {
std::unordered_map<table_schema_version, lw_shared_ptr<schema_registry_entry>> _entries;
friend class schema_registry_entry;
schema_registry_entry& get_entry(table_schema_version) const;
// Duration for which unused entries are kept alive to avoid
// too frequent re-requests and syncs.
schema_registry_entry::erase_clock::duration grace_period() const {
return std::chrono::seconds(1);
}
public:
// Looks up schema by version or loads it using supplied loader.
schema_ptr get_or_load(table_schema_version, const schema_loader&);


@@ -190,7 +190,7 @@ class scylla_column_families(gdb.Command):
db = find_db(shard)
cfs = db['_column_families']
for (key, value) in list_unordered_map(cfs):
value = value['_p']['_value'] # it's a lw_shared_ptr
value = value['_p'].reinterpret_cast(gdb.lookup_type('column_family').pointer()).dereference() # it's a lw_shared_ptr
schema = value['_schema']['_p'].reinterpret_cast(gdb.lookup_type('schema').pointer())
name = str(schema['_raw']['_ks_name']) + '/' + str(schema['_raw']['_cf_name'])
schema_version = str(schema['_raw']['_version'])

Submodule seastar updated: 0ab7ae5b88...b85b0fa07f


@@ -133,7 +133,7 @@ future<> service::client_state::has_access(const sstring& ks, auth::permission p
// prevent system keyspace modification
auto name = ks;
std::transform(name.begin(), name.end(), name.begin(), ::tolower);
if (name == db::system_keyspace::NAME) {
if (is_system_keyspace(name)) {
throw exceptions::unauthorized_exception(ks + " keyspace is not user-modifiable.");
}


@@ -86,7 +86,12 @@ void migration_manager::init_messaging_service()
});
return netw::messaging_service::no_wait();
});
ms.register_migration_request([this] () {
ms.register_migration_request([this] (const rpc::client_info& cinfo) {
auto src = netw::messaging_service::get_source(cinfo);
if (!has_compatible_schema_tables_version(src.addr)) {
mlogger.debug("Ignoring schema request from incompatible node: {}", src);
return make_ready_future<std::vector<frozen_mutation>>(std::vector<frozen_mutation>());
}
return db::schema_tables::convert_schema_to_mutations(get_storage_proxy()).finally([p = get_local_shared_storage_proxy()] {
// keep local proxy alive
});
@@ -133,12 +138,15 @@ bool migration_manager::is_ready_for_bootstrap() {
if (endpoint == utils::fb_utilities::get_broadcast_address() || !eps.is_alive()) {
continue;
}
mlogger.debug("Checking schema state for {}.", endpoint);
auto schema = eps.get_application_state(gms::application_state::SCHEMA);
if (!schema) {
mlogger.debug("Schema state not yet available for {}.", endpoint);
return false;
}
utils::UUID remote_version{schema->value};
if (our_version != remote_version) {
mlogger.debug("Schema mismatch for {} ({} != {}).", endpoint, our_version, remote_version);
return false;
} else {
match = true;
@@ -155,11 +163,21 @@ future<> migration_manager::maybe_schedule_schema_pull(const utils::UUID& their_
{
auto& proxy = get_local_storage_proxy();
auto& db = proxy.get_db().local();
auto& ss = get_storage_service().local();
if (db.get_version() == their_version || !should_pull_schema_from(endpoint)) {
mlogger.debug("Not pulling schema because versions match or shouldPullSchemaFrom returned false");
return make_ready_future<>();
}
// Disable pulls during rolling upgrade from 1.7 to 2.0 to avoid
// schema version inconsistency. See https://github.com/scylladb/scylla/issues/2802.
if (!ss.cluster_supports_schema_tables_v3()) {
mlogger.debug("Delaying pull with {} until cluster upgrade is complete", endpoint);
return ss.cluster_supports_schema_tables_v3().when_enabled().then([this, their_version, endpoint] {
return maybe_schedule_schema_pull(their_version, endpoint);
});
}
if (db.get_version() == database::empty_version || runtime::get_uptime() < migration_delay) {
// If we think we may be bootstrapping or have recently started, submit MigrationTask immediately
mlogger.debug("Submitting migration task for {}", endpoint);
@@ -220,15 +238,18 @@ future<> migration_manager::merge_schema_from(netw::messaging_service::msg_addr
});
}
bool migration_manager::should_pull_schema_from(const gms::inet_address& endpoint)
{
/*
* Don't request schema from nodes with a different or unknown major version (may have incompatible schema)
* Don't request schema from fat clients
*/
auto& ms = netw::get_local_messaging_service();
return ms.knows_version(endpoint)
&& ms.get_raw_version(endpoint) == netw::messaging_service::current_version
bool migration_manager::has_compatible_schema_tables_version(const gms::inet_address& endpoint) {
auto& gossiper = gms::get_local_gossiper();
auto ep_state = gossiper.get_endpoint_state_for_endpoint(endpoint);
if (!ep_state) {
return false;
}
auto&& version_opt = ep_state->get_application_state(gms::application_state::SCHEMA_TABLES_VERSION);
return version_opt && version_opt->value == db::schema_tables::version;
}
bool migration_manager::should_pull_schema_from(const gms::inet_address& endpoint) {
return has_compatible_schema_tables_version(endpoint)
&& !gms::get_local_gossiper().is_gossip_only_member(endpoint);
}
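The headline fix in `maybe_schedule_schema_pull()` above is that a pull abandoned because `SCHEMA_TABLES_V3` is not yet cluster-wide is retried via `when_enabled()` once the feature flips on. A seastar-free sketch of that pattern, with `feature_sketch` as an illustrative stand-in for `gms::feature` and a callback list standing in for the promise/future machinery:

```cpp
#include <functional>
#include <vector>

// Sketch of the retry pattern above: work gated on a cluster feature is
// queued through when_enabled() and runs once the feature turns on, so
// a deferred schema pull is retried instead of being lost until the
// next schema change.
struct feature_sketch {
    bool enabled = false;
    std::vector<std::function<void()>> waiters;

    void when_enabled(std::function<void()> action) {
        if (enabled) {
            action();                              // feature already on: run now
        } else {
            waiters.push_back(std::move(action));  // defer until enable()
        }
    }

    void enable() {
        enabled = true;
        auto pending = std::move(waiters);
        waiters.clear();
        for (auto& action : pending) {
            action();                              // run everything deferred
        }
    }
};
```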


@@ -94,6 +94,7 @@ public:
future<> notify_drop_view(const view_ptr& view);
bool should_pull_schema_from(const gms::inet_address& endpoint);
bool has_compatible_schema_tables_version(const gms::inet_address& endpoint);
future<> announce_keyspace_update(lw_shared_ptr<keyspace_metadata> ksm, bool announce_locally = false);


@@ -127,7 +127,7 @@ future<lowres_clock::duration> cache_hitrate_calculator::recalculate_hitrates()
return boost::copy_range<std::unordered_map<utils::UUID, stat>>(db.get_column_families() | boost::adaptors::filtered(non_system_filter) |
boost::adaptors::transformed([] (const std::pair<utils::UUID, lw_shared_ptr<column_family>>& cf) {
auto& stats = cf.second->get_row_cache().stats();
return std::make_pair(cf.first, stat{float(stats.hits.rate().rates[0]), float(stats.misses.rate().rates[0])});
return std::make_pair(cf.first, stat{float(stats.reads_with_no_misses.rate().rates[0]), float(stats.reads_with_misses.rate().rates[0])});
}));
};


@@ -2075,6 +2075,7 @@ private:
break;
}
}
assert(last_partition);
return get_last_row(s, *last_partition, is_reversed);
}
@@ -2300,6 +2301,10 @@ public:
v.emplace_back(r.from, stdx::optional<partition>(), r.reached_end, true);
}
}
boost::sort(v, [] (const version& x, const version& y) {
return x.from < y.from;
});
} while(true);
std::vector<mutation_and_live_row_count> reconciled_partitions;
@@ -2308,7 +2313,10 @@ public:
// reconcile all versions
boost::range::transform(boost::make_iterator_range(versions.begin(), versions.end()), std::back_inserter(reconciled_partitions),
[this, schema, original_per_partition_limit] (std::vector<version>& v) {
auto m = boost::accumulate(v, mutation(v.front().par->mut().key(*schema), schema), [this, schema] (mutation& m, const version& ver) {
auto it = boost::range::find_if(v, [] (auto&& ver) {
return bool(ver.par);
});
auto m = boost::accumulate(v, mutation(it->par->mut().key(*schema), schema), [this, schema] (mutation& m, const version& ver) {
if (ver.par) {
m.partition().apply(*schema, ver.par->mut().partition(), *schema);
}
@@ -2519,8 +2527,9 @@ protected:
virtual future<> make_requests(digest_resolver_ptr resolver, clock_type::time_point timeout) {
resolver->add_wait_targets(_targets.size());
auto want_digest = _targets.size() > 1;
return when_all(make_data_requests(resolver, _targets.begin(), _targets.begin() + 1, timeout, want_digest),
make_digest_requests(resolver, _targets.begin() + 1, _targets.end(), timeout)).discard_result();
auto f_data = futurize_apply([&] { return make_data_requests(resolver, _targets.begin(), _targets.begin() + 1, timeout, want_digest); });
auto f_digest = futurize_apply([&] { return make_digest_requests(resolver, _targets.begin() + 1, _targets.end(), timeout); });
return when_all_succeed(std::move(f_data), std::move(f_digest)).handle_exception([] (auto&&) { });
}
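The point of wrapping both request factories in `futurize_apply` above is that a synchronous throw from one becomes a failed future rather than an exception that abandons the sibling future mid-flight. A seastar-free sketch of that conversion, with `sync_result` as an illustrative stand-in for `seastar::future`:

```cpp
#include <exception>
#include <functional>

// Sketch of the change above: the factory is invoked inside a wrapper
// that captures a synchronous throw into the result object, so the
// caller always gets something it can own and wait on.
struct sync_result {
    std::exception_ptr ex;              // set when the call threw
    bool failed() const { return bool(ex); }
};

sync_result futurize_apply_sketch(const std::function<void()>& f) {
    try {
        f();
        return {};
    } catch (...) {
        return { std::current_exception() };
    }
}
```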
virtual void got_cl() {}
uint32_t original_row_limit() const {


@@ -85,6 +85,8 @@ static const sstring LARGE_PARTITIONS_FEATURE = "LARGE_PARTITIONS";
static const sstring MATERIALIZED_VIEWS_FEATURE = "MATERIALIZED_VIEWS";
static const sstring COUNTERS_FEATURE = "COUNTERS";
static const sstring INDEXES_FEATURE = "INDEXES";
static const sstring CORRECT_COUNTER_ORDER_FEATURE = "CORRECT_COUNTER_ORDER";
static const sstring SCHEMA_TABLES_V3 = "SCHEMA_TABLES_V3";
distributed<storage_service> _the_storage_service;
@@ -125,6 +127,8 @@ sstring storage_service::get_config_supported_features() {
RANGE_TOMBSTONES_FEATURE,
LARGE_PARTITIONS_FEATURE,
COUNTERS_FEATURE,
CORRECT_COUNTER_ORDER_FEATURE,
SCHEMA_TABLES_V3
};
if (service::get_local_storage_service()._db.local().get_config().experimental()) {
features.push_back(MATERIALIZED_VIEWS_FEATURE);
@@ -301,6 +305,7 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
app_states.emplace(gms::application_state::RELEASE_VERSION, value_factory.release_version());
app_states.emplace(gms::application_state::SUPPORTED_FEATURES, value_factory.supported_features(features));
app_states.emplace(gms::application_state::CACHE_HITRATES, value_factory.cache_hitrates(""));
+app_states.emplace(gms::application_state::SCHEMA_TABLES_VERSION, versioned_value(db::schema_tables::version));
slogger.info("Starting up server gossip");
auto& gossiper = gms::get_local_gossiper();
@@ -314,6 +319,9 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
auto& proxy = service::get_storage_proxy();
// gossip Schema.emptyVersion forcing immediate check for schema updates (see MigrationManager#maybeScheduleSchemaPull)
update_schema_version_and_announce(proxy).get();// Ensure we know our own actual Schema UUID in preparation for updates
+get_storage_service().invoke_on_all([] (auto& ss) {
+ss.register_features();
+}).get();
#if 0
if (!MessagingService.instance().isListening())
MessagingService.instance().listen(FBUtilities.getLocalAddress());
@@ -324,6 +332,19 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
#endif
}
+void storage_service::register_features() {
+_range_tombstones_feature = gms::feature(RANGE_TOMBSTONES_FEATURE);
+_large_partitions_feature = gms::feature(LARGE_PARTITIONS_FEATURE);
+_counters_feature = gms::feature(COUNTERS_FEATURE);
+_correct_counter_order_feature = gms::feature(CORRECT_COUNTER_ORDER_FEATURE);
+_schema_tables_v3 = gms::feature(SCHEMA_TABLES_V3);
+if (_db.local().get_config().experimental()) {
+_materialized_views_feature = gms::feature(MATERIALIZED_VIEWS_FEATURE);
+_indexes_feature = gms::feature(INDEXES_FEATURE);
+}
+}
// Runs inside seastar::async context
void storage_service::join_token_ring(int delay) {
// This function only gets called on shard 0, but we want to set _joined
@@ -479,16 +500,6 @@ void storage_service::join_token_ring(int delay) {
#endif
if (!_is_survey_mode) {
-// We have to create the system_auth and system_traces keyspaces and
-// their tables before Node moves to the NORMAL state so that other
-// Nodes joining the newly created cluster and serializing on this event
-// "see" these new objects and don't try to create them.
-//
-// Otherwise there is a high chance to hit the issue #420.
-auth::auth::setup().get();
-supervisor::notify("starting tracing");
-tracing::tracing::start_tracing().get();
// start participating in the ring.
db::system_keyspace::set_bootstrap_state(db::system_keyspace::bootstrap_state::COMPLETED).get();
set_tokens(_bootstrap_tokens);
@@ -504,6 +515,9 @@ void storage_service::join_token_ring(int delay) {
slogger.error(err.c_str());
throw std::runtime_error(err);
}
+auth::auth::setup().get();
+supervisor::notify("starting tracing");
+tracing::tracing::start_tracing().get();
} else {
slogger.info("Startup complete, but write survey mode is active, not becoming an active ring member. Use JMX (StorageService->joinRing()) to finalize ring joining.");
}
@@ -1350,17 +1364,6 @@ future<> storage_service::init_server(int delay) {
}
slogger.info("Not joining ring as requested. Use JMX (StorageService->joinRing()) to initiate ring joining");
}
-get_storage_service().invoke_on_all([] (auto& ss) {
-ss._range_tombstones_feature = gms::feature(RANGE_TOMBSTONES_FEATURE);
-ss._large_partitions_feature = gms::feature(LARGE_PARTITIONS_FEATURE);
-ss._counters_feature = gms::feature(COUNTERS_FEATURE);
-if (ss._db.local().get_config().experimental()) {
-ss._materialized_views_feature = gms::feature(MATERIALIZED_VIEWS_FEATURE);
-ss._indexes_feature = gms::feature(INDEXES_FEATURE);
-}
-}).get();
});
}
@@ -2269,7 +2272,7 @@ void storage_service::flush_column_families() {
auto& local_db = ss.db().local();
auto non_system_cfs = local_db.get_column_families() | boost::adaptors::filtered([] (auto& uuid_and_cf) {
auto cf = uuid_and_cf.second;
-return cf->schema()->ks_name() != db::system_keyspace::NAME;
+return !is_system_keyspace(cf->schema()->ks_name());
});
// count CFs first
auto total_cfs = boost::distance(non_system_cfs);
@@ -2289,7 +2292,7 @@ void storage_service::flush_column_families() {
auto& local_db = ss.db().local();
auto system_cfs = local_db.get_column_families() | boost::adaptors::filtered([] (auto& uuid_and_cf) {
auto cf = uuid_and_cf.second;
-return cf->schema()->ks_name() == db::system_keyspace::NAME;
+return is_system_keyspace(cf->schema()->ks_name());
});
return parallel_for_each(system_cfs, [&ss] (auto&& uuid_and_cf) {
auto cf = uuid_and_cf.second;

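The `gms::feature` objects registered in the hunks above behave as cluster-wide flags: default-constructed and disabled until the gossiper sees every node advertise support, after which guards like `cluster_supports_counters()` reduce to `bool(_feature)`. A minimal local analog of that shape (illustrative only, not the real gms API):

```cpp
#include <cassert>
#include <string>
#include <utility>

// Named flag that starts disabled; in the real system the gossiper calls
// enable() once every node in the cluster advertises the feature name.
class feature_flag {
    std::string _name;
    bool _enabled = false;
public:
    feature_flag() = default;
    explicit feature_flag(std::string name) : _name(std::move(name)) {}
    void enable() { _enabled = true; }
    explicit operator bool() const { return _enabled; }
    const std::string& name() const { return _name; }
};
```

Code paths then check the flag before emitting a format or protocol element that older nodes could not parse.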
View File

@@ -263,7 +263,8 @@ private:
gms::feature _materialized_views_feature;
gms::feature _counters_feature;
gms::feature _indexes_feature;
gms::feature _correct_counter_order_feature;
gms::feature _schema_tables_v3;
public:
void enable_all_features() {
_range_tombstones_feature.enable();
@@ -271,6 +272,8 @@ public:
_materialized_views_feature.enable();
_counters_feature.enable();
_indexes_feature.enable();
+_correct_counter_order_feature.enable();
+_schema_tables_v3.enable();
}
void finish_bootstrapping() {
@@ -405,6 +408,7 @@ public:
private:
bool should_bootstrap();
void prepare_to_join(std::vector<inet_address> loaded_endpoints);
+void register_features();
void join_token_ring(int delay);
public:
future<> join_ring();
@@ -2236,6 +2240,14 @@ public:
bool cluster_supports_indexes() const {
return bool(_indexes_feature);
}
+bool cluster_supports_correct_counter_order() const {
+return bool(_correct_counter_order_feature);
+}
+const gms::feature& cluster_supports_schema_tables_v3() const {
+return _schema_tables_v3;
+}
};
inline future<> init_storage_service(distributed<database>& db) {

View File

@@ -174,12 +174,14 @@ protected:
uint64_t _estimated_partitions = 0;
std::vector<unsigned long> _ancestors;
db::replay_position _rp;
+seastar::thread_scheduling_group* _tsg;
protected:
-compaction(column_family& cf, std::vector<shared_sstable> sstables, uint64_t max_sstable_size, uint32_t sstable_level)
+compaction(column_family& cf, std::vector<shared_sstable> sstables, uint64_t max_sstable_size, uint32_t sstable_level, seastar::thread_scheduling_group* tsg)
: _cf(cf)
, _sstables(std::move(sstables))
, _max_sstable_size(max_sstable_size)
, _sstable_level(sstable_level)
+, _tsg(tsg)
{
_cf.get_compaction_manager().register_compaction(_info);
}
@@ -211,6 +213,12 @@ public:
virtual ~compaction() {
_cf.get_compaction_manager().deregister_compaction(_info);
}
+seastar::thread_attributes thread_attributes() {
+seastar::thread_attributes attr;
+attr.scheduling_group = _tsg;
+return attr;
+}
private:
::mutation_reader setup() {
std::vector<::mutation_reader> readers;
@@ -339,8 +347,8 @@ class regular_compaction : public compaction {
stdx::optional<sstable_writer> _writer;
public:
regular_compaction(column_family& cf, std::vector<shared_sstable> sstables, std::function<shared_sstable()> creator,
-uint64_t max_sstable_size, uint32_t sstable_level)
-: compaction(cf, std::move(sstables), max_sstable_size, sstable_level)
+uint64_t max_sstable_size, uint32_t sstable_level, seastar::thread_scheduling_group* tsg)
+: compaction(cf, std::move(sstables), max_sstable_size, sstable_level, tsg)
, _creator(std::move(creator))
, _set(cf.get_sstable_set())
, _selector(_set.make_incremental_selector())
@@ -407,8 +415,8 @@ public:
class cleanup_compaction final : public regular_compaction {
public:
cleanup_compaction(column_family& cf, std::vector<shared_sstable> sstables, std::function<shared_sstable()> creator,
-uint64_t max_sstable_size, uint32_t sstable_level)
-: regular_compaction(cf, std::move(sstables), std::move(creator), max_sstable_size, sstable_level)
+uint64_t max_sstable_size, uint32_t sstable_level, seastar::thread_scheduling_group* tsg)
+: regular_compaction(cf, std::move(sstables), std::move(creator), max_sstable_size, sstable_level, tsg)
{
_info->type = compaction_type::Cleanup;
}
@@ -444,8 +452,8 @@ class resharding_compaction final : public compaction {
std::function<shared_sstable(shard_id)> _sstable_creator;
public:
resharding_compaction(std::vector<shared_sstable> sstables, column_family& cf, std::function<shared_sstable(shard_id)> creator,
-uint64_t max_sstable_size, uint32_t sstable_level)
-: compaction(cf, std::move(sstables), max_sstable_size, sstable_level)
+uint64_t max_sstable_size, uint32_t sstable_level, seastar::thread_scheduling_group* tsg)
+: compaction(cf, std::move(sstables), max_sstable_size, sstable_level, tsg)
, _output_sstables(smp::count)
, _sstable_creator(std::move(creator))
{
@@ -494,7 +502,8 @@ public:
};
future<std::vector<shared_sstable>> compaction::run(std::unique_ptr<compaction> c) {
-return seastar::async([c = std::move(c)] () mutable {
+auto attr = c->thread_attributes();
+return seastar::async(std::move(attr), [c = std::move(c)] () mutable {
auto reader = c->setup();
auto cr = c->get_compacting_sstable_writer();
@@ -527,21 +536,21 @@ static std::unique_ptr<compaction> make_compaction(bool cleanup, Params&&... par
future<std::vector<shared_sstable>>
compact_sstables(std::vector<shared_sstable> sstables, column_family& cf, std::function<shared_sstable()> creator,
-uint64_t max_sstable_size, uint32_t sstable_level, bool cleanup) {
+uint64_t max_sstable_size, uint32_t sstable_level, bool cleanup, seastar::thread_scheduling_group *tsg) {
if (sstables.empty()) {
throw std::runtime_error(sprint("Called compaction with empty set on behalf of {}.{}", cf.schema()->ks_name(), cf.schema()->cf_name()));
}
-auto c = make_compaction(cleanup, cf, std::move(sstables), std::move(creator), max_sstable_size, sstable_level);
+auto c = make_compaction(cleanup, cf, std::move(sstables), std::move(creator), max_sstable_size, sstable_level, tsg);
return compaction::run(std::move(c));
}
future<std::vector<shared_sstable>>
reshard_sstables(std::vector<shared_sstable> sstables, column_family& cf, std::function<shared_sstable(shard_id)> creator,
-uint64_t max_sstable_size, uint32_t sstable_level) {
+uint64_t max_sstable_size, uint32_t sstable_level, seastar::thread_scheduling_group* tsg) {
if (sstables.empty()) {
throw std::runtime_error(sprint("Called resharding with empty set on behalf of {}.{}", cf.schema()->ks_name(), cf.schema()->cf_name()));
}
-auto c = std::make_unique<resharding_compaction>(std::move(sstables), cf, std::move(creator), max_sstable_size, sstable_level);
+auto c = std::make_unique<resharding_compaction>(std::move(sstables), cf, std::move(creator), max_sstable_size, sstable_level, tsg);
return compaction::run(std::move(c));
}

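In `compaction::run` above, the new code reads `c->thread_attributes()` into a local *before* moving `c` into the lambda handed to `seastar::async`. The ordering matters: if the read and the move sat in the same call expression, the unspecified argument evaluation order could dereference a moved-from pointer. A small standard-C++ sketch of the same pattern (the `task` type and its fields are hypothetical):

```cpp
#include <cassert>
#include <memory>
#include <string>

// Stand-in for a compaction object carrying its own scheduling attributes.
struct task {
    std::string attrs;
    int work() const { return 42; }
};

int run(std::unique_ptr<task> t) {
    auto attr = t->attrs;                     // read attributes BEFORE the move
    auto closure = [t = std::move(t)] { return t->work(); };
    (void)attr;                               // would configure the worker thread here
    return closure();
}
```

Hoisting the read onto its own statement makes the sequencing explicit and safe.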
View File

@@ -112,13 +112,15 @@ namespace sstables {
// cleaning operation, and compaction history will not be updated.
future<std::vector<shared_sstable>> compact_sstables(std::vector<shared_sstable> sstables,
column_family& cf, std::function<shared_sstable()> creator,
-uint64_t max_sstable_size, uint32_t sstable_level, bool cleanup = false);
+uint64_t max_sstable_size, uint32_t sstable_level, bool cleanup = false,
+seastar::thread_scheduling_group* tsg = nullptr);
// Compacts a set of N shared sstables into M sstables. For every shard involved,
// i.e. which owns any of the sstables, a new unshared sstable is created.
future<std::vector<shared_sstable>> reshard_sstables(std::vector<shared_sstable> sstables,
column_family& cf, std::function<shared_sstable(shard_id)> creator,
-uint64_t max_sstable_size, uint32_t sstable_level);
+uint64_t max_sstable_size, uint32_t sstable_level,
+seastar::thread_scheduling_group* tsg = nullptr);
// Return the most interesting bucket applying the size-tiered strategy.
std::vector<sstables::shared_sstable>

View File

@@ -316,6 +316,10 @@ public:
return _stream_position;
}
+bool eof() const {
+return _remain == 0;
+}
future<> close() {
return _input.close();
}

View File

@@ -148,19 +148,19 @@ public:
index_comparator(const schema& s) : _tri_cmp(s) {}
bool operator()(const summary_entry& e, dht::ring_position_view rp) const {
-return _tri_cmp(e.get_key(), rp) < 0;
+return _tri_cmp(e.get_decorated_key(), rp) < 0;
}
bool operator()(const index_entry& e, dht::ring_position_view rp) const {
-return _tri_cmp(e.get_key(), rp) < 0;
+return _tri_cmp(e.get_decorated_key(), rp) < 0;
}
bool operator()(dht::ring_position_view rp, const summary_entry& e) const {
-return _tri_cmp(e.get_key(), rp) > 0;
+return _tri_cmp(e.get_decorated_key(), rp) > 0;
}
bool operator()(dht::ring_position_view rp, const index_entry& e) const {
-return _tri_cmp(e.get_key(), rp) > 0;
+return _tri_cmp(e.get_decorated_key(), rp) > 0;
}
};
@@ -497,8 +497,8 @@ public:
return make_ready_future<bool>(false);
}
return read_partition_data().then([this, key] {
-dht::ring_position_comparator cmp(*_sstable->_schema);
-return cmp(key, partition_key()) == 0;
+index_comparator cmp(*_sstable->_schema);
+return cmp(key, current_partition_entry()) == 0;
});
});
}

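The `index_comparator` above supplies both argument orders of `operator()` so the standard binary-search algorithms can compare stored entries directly against a key of a different type (`dht::ring_position_view`). The same heterogeneous-comparator shape reduced to standard C++, with stand-in types:

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// Entry type as stored in the index; the search key is a plain int here.
struct entry { int token; };

// Both overloads are required: std::lower_bound calls cmp(element, key),
// std::upper_bound calls cmp(key, element).
struct index_cmp {
    bool operator()(const entry& e, int key) const { return e.token < key; }
    bool operator()(int key, const entry& e) const { return key < e.token; }
};

int lower_bound_index(const std::vector<entry>& v, int key) {
    return int(std::lower_bound(v.begin(), v.end(), key, index_cmp{}) - v.begin());
}
```

This avoids materializing a full `entry` (or, in the diff, a full partition key) just to run a lookup.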
View File

@@ -26,6 +26,7 @@
#include "database_fwd.hh"
#include "keys.hh"
#include "compound_compat.hh"
+#include "dht/i_partitioner.hh"
namespace sstables {
@@ -35,12 +36,12 @@ public:
explicit key_view(bytes_view b) : _bytes(b) {}
key_view() : _bytes() {}
-std::vector<bytes> explode(const schema& s) const {
+std::vector<bytes_view> explode(const schema& s) const {
return composite_view(_bytes, s.partition_key_size() > 1).explode();
}
partition_key to_partition_key(const schema& s) const {
-return partition_key::from_exploded(s, explode(s));
+return partition_key::from_exploded_view(explode(s));
}
bool operator==(const key_view& k) const { return k._bytes == _bytes; }
@@ -105,10 +106,10 @@ public:
return make_key(s, pk);
}
partition_key to_partition_key(const schema& s) const {
-return partition_key::from_exploded(s, explode(s));
+return partition_key::from_exploded_view(explode(s));
}
-std::vector<bytes> explode(const schema& s) const {
+std::vector<bytes_view> explode(const schema& s) const {
return composite_view(_bytes, is_compound(s)).explode();
}
@@ -142,4 +143,20 @@ inline key maximum_key() {
return key(key::kind::after_all_keys);
};
+class decorated_key_view {
+const dht::token& _token;
+key_view _partition_key;
+public:
+decorated_key_view(const dht::token& token, key_view partition_key) noexcept
+: _token(token), _partition_key(partition_key) { }
+const dht::token& token() const {
+return _token;
+}
+key_view key() const {
+return _partition_key;
+}
+};
}

View File

@@ -37,10 +37,10 @@
namespace sstables {
-static inline bytes pop_back(std::vector<bytes>& vec) {
+static inline bytes_view pop_back(std::vector<bytes_view>& vec) {
auto b = std::move(vec.back());
vec.pop_back();
-return std::move(b);
+return b;
}
class sstable_streamed_mutation;
@@ -104,11 +104,11 @@ public:
struct column {
bool is_static;
bytes_view col_name;
-std::vector<bytes> clustering;
+std::vector<bytes_view> clustering;
// see is_collection. collections have an extra element aside from the name.
// This will be non-zero size if this is a collection, and zero size otherwise.
-bytes collection_extra_data;
-bytes cell;
+bytes_view collection_extra_data;
+bytes_view cell;
const column_definition *cdef;
bool is_present;
@@ -148,7 +148,7 @@ public:
return col;
}
-std::vector<bytes> extract_clustering_key(const schema& schema) {
+std::vector<bytes_view> extract_clustering_key(const schema& schema) {
return composite_view(col_name, schema.is_compound()).explode();
}
column(const schema& schema, bytes_view col, api::timestamp_type timestamp)
@@ -157,7 +157,7 @@ public:
, clustering(extract_clustering_key(schema))
, collection_extra_data(is_collection(schema) ? pop_back(clustering) : bytes()) // collections are not supported with COMPACT STORAGE, so this is fine
, cell(!schema.is_dense() ? pop_back(clustering) : (*(schema.regular_begin())).name()) // dense: cell name is not provided. It is the only regular column
-, cdef(schema.get_column_definition(cell))
+, cdef(schema.get_column_definition(to_bytes(cell)))
, is_present(cdef && timestamp > cdef->dropped_at())
{
@@ -168,12 +168,6 @@ public:
}
}
}
-// See schema::prepare_dense_schema. We can, using v3 schemas, have columns we consider "static" without
-// the table being compound, i.e. no clustering. We can ignore prefixes, but we still need to produce
-// static for mutations.
-if (cdef && !is_static && cdef->is_static() && schema.is_static_compact_table()) {
-is_static = true;
-}
if (is_present && is_static != cdef->is_static()) {
throw malformed_sstable_exception(seastar::format("Mismatch between {} cell and {} column definition",
is_static ? "static" : "non-static", cdef->is_static() ? "static" : "non-static"));
@@ -227,7 +221,7 @@ private:
if (!_pending_collection || _pending_collection->is_new_collection(cdef)) {
flush_pending_collection(*_schema);
-if (!cdef->type->is_multi_cell()) {
+if (!cdef->is_multi_cell()) {
throw malformed_sstable_exception("frozen set should behave like a cell\n");
}
_pending_collection = collection_mutation(cdef);
@@ -403,12 +397,12 @@ public:
return ret;
}
-proceed flush_if_needed(bool is_static, const exploded_clustering_prefix& ecp) {
+proceed flush_if_needed(bool is_static, const std::vector<bytes_view>& ecp) {
auto pos = [&] {
if (is_static) {
return position_in_partition(position_in_partition::static_row_tag_t());
} else {
-auto ck = clustering_key_prefix::from_clustering_prefix(*_schema, ecp);
+auto ck = clustering_key_prefix::from_exploded_view(ecp);
return position_in_partition(position_in_partition::clustering_row_tag_t(), std::move(ck));
}
}();
@@ -444,8 +438,9 @@ public:
auto id_lo = in.read<int64_t>();
auto clock = in.read<int64_t>();
auto value = in.read<int64_t>();
-ccb.add_shard(counter_shard(counter_id(utils::UUID(id_hi, id_lo)), value, clock));
+ccb.add_maybe_unsorted_shard(counter_shard(counter_id(utils::UUID(id_hi, id_lo)), value, clock));
}
+ccb.sort_and_remove_duplicates();
return ccb.build(timestamp);
}
@@ -460,8 +455,7 @@ public:
struct column col(*_schema, col_name, timestamp);
-auto clustering_prefix = exploded_clustering_prefix(std::move(col.clustering));
-auto ret = flush_if_needed(col.is_static, clustering_prefix);
+auto ret = flush_if_needed(col.is_static, col.clustering);
if (_skip_in_progress) {
return ret;
}
@@ -506,11 +500,11 @@ public:
auto ac = make_atomic_cell(timestamp, value, ttl, expiration);
bool is_multi_cell = col.collection_extra_data.size();
-if (is_multi_cell != col.cdef->type->is_multi_cell()) {
+if (is_multi_cell != col.cdef->is_multi_cell()) {
return;
}
if (is_multi_cell) {
-update_pending_collection(col.cdef, std::move(col.collection_extra_data), std::move(ac));
+update_pending_collection(col.cdef, to_bytes(col.collection_extra_data), std::move(ac));
return;
}
@@ -535,8 +529,7 @@ public:
}
proceed consume_deleted_cell(column &col, int64_t timestamp, gc_clock::time_point ttl) {
-auto clustering_prefix = exploded_clustering_prefix(std::move(col.clustering));
-auto ret = flush_if_needed(col.is_static, clustering_prefix);
+auto ret = flush_if_needed(col.is_static, col.clustering);
if (_skip_in_progress) {
return ret;
}
@@ -553,12 +546,12 @@ public:
auto ac = atomic_cell::make_dead(timestamp, ttl);
bool is_multi_cell = col.collection_extra_data.size();
-if (is_multi_cell != col.cdef->type->is_multi_cell()) {
+if (is_multi_cell != col.cdef->is_multi_cell()) {
return ret;
}
if (is_multi_cell) {
-update_pending_collection(col.cdef, std::move(col.collection_extra_data), std::move(ac));
+update_pending_collection(col.cdef, to_bytes(col.collection_extra_data), std::move(ac));
} else if (col.is_static) {
_in_progress->as_mutable_static_row().set_cell(*col.cdef, atomic_cell_or_collection(std::move(ac)));
} else {
@@ -580,7 +573,7 @@ public:
return proceed::yes;
}
auto key = composite_view(column::fix_static_name(*_schema, col_name)).explode();
-auto ck = clustering_key_prefix::from_exploded(std::move(key));
+auto ck = clustering_key_prefix::from_exploded_view(key);
auto ret = flush_if_needed(std::move(ck));
if (!_skip_in_progress) {
_in_progress->as_mutable_clustering_row().apply(shadowable_tombstone(tombstone(deltime)));
@@ -636,9 +629,9 @@ public:
// Still, it is enough to check if we're dealing with a collection, since any other tombstone
// won't have a full clustering prefix (otherwise it isn't a range)
if (start.size() <= _schema->clustering_key_size()) {
-auto start_ck = clustering_key_prefix::from_exploded(std::move(start));
+auto start_ck = clustering_key_prefix::from_exploded_view(start);
auto start_kind = start_marker_to_bound_kind(start_col);
-auto end = clustering_key_prefix::from_exploded(composite_view(column::fix_static_name(*_schema, end_col)).explode());
+auto end = clustering_key_prefix::from_exploded_view(composite_view(column::fix_static_name(*_schema, end_col)).explode());
auto end_kind = end_marker_to_bound_kind(end_col);
if (range_tombstone::is_single_clustering_row_tombstone(*_schema, start_ck, start_kind, end, end_kind)) {
auto ret = flush_if_needed(std::move(start_ck));
@@ -664,9 +657,9 @@ public:
}
} else {
auto&& column = pop_back(start);
-auto cdef = _schema->get_column_definition(column);
-if (cdef && cdef->type->is_multi_cell() && deltime.marked_for_delete_at > cdef->dropped_at()) {
-auto ret = flush_if_needed(cdef->is_static(), exploded_clustering_prefix(std::move(start)));
+auto cdef = _schema->get_column_definition(to_bytes(column));
+if (cdef && cdef->is_multi_cell() && deltime.marked_for_delete_at > cdef->dropped_at()) {
+auto ret = flush_if_needed(cdef->is_static(), start);
if (!_skip_in_progress) {
update_pending_collection(cdef, tombstone(deltime));
}
@@ -841,12 +834,14 @@ public:
sstable_streamed_mutation(sstable_streamed_mutation&&) = delete;
virtual future<> fill_buffer() final override {
-_ds->_consumer.push_ready_fragments();
-if (is_buffer_full() || is_end_of_stream()) {
-return make_ready_future<>();
-}
-return _ds->_consumer.maybe_skip().then([this] {
-return _ds->_context.read();
-});
+return do_until([this] { return !is_buffer_empty() || is_end_of_stream(); }, [this] {
+_ds->_consumer.push_ready_fragments();
+if (is_buffer_full() || is_end_of_stream()) {
+return make_ready_future<>();
+}
+return _ds->_consumer.maybe_skip().then([this] {
+return _ds->_context.read();
+});
+});
}
@@ -1155,6 +1150,10 @@ future<> sstable_data_source::advance_to_next_partition() {
future<streamed_mutation_opt> sstable_data_source::read_next_partition() {
sstlog.trace("reader {}: read next partition", this);
+if (!_read_enabled) {
+sstlog.trace("reader {}: eof", this);
+return make_ready_future<streamed_mutation_opt>();
+}
return advance_to_next_partition().then([this] {
return read_partition();
});
@@ -1179,7 +1178,7 @@ future<streamed_mutation_opt> sstable_data_source::read_partition() {
// need to use the index anyway soon.
//
if (_index_in_current_partition) {
-if (_lh_index->eof()) {
+if (_context.eof()) {
sstlog.trace("reader {}: eof", this);
return make_ready_future<streamed_mutation_opt>(stdx::nullopt);
}

View File

@@ -422,6 +422,9 @@ public:
_ctx->reset(el);
return _ctx->skip_to(begin);
}
+bool eof() const {
+return _ctx->eof();
+}
};
data_consume_context::~data_consume_context() = default;
@@ -442,6 +445,9 @@ future<> data_consume_context::fast_forward_to(uint64_t begin, uint64_t end) {
future<> data_consume_context::skip_to(indexable_element el, uint64_t begin) {
return _pimpl->skip_to(el, begin);
}
+bool data_consume_context::eof() const {
+return _pimpl->eof();
+}
data_consume_context sstable::data_consume_rows(
row_consumer& consumer, sstable::disk_read_range toread, uint64_t last_end) {

View File

@@ -96,28 +96,34 @@ public:
future<list_ptr> get_or_load(key_type key, Loader&& loader) {
auto i = _lists.find(key);
lw_shared_ptr<entry> e;
-if (i != _lists.end()) {
-e = i->second->shared_from_this();
-} else {
-++_shard_stats.misses;
-e = make_lw_shared<entry>(*this, key);
-auto res = _lists.emplace(key, e.get());
-assert(res.second);
-loader(key).then_wrapped([e](future<index_list>&& f) mutable {
-if (f.failed()) {
-e->loaded.set_exception(f.get_exception());
-} else {
-e->list = f.get0();
-e->loaded.set_value();
-}
-});
-}
-future<> f = e->loaded.get_shared_future();
+auto f = [&] {
+if (i != _lists.end()) {
+e = i->second->shared_from_this();
+return e->loaded.get_shared_future();
+} else {
+++_shard_stats.misses;
+e = make_lw_shared<entry>(*this, key);
+auto f = e->loaded.get_shared_future();
+auto res = _lists.emplace(key, e.get());
+assert(res.second);
+futurize_apply(loader, key).then_wrapped([e](future<index_list>&& f) mutable {
+if (f.failed()) {
+e->loaded.set_exception(f.get_exception());
+} else {
+e->list = f.get0();
+e->loaded.set_value();
+}
+});
+return f;
+}
+}();
if (!f.available()) {
++_shard_stats.blocks;
return f.then([e]() mutable {
return list_ptr(std::move(e));
});
} else if (f.failed()) {
return make_exception_future<list_ptr>(std::move(f).get_exception());
} else {
++_shard_stats.hits;
return make_ready_future<list_ptr>(list_ptr(std::move(e)));

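The `get_or_load` rewrite above takes the entry's shared future *before* the loader runs (via `futurize_apply`), so a loader that completes or fails synchronously cannot invalidate the entry out from under the caller, and hits, misses, and blocked waits stay correctly counted. A much-simplified synchronous sketch of the get-or-load shape (hypothetical names; it omits the futures and the race the real code guards against):

```cpp
#include <cassert>
#include <functional>
#include <string>
#include <unordered_map>

// Memoizing cache: look the key up first, run the loader only on a miss,
// and count hits/misses the way the shard stats in the diff do.
struct cache {
    std::unordered_map<int, std::string> entries;
    int misses = 0, hits = 0;

    std::string get_or_load(int key, const std::function<std::string(int)>& loader) {
        auto i = entries.find(key);
        if (i != entries.end()) {
            ++hits;
            return i->second;
        }
        ++misses;
        auto res = entries.emplace(key, loader(key));
        return res.first->second;
    }
};
```

In the real asynchronous version the cached value is a shared future, which is why the order of "grab the future" versus "start the loader" matters at all.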
View File

@@ -649,6 +649,7 @@ future<> parse(random_access_reader& in, summary& s) {
buf.trim_front(keysize);
// FIXME: This is a le read. We should make this explicit
entry.position = *(reinterpret_cast<const net::packed<uint64_t> *>(buf.get()));
+entry.token = dht::global_partitioner().get_token(entry.get_key());
return make_ready_future<>();
});
@@ -987,7 +988,7 @@ future<> sstable::read_simple(T& component, const io_priority_class& pc) {
auto f = make_checked_file(_read_error_handler, fi);
auto r = make_lw_shared<file_random_access_reader>(std::move(f), size, sstable_buffer_size);
auto fut = parse(*r, component);
-return fut.finally([r = std::move(r)] {
+return fut.finally([r] {
return r->close();
-});
+}).then([r] {});
});
@@ -1266,6 +1267,18 @@ future<foreign_sstable_open_info> sstable::get_open_info() & {
});
}
+static composite::eoc bound_kind_to_start_marker(bound_kind start_kind) {
+return start_kind == bound_kind::excl_start
+? composite::eoc::end
+: composite::eoc::start;
+}
+static composite::eoc bound_kind_to_end_marker(bound_kind end_kind) {
+return end_kind == bound_kind::excl_end
+? composite::eoc::start
+: composite::eoc::end;
+}
static void output_promoted_index_entry(bytes_ostream& promoted_index,
const bytes& first_col,
const bytes& last_col,
@@ -1322,8 +1335,9 @@ static bytes serialize_colname(const composite& clustering_key,
// (which might be gone later).
void sstable::maybe_flush_pi_block(file_writer& out,
const composite& clustering_key,
-const std::vector<bytes_view>& column_names) {
-bytes colname = serialize_colname(clustering_key, column_names, composite::eoc::none);
+const std::vector<bytes_view>& column_names,
+composite::eoc marker) {
+bytes colname = serialize_colname(clustering_key, column_names, marker);
if (_pi_write.block_first_colname.empty()) {
// This is the first column in the partition, or first column since we
// closed a promoted-index block. Remember its name and position -
@@ -1355,7 +1369,9 @@ void sstable::maybe_flush_pi_block(file_writer& out,
auto start = composite::from_clustering_element(*_pi_write.schemap, rt.start);
auto end = composite::from_clustering_element(*_pi_write.schemap, rt.end);
write_range_tombstone(out,
-start, rt.start_kind, end, rt.end_kind, {}, rt.tomb);
+start, bound_kind_to_start_marker(rt.start_kind),
+end, bound_kind_to_end_marker(rt.end_kind),
+{}, rt.tomb);
}
}
_pi_write.block_next_start_offset = out.offset() + _pi_write.desired_block_size;
@@ -1440,11 +1456,20 @@ void sstable::write_cell(file_writer& out, atomic_cell_view cell, const column_d
for (auto i = 0u; i < shard_count; i++) {
write<int16_t>(out, std::numeric_limits<int16_t>::min() + i);
}
-for (auto&& s : ccv.shards()) {
+auto write_shard = [&] (auto&& s) {
auto uuid = s.id().to_uuid();
write(out, int64_t(uuid.get_most_significant_bits()),
int64_t(uuid.get_least_significant_bits()),
int64_t(s.logical_clock()), int64_t(s.value()));
-}
+};
+if (service::get_local_storage_service().cluster_supports_correct_counter_order()) {
+for (auto&& s : ccv.shards()) {
+write_shard(s);
+}
+} else {
+for (auto&& s : ccv.shards_compatible_with_1_7_4()) {
+write_shard(s);
+}
+}
_c_stats.update_max_local_deletion_time(std::numeric_limits<int>::max());
@@ -1533,24 +1558,18 @@ void sstable::write_row_tombstone(file_writer& out, const composite& key, const
void sstable::write_range_tombstone(file_writer& out,
const composite& start,
-bound_kind start_kind,
+composite::eoc start_marker,
const composite& end,
-bound_kind end_kind,
+composite::eoc end_marker,
std::vector<bytes_view> suffix,
const tombstone t) {
if (!t) {
return;
}
-auto start_marker = start_kind == bound_kind::excl_start
-? composite::eoc::end
-: composite::eoc::start;
write_column_name(out, start, suffix, start_marker);
column_mask mask = column_mask::range_tombstone;
write(out, mask);
-auto end_marker = end_kind == bound_kind::excl_end
-? composite::eoc::start
-: composite::eoc::end;
write_column_name(out, end, suffix, end_marker);
write_deletion_time(out, t);
}
@@ -1721,10 +1740,10 @@ static void prepare_compression(compression& c, const schema& schema) {
c.init_full_checksum();
}
-static void maybe_add_summary_entry(summary& s, bytes_view key, uint64_t offset) {
+static void maybe_add_summary_entry(summary& s, const dht::token& token, bytes_view key, uint64_t offset) {
// Maybe add summary entry into in-memory representation of summary file.
if ((s.keys_written++ % s.header.min_index_interval) == 0) {
-s.entries.push_back({ bytes(key.data(), key.size()), offset });
+s.entries.push_back({ token, bytes(key.data(), key.size()), offset });
}
}
@@ -1830,6 +1849,7 @@ components_writer::components_writer(sstable& sst, const schema& s, file_writer&
, _schema(s)
, _out(out)
, _index(index_file_writer(sst, pc))
+, _index_needs_close(true)
, _max_sstable_size(cfg.max_sstable_size)
, _tombstone_written(false)
{
@@ -1847,7 +1867,7 @@ void components_writer::consume_new_partition(const dht::decorated_key& dk) {
_partition_key = key::from_partition_key(_schema, dk.key());
-maybe_add_summary_entry(_sst._components->summary, bytes_view(*_partition_key), _index.offset());
+maybe_add_summary_entry(_sst._components->summary, dk.token(), bytes_view(*_partition_key), _index.offset());
_sst._components->filter->add(bytes_view(*_partition_key));
_sst._collector.add_key(bytes_view(*_partition_key));
@@ -1915,9 +1935,11 @@ stop_iteration components_writer::consume(range_tombstone&& rt) {
// already closed by rt.start, so the accumulator doesn't grow boundless.
_sst._pi_write.tombstone_accumulator->apply(rt);
auto start = composite::from_clustering_element(_schema, std::move(rt.start));
+auto start_marker = bound_kind_to_start_marker(rt.start_kind);
auto end = composite::from_clustering_element(_schema, std::move(rt.end));
-_sst.maybe_flush_pi_block(_out, start, {});
-_sst.write_range_tombstone(_out, std::move(start), rt.start_kind, std::move(end), rt.end_kind, {}, rt.tomb);
+auto end_marker = bound_kind_to_end_marker(rt.end_kind);
+_sst.maybe_flush_pi_block(_out, start, {}, start_marker);
+_sst.write_range_tombstone(_out, std::move(start), start_marker, std::move(end), end_marker, {}, rt.tomb);
return stop_iteration::no;
}
@@ -1959,6 +1981,7 @@ stop_iteration components_writer::consume_end_of_partition() {
void components_writer::consume_end_of_stream() {
seal_summary(_sst._components->summary, std::move(_first_key), std::move(_last_key)); // what if there is only one partition? what if it is empty?
+_index_needs_close = false;
_index.close().get();
if (_sst.has_component(sstable::component_type::CompressionInfo)) {
@@ -1970,6 +1993,16 @@ void components_writer::consume_end_of_stream() {
_sst._schema, _sst.get_first_decorated_key(), _sst.get_last_decorated_key());
}
+components_writer::~components_writer() {
+if (_index_needs_close) {
+try {
+_index.close().get();
+} catch (...) {
+sstlog.error("components_writer failed to close file: {}", std::current_exception());
+}
+}
+}
future<>
sstable::read_scylla_metadata(const io_priority_class& pc) {
if (_components->scylla_metadata) {
@@ -2090,7 +2123,9 @@ future<> sstable::write_components(::mutation_reader mr,
if (cfg.replay_position) {
_collector.set_replay_position(cfg.replay_position.value());
}
-return seastar::async([this, mr = std::move(mr), estimated_partitions, schema = std::move(schema), cfg, &pc] () mutable {
+seastar::thread_attributes attr;
+attr.scheduling_group = cfg.thread_scheduling_group;
+return seastar::async(std::move(attr), [this, mr = std::move(mr), estimated_partitions, schema = std::move(schema), cfg, &pc] () mutable {
auto wr = get_writer(*schema, estimated_partitions, cfg, pc);
consume_flattened_in_thread(mr, wr);
});
@@ -2112,7 +2147,8 @@ future<> sstable::generate_summary(const io_priority_class& pc) {
         return true;
     }
     void consume_entry(index_entry&& ie, uint64_t offset) {
-        maybe_add_summary_entry(_summary, ie.get_key_bytes(), offset);
+        auto token = dht::global_partitioner().get_token(ie.get_key());
+        maybe_add_summary_entry(_summary, token, ie.get_key_bytes(), offset);
         if (!first_key) {
             first_key = key(to_bytes(ie.get_key_bytes()));
         } else {
@@ -2763,7 +2799,8 @@ atomic_deletion_cancelled::what() const noexcept {
 thread_local shared_index_lists::stats shared_index_lists::_shard_stats;
 
 static thread_local seastar::metrics::metric_groups metrics;
-void init_metrics() {
+future<> init_metrics() {
+    return seastar::smp::invoke_on_all([] {
     namespace sm = seastar::metrics;
     metrics.add_group("sstables", {
         sm::make_derive("index_page_hits", [] { return shared_index_lists::shard_stats().hits; },
@@ -2773,6 +2810,7 @@ void init_metrics() {
         sm::make_derive("index_page_blocks", [] { return shared_index_lists::shard_stats().blocks; },
                         sm::description("Index page requests which needed to wait due to page not being loaded yet")),
     });
+    });
 }
 struct range_reader_adaptor final : public ::mutation_reader::impl {


@@ -53,6 +53,10 @@
 #include "sstables/shared_index_lists.hh"
 #include "db/commitlog/replay_position.hh"
 
+namespace seastar {
+class thread_scheduling_group;
+}
+
 namespace sstables {
 
 extern logging::logger sstlog;
@@ -83,6 +87,7 @@ public:
     future<> fast_forward_to(uint64_t begin, uint64_t end);
     future<> skip_to(indexable_element, uint64_t begin);
     uint64_t position() const;
+    bool eof() const;
 
     // Define (as defaults) the destructor and move operations in the source
     // file, so here we don't need to know the incomplete impl type.
     ~data_consume_context();
@@ -131,6 +136,7 @@ struct sstable_writer_config {
     bool backup = false;
     bool leave_unsealed = false;
     stdx::optional<db::replay_position> replay_position;
+    seastar::thread_scheduling_group* thread_scheduling_group = nullptr;
 };
 
 class sstable : public enable_lw_shared_from_this<sstable> {
@@ -492,7 +498,8 @@ private:
     void maybe_flush_pi_block(file_writer& out,
             const composite& clustering_key,
-            const std::vector<bytes_view>& column_names);
+            const std::vector<bytes_view>& column_names,
+            composite::eoc marker = composite::eoc::none);
 
     schema_ptr _schema;
     sstring _dir;
@@ -597,9 +604,9 @@ private:
     void write_cell(file_writer& out, atomic_cell_view cell, const column_definition& cdef);
     void write_column_name(file_writer& out, const composite& clustering_key, const std::vector<bytes_view>& column_names, composite::eoc marker = composite::eoc::none);
     void write_column_name(file_writer& out, bytes_view column_names);
-    void write_range_tombstone(file_writer& out, const composite& start, bound_kind start_kind, const composite& end, bound_kind stop_kind, std::vector<bytes_view> suffix, const tombstone t);
+    void write_range_tombstone(file_writer& out, const composite& start, composite::eoc start_marker, const composite& end, composite::eoc end_marker, std::vector<bytes_view> suffix, const tombstone t);
     void write_range_tombstone(file_writer& out, const composite& start, const composite& end, std::vector<bytes_view> suffix, const tombstone t) {
-        write_range_tombstone(out, start, bound_kind::incl_start, end, bound_kind::incl_end, std::move(suffix), std::move(t));
+        write_range_tombstone(out, start, composite::eoc::start, end, composite::eoc::end, std::move(suffix), std::move(t));
     }
     void write_collection(file_writer& out, const composite& clustering_key, const column_definition& cdef, collection_mutation_view collection);
     void write_row_tombstone(file_writer& out, const composite& key, const row_tombstone t);
@@ -611,6 +618,10 @@ public:
     future<> read_toc();
 
+    bool has_scylla_component() const {
+        return has_component(component_type::Scylla);
+    }
+
     bool filter_has_key(const key& key) {
         return _components->filter->is_present(bytes_view(key));
     }
@@ -766,6 +777,7 @@ class components_writer {
     const schema& _schema;
     file_writer& _out;
     file_writer _index;
+    bool _index_needs_close;
     uint64_t _max_sstable_size;
     bool _tombstone_written;
     // Remember first and last keys, which we need for the summary file.
@@ -781,6 +793,12 @@ private:
     }
 public:
     components_writer(sstable& sst, const schema& s, file_writer& out, uint64_t estimated_partitions, const sstable_writer_config&, const io_priority_class& pc);
+    ~components_writer();
+    components_writer(components_writer&& o) : _sst(o._sst), _schema(o._schema), _out(o._out), _index(std::move(o._index)),
+        _index_needs_close(o._index_needs_close), _max_sstable_size(o._max_sstable_size), _tombstone_written(o._tombstone_written),
+        _first_key(std::move(o._first_key)), _last_key(std::move(o._last_key)), _partition_key(std::move(o._partition_key)) {
+        o._index_needs_close = false;
+    }
 
     void consume_new_partition(const dht::decorated_key& dk);
     void consume(tombstone t);
@@ -840,6 +858,6 @@ struct sstable_open_info {
     file index;
 };
 
-void init_metrics();
+future<> init_metrics();
 
 }
