gossip: Fix tokens assignment in assassinate_endpoint

The tokens vector is defined a few lines above and is needed outside the if block. Do not redefine it again in the if block, otherwise the tokens will be empty. Found by code inspection. Fixes #3551. Message-Id: <c7a06375c65c950e94236571127f533e5a60cbfd.1530002177.git.asias@scylladb.com> (cherry picked from commit c3b5a2ecd5)
locator::ec2_multi_region_snitch: don't call for ec2_snitch::gossiper_starting()
2018-06-27 12:01:19 +03:00 · 2018-06-12 19:02:48 +03:00 · 2018-05-24 12:02:15 +03:00 · 2018-05-24 11:14:20 +03:00 · 2018-05-24 11:08:13 +03:00 · 2018-05-24 15:24:29 +08:00
205 changed files with 7790 additions and 2670 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 #!/bin/sh

-VERSION=666.development
+VERSION=2.0.4

 if test -f version
 then
--- a/api/cache_service.cc
+++ b/api/cache_service.cc
@@ -252,13 +252,13 @@ void set_cache_service(http_context& ctx, routes& r) {
        // In origin row size is the weighted size.
        // We currently do not support weights, so we use num entries instead
        return map_reduce_cf(ctx, 0, [](const column_family& cf) {
-            return cf.get_row_cache().num_entries();
+            return cf.get_row_cache().partitions();
        }, std::plus<uint64_t>());
    });

    cs::get_row_entries.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, 0, [](const column_family& cf) {
-            return cf.get_row_cache().num_entries();
+            return cf.get_row_cache().partitions();
        }, std::plus<uint64_t>());
    });

--- a/auth/auth.cc
+++ b/auth/auth.cc
@@ -114,7 +114,7 @@ struct hash<auth::authenticated_user> {

 class auth::auth::permissions_cache {
 public:
-    typedef utils::loading_cache<std::pair<authenticated_user, data_resource>, permission_set, utils::tuple_hash> cache_type;
+    typedef utils::loading_cache<std::pair<authenticated_user, data_resource>, permission_set, utils::loading_cache_reload_enabled::yes, utils::simple_entry_size<permission_set>, utils::tuple_hash> cache_type;
    typedef typename cache_type::key_type key_type;

    permissions_cache()
@@ -130,7 +130,7 @@ public:
                        }) {}

    future<> stop() {
-        return make_ready_future<>();
+        return _cache.stop();
    }

    future<permission_set> get(::shared_ptr<authenticated_user> user, data_resource resource) {
--- a/cache_streamed_mutation.hh
+++ b/cache_streamed_mutation.hh
@@ -69,6 +69,29 @@ public:
 };

 class cache_streamed_mutation final : public streamed_mutation::impl {
+    enum class state {
+        before_static_row,
+
+        // Invariants:
+        //  - position_range(_lower_bound, _upper_bound) covers all not yet emitted positions from current range
+        //  - _next_row points to the nearest row in cache >= _lower_bound
+        //  - _next_row_in_range = _next.position() < _upper_bound
+        reading_from_cache,
+
+        // Starts reading from underlying reader.
+        // The range to read is position_range(_lower_bound, min(_next_row.position(), _upper_bound)).
+        // Invariants:
+        //  - _next_row_in_range = _next.position() < _upper_bound
+        move_to_underlying,
+
+        // Invariants:
+        // - Upper bound of the read is min(_next_row.position(), _upper_bound)
+        // - _next_row_in_range = _next.position() < _upper_bound
+        // - _last_row_key contains the key of last emitted clustering_row
+        reading_from_underlying,
+
+        end_of_stream
+    };
    lw_shared_ptr<partition_snapshot> _snp;
    position_in_partition::tri_compare _position_cmp;

@@ -92,25 +115,24 @@ class cache_streamed_mutation final : public streamed_mutation::impl {
    position_in_partition _lower_bound;
    position_in_partition_view _upper_bound;

-    bool _static_row_done = false;
-    bool _reading_underlying = false;
+    state _state = state::before_static_row;
    lw_shared_ptr<read_context> _read_context;
    partition_snapshot_row_cursor _next_row;
    bool _next_row_in_range = false;

    future<> do_fill_buffer();
-    future<> copy_from_cache_to_buffer();
+    void copy_from_cache_to_buffer();
    future<> process_static_row();
    void move_to_end();
-    future<> move_to_next_range();
-    future<> move_to_current_range();
-    future<> move_to_next_entry();
+    void move_to_next_range();
+    void move_to_current_range();
+    void move_to_next_entry();
    // Emits all delayed range tombstones with positions smaller than upper_bound.
    void drain_tombstones(position_in_partition_view upper_bound);
    // Emits all delayed range tombstones.
    void drain_tombstones();
    void add_to_buffer(const partition_snapshot_row_cursor&);
-    void add_to_buffer(clustering_row&&);
+    void add_clustering_row_to_buffer(mutation_fragment&&);
    void add_to_buffer(range_tombstone&&);
    void add_to_buffer(mutation_fragment&&);
    future<> read_from_underlying();
@@ -154,12 +176,16 @@ public:
 inline
 future<> cache_streamed_mutation::process_static_row() {
    if (_snp->version()->partition().static_row_continuous()) {
-        row sr = _snp->static_row();
+        _read_context->cache().on_row_hit();
+        row sr = _lsa_manager.run_in_read_section([this] {
+            return _snp->static_row();
+        });
        if (!sr.empty()) {
            push_mutation_fragment(mutation_fragment(static_row(std::move(sr))));
        }
        return make_ready_future<>();
    } else {
+        _read_context->cache().on_row_miss();
        return _read_context->get_next_fragment().then([this] (mutation_fragment_opt&& sr) {
            if (sr) {
                assert(sr->is_static_row());
@@ -173,15 +199,24 @@ future<> cache_streamed_mutation::process_static_row() {

 inline
 future<> cache_streamed_mutation::fill_buffer() {
-    if (!_static_row_done) {
-        _static_row_done = true;
-        return process_static_row().then([this] {
-            return _lsa_manager.run_in_read_section([this] {
-                return move_to_current_range();
-            }).then([this] {
-                return fill_buffer();
+    if (_state == state::before_static_row) {
+        auto after_static_row = [this] {
+            if (_ck_ranges_curr == _ck_ranges_end) {
+                _end_of_stream = true;
+                _state = state::end_of_stream;
+                return make_ready_future<>();
+            }
+            _state = state::reading_from_cache;
+            _lsa_manager.run_in_read_section([this] {
+                move_to_current_range();
            });
-        });
+            return fill_buffer();
+        };
+        if (_schema->has_static_columns()) {
+            return process_static_row().then(std::move(after_static_row));
+        } else {
+            return after_static_row();
+        }
    }
    return do_until([this] { return _end_of_stream || is_buffer_full(); }, [this] {
        return do_fill_buffer();
@@ -190,18 +225,27 @@ future<> cache_streamed_mutation::fill_buffer() {

 inline
 future<> cache_streamed_mutation::do_fill_buffer() {
-    if (_reading_underlying) {
+    if (_state == state::move_to_underlying) {
+        _state = state::reading_from_underlying;
+        auto end = _next_row_in_range ? position_in_partition(_next_row.position())
+                                      : position_in_partition(_upper_bound);
+        return _read_context->fast_forward_to(position_range{_lower_bound, std::move(end)}).then([this] {
+            return read_from_underlying();
+        });
+    }
+    if (_state == state::reading_from_underlying) {
        return read_from_underlying();
    }
+    // assert(_state == state::reading_from_cache)
    return _lsa_manager.run_in_read_section([this] {
        auto same_pos = _next_row.maybe_refresh();
        // FIXME: If continuity changed anywhere between _lower_bound and _next_row.position()
        // we need to redo the lookup with _lower_bound. There is no eviction yet, so not yet a problem.
        assert(same_pos);
-        while (!is_buffer_full() && !_end_of_stream && !_reading_underlying) {
-            future<> f = copy_from_cache_to_buffer();
-            if (!f.available() || need_preempt()) {
-                return f;
+        while (!is_buffer_full() && _state == state::reading_from_cache) {
+            copy_from_cache_to_buffer();
+            if (need_preempt()) {
+                break;
            }
        }
        return make_ready_future<>();
@@ -210,33 +254,34 @@ future<> cache_streamed_mutation::do_fill_buffer() {

 inline
 future<> cache_streamed_mutation::read_from_underlying() {
-    return do_until([this] { return !_reading_underlying || is_buffer_full(); }, [this] {
-        return _read_context->get_next_fragment().then([this] (auto&& mfopt) {
-            if (!mfopt) {
-                _reading_underlying = false;
-                return _lsa_manager.run_in_update_section([this] {
-                    auto same_pos = _next_row.maybe_refresh();
-                    assert(same_pos); // FIXME: handle eviction
-                    if (_next_row_in_range) {
+    return consume_mutation_fragments_until(_read_context->get_streamed_mutation(),
+        [this] { return _state != state::reading_from_underlying || is_buffer_full(); },
+        [this] (mutation_fragment mf) {
+            _read_context->cache().on_row_miss();
+            maybe_add_to_cache(mf);
+            add_to_buffer(std::move(mf));
+        },
+        [this] {
+            _state = state::reading_from_cache;
+            _lsa_manager.run_in_update_section([this] {
+                auto same_pos = _next_row.maybe_refresh();
+                assert(same_pos); // FIXME: handle eviction
+                if (_next_row_in_range) {
+                    maybe_update_continuity();
+                    add_to_buffer(_next_row);
+                    move_to_next_entry();
+                } else {
+                    if (no_clustering_row_between(*_schema, _upper_bound, _next_row.position())) {
                        this->maybe_update_continuity();
-                        this->add_to_buffer(_next_row);
-                        return this->move_to_next_entry();
                    } else {
-                        if (no_clustering_row_between(*_schema, _upper_bound, _next_row.position())) {
-                            this->maybe_update_continuity();
-                        } else {
-                            // FIXME: Insert dummy entry at _upper_bound.
-                        }
-                        return this->move_to_next_range();
+                        // FIXME: Insert dummy entry at _upper_bound.
+                        _read_context->cache().on_mispopulate();
                    }
-                });
-            } else {
-                this->maybe_add_to_cache(*mfopt);
-                this->add_to_buffer(std::move(*mfopt));
-                return make_ready_future<>();
-            }
+                    move_to_next_range();
+                }
+            });
+            return make_ready_future<>();
        });
-    });
 }

 inline
@@ -249,6 +294,8 @@ void cache_streamed_mutation::maybe_update_continuity() {
        } else if (!_ck_ranges_curr->start()) {
            _next_row.set_continuous(true);
        }
+    } else {
+        _read_context->cache().on_mispopulate();
    }
 }

@@ -266,6 +313,7 @@ void cache_streamed_mutation::maybe_add_to_cache(const mutation_fragment& mf) {
 inline
 void cache_streamed_mutation::maybe_add_to_cache(const clustering_row& cr) {
    if (!can_populate()) {
+        _read_context->cache().on_mispopulate();
        return;
    }
    _lsa_manager.run_in_update_section_with_allocator([this, &cr] {
@@ -281,10 +329,11 @@ void cache_streamed_mutation::maybe_add_to_cache(const clustering_row& cr) {
        auto new_entry = alloc_strategy_unique_ptr<rows_entry>(
            current_allocator().construct<rows_entry>(cr.key(), cr.tomb(), cr.marker(), cr.cells()));
        new_entry->set_continuous(false);
-        auto it = _next_row.has_up_to_date_row_from_latest_version()
+        auto it = _next_row.has_valid_row_from_latest_version()
                  ? _next_row.get_iterator_in_latest_version() : mp.clustered_rows().lower_bound(cr.key(), less);
        auto insert_result = mp.clustered_rows().insert_check(it, *new_entry, less);
        if (insert_result.second) {
+            _read_context->cache().on_row_insert();
            new_entry.release();
        }
        it = insert_result.first;
@@ -294,11 +343,12 @@ void cache_streamed_mutation::maybe_add_to_cache(const clustering_row& cr) {
            if (it == mp.clustered_rows().begin()) {
                // FIXME: check whether entry for _last_row_key is in older versions and if so set
                // continuity to true.
+                _read_context->cache().on_mispopulate();
            } else {
                auto prev_it = it;
                --prev_it;
-                clustering_key_prefix::tri_compare tri_comp(*_schema);
-                if (tri_comp(*_last_row_key, prev_it->key()) == 0) {
+                clustering_key_prefix::equality eq(*_schema);
+                if (eq(*_last_row_key, prev_it->key())) {
                    e.set_continuous(true);
                }
            }
@@ -306,6 +356,7 @@ void cache_streamed_mutation::maybe_add_to_cache(const clustering_row& cr) {
            e.set_continuous(true);
        } else {
            // FIXME: Insert dummy entry at _ck_ranges_curr->start()
+            _read_context->cache().on_mispopulate();
        }
    });
 }
@@ -317,26 +368,24 @@ bool cache_streamed_mutation::after_current_range(position_in_partition_view p)

 inline
 future<> cache_streamed_mutation::start_reading_from_underlying() {
-    _reading_underlying = true;
-    auto end = _next_row_in_range ? position_in_partition(_next_row.position())
-                                  : position_in_partition(_upper_bound);
-    return _read_context->fast_forward_to(position_range{_lower_bound, std::move(end)});
+    _state = state::move_to_underlying;
+    return make_ready_future<>();
 }

 inline
-future<> cache_streamed_mutation::copy_from_cache_to_buffer() {
+void cache_streamed_mutation::copy_from_cache_to_buffer() {
    position_in_partition_view next_lower_bound = _next_row.dummy() ? _next_row.position() : position_in_partition_view::after_key(_next_row.key());
    for (auto&& rts : _snp->range_tombstones(*_schema, _lower_bound, _next_row_in_range ? next_lower_bound : _upper_bound)) {
        add_to_buffer(std::move(rts));
        if (is_buffer_full()) {
-            return make_ready_future<>();
+            return;
        }
    }
    if (_next_row_in_range) {
        add_to_buffer(_next_row);
-        return move_to_next_entry();
+        move_to_next_entry();
    } else {
-        return move_to_next_range();
+        move_to_next_range();
    }
 }

@@ -344,47 +393,45 @@ inline
 void cache_streamed_mutation::move_to_end() {
    drain_tombstones();
    _end_of_stream = true;
+    _state = state::end_of_stream;
 }

 inline
-future<> cache_streamed_mutation::move_to_next_range() {
+void cache_streamed_mutation::move_to_next_range() {
    ++_ck_ranges_curr;
    if (_ck_ranges_curr == _ck_ranges_end) {
        move_to_end();
-        return make_ready_future<>();
    } else {
-        return move_to_current_range();
+        move_to_current_range();
    }
 }

 inline
-future<> cache_streamed_mutation::move_to_current_range() {
+void cache_streamed_mutation::move_to_current_range() {
    _last_row_key = std::experimental::nullopt;
    _lower_bound = position_in_partition::for_range_start(*_ck_ranges_curr);
    _upper_bound = position_in_partition_view::for_range_end(*_ck_ranges_curr);
    auto complete_until_next = _next_row.advance_to(_lower_bound) || _next_row.continuous();
    _next_row_in_range = !after_current_range(_next_row.position());
    if (!complete_until_next) {
-        return start_reading_from_underlying();
+        start_reading_from_underlying();
    }
-    return make_ready_future<>();
 }

 // _next_row must be inside the range.
 inline
-future<> cache_streamed_mutation::move_to_next_entry() {
+void cache_streamed_mutation::move_to_next_entry() {
    if (no_clustering_row_between(*_schema, _next_row.position(), _upper_bound)) {
-        return move_to_next_range();
+        move_to_next_range();
    } else {
        if (!_next_row.next()) {
            move_to_end();
-            return make_ready_future<>();
+            return;
        }
        _next_row_in_range = !after_current_range(_next_row.position());
        if (!_next_row.continuous()) {
-            return start_reading_from_underlying();
+            start_reading_from_underlying();
        }
-        return make_ready_future<>();
    }
 }

@@ -405,7 +452,7 @@ void cache_streamed_mutation::drain_tombstones() {
 inline
 void cache_streamed_mutation::add_to_buffer(mutation_fragment&& mf) {
    if (mf.is_clustering_row()) {
-        add_to_buffer(std::move(std::move(mf).as_clustering_row()));
+        add_clustering_row_to_buffer(std::move(mf));
    } else {
        assert(mf.is_range_tombstone());
        add_to_buffer(std::move(mf).as_range_tombstone());
@@ -415,16 +462,18 @@ void cache_streamed_mutation::add_to_buffer(mutation_fragment&& mf) {
 inline
 void cache_streamed_mutation::add_to_buffer(const partition_snapshot_row_cursor& row) {
    if (!row.dummy()) {
-        add_to_buffer(row.row());
+        _read_context->cache().on_row_hit();
+        add_clustering_row_to_buffer(row.row());
    }
 }

 inline
-void cache_streamed_mutation::add_to_buffer(clustering_row&& row) {
+void cache_streamed_mutation::add_clustering_row_to_buffer(mutation_fragment&& mf) {
+    auto& row = mf.as_clustering_row();
    drain_tombstones(row.position());
    _last_row_key = row.key();
    _lower_bound = position_in_partition::after_key(row.key());
-    push_mutation_fragment(std::move(row));
+    push_mutation_fragment(std::move(mf));
 }

 inline
@@ -442,17 +491,22 @@ inline
 void cache_streamed_mutation::maybe_add_to_cache(const range_tombstone& rt) {
    if (can_populate()) {
        _lsa_manager.run_in_update_section_with_allocator([&] {
-            _snp->version()->partition().apply_row_tombstone(*_schema, rt);
+            _snp->version()->partition().row_tombstones().apply_monotonically(*_schema, rt);
        });
+    } else {
+        _read_context->cache().on_mispopulate();
    }
 }

 inline
 void cache_streamed_mutation::maybe_add_to_cache(const static_row& sr) {
    if (can_populate()) {
+        _read_context->cache().on_row_insert();
        _lsa_manager.run_in_update_section_with_allocator([&] {
            _snp->version()->partition().static_row().apply(*_schema, column_kind::static_column, sr.cells());
        });
+    } else {
+        _read_context->cache().on_mispopulate();
    }
 }

@@ -460,6 +514,8 @@ inline
 void cache_streamed_mutation::maybe_set_static_row_continuous() {
    if (can_populate()) {
        _snp->version()->partition().set_static_row_continuous(true);
+    } else {
+        _read_context->cache().on_mispopulate();
    }
 }

--- a/clustering_ranges_walker.hh
+++ b/clustering_ranges_walker.hh
@@ -43,10 +43,14 @@ private:
    bool advance_to_next_range() {
        _in_current = false;
        if (!_current_start.is_static_row()) {
+            if (_current == _end) {
+                return false;
+            }
            ++_current;
        }
        ++_change_counter;
        if (_current == _end) {
+            _current_end = _current_start = position_in_partition_view::after_all_clustered_rows();
            return false;
        }
        _current_start = position_in_partition_view::for_range_start(*_current);
@@ -61,11 +65,18 @@ public:
        , _end(ranges.end())
        , _in_current(with_static_row)
        , _with_static_row(with_static_row)
-        , _current_start(with_static_row ? position_in_partition_view::for_static_row()
-                                         : position_in_partition_view::for_range_start(*_current))
-        , _current_end(with_static_row ? position_in_partition_view::before_all_clustered_rows()
-                                       : position_in_partition_view::for_range_end(*_current))
-    { }
+        , _current_start(position_in_partition_view::for_static_row())
+        , _current_end(position_in_partition_view::before_all_clustered_rows())
+    {
+        if (!with_static_row) {
+            if (_current == _end) {
+                _current_start = position_in_partition_view::before_all_clustered_rows();
+            } else {
+                _current_start = position_in_partition_view::for_range_start(*_current);
+                _current_end = position_in_partition_view::for_range_end(*_current);
+            }
+        }
+    }
    clustering_ranges_walker(clustering_ranges_walker&& o) noexcept
        : _schema(o._schema)
        , _ranges(o._ranges)
@@ -94,10 +105,6 @@ public:
    void trim_front(position_in_partition pos) {
        position_in_partition::less_compare less(_schema);

-        if (_current == _end) {
-            return;
-        }
-
        do {
            if (!less(_current_start, pos)) {
                break;
@@ -118,10 +125,6 @@ public:
    bool advance_to(position_in_partition_view pos) {
        position_in_partition::less_compare less(_schema);

-        if (_current == _end) {
-            return false;
-        }
-
        do {
            if (!_in_current && less(pos, _current_start)) {
                break;
@@ -146,12 +149,8 @@ public:
    bool advance_to(position_in_partition_view start, position_in_partition_view end) {
        position_in_partition::less_compare less(_schema);

-        if (_current == _end) {
-            return false;
-        }
-
        do {
-            if (less(end, _current_start)) {
+            if (!less(_current_start, end)) {
                break;
            }
            if (less(start, _current_end)) {
@@ -192,7 +191,7 @@ public:

    // Returns true if advanced past all contained positions. Any later advance_to() until reset() will return false.
    bool out_of_range() const {
-        return _current == _end;
+        return !_in_current && _current == _end;
    }

    // Resets the state of the walker so that advance_to() can be now called for new sequence of positions.
--- a/compound_compat.hh
+++ b/compound_compat.hh
@@ -241,7 +241,7 @@ public:
    using component_view = std::pair<bytes_view, eoc>;
 private:
    template<typename Value, typename = std::enable_if_t<!std::is_same<const data_value, std::decay_t<Value>>::value>>
-    static size_t size(Value& val) {
+    static size_t size(const Value& val) {
        return val.size();
    }
    static size_t size(const data_value& val) {
@@ -445,17 +445,16 @@ public:
        return _is_compound;
    }

-    // The following factory functions assume this composite is a compound value.
    template <typename ClusteringElement>
    static composite from_clustering_element(const schema& s, const ClusteringElement& ce) {
-        return serialize_value(ce.components(s));
+        return serialize_value(ce.components(s), s.is_compound());
    }

-    static composite from_exploded(const std::vector<bytes_view>& v, eoc marker = eoc::none) {
+    static composite from_exploded(const std::vector<bytes_view>& v, bool is_compound, eoc marker = eoc::none) {
        if (v.size() == 0) {
-            return composite(bytes(size_t(1), bytes::value_type(marker)));
+            return composite(bytes(size_t(1), bytes::value_type(marker)), is_compound);
        }
-        return serialize_value(v, true, marker);
+        return serialize_value(v, is_compound, marker);
    }

    static composite static_prefix(const schema& s) {
@@ -499,14 +498,15 @@ public:
            , _is_compound(true)
    { }

-    std::vector<bytes> explode() const {
+    std::vector<bytes_view> explode() const {
        if (!_is_compound) {
-            return { to_bytes(_bytes) };
+            return { _bytes };
        }

-        std::vector<bytes> ret;
+        std::vector<bytes_view> ret;
+        ret.reserve(8);
        for (auto it = begin(), e = end(); it != e; ) {
-            ret.push_back(to_bytes(it->first));
+            ret.push_back(it->first);
            auto marker = it->second;
            ++it;
            if (it != e && marker != composite::eoc::none) {
--- a/configure.py
+++ b/configure.py
@@ -34,7 +34,7 @@ for line in open('/etc/os-release'):
        os_ids += value.split(' ')

 # distribution "internationalization", converting package names.
-# Fedora name is key, values is distro -> package name dict. 
+# Fedora name is key, values is distro -> package name dict.
 i18n_xlat = {
    'boost-devel': {
        'debian': 'libboost-dev',
@@ -48,7 +48,7 @@ def pkgname(name):
        for id in os_ids:
            if id in dict:
                return dict[id]
-    return name 
+    return name

 def get_flags():
    with open('/proc/cpuinfo') as f:
@@ -175,6 +175,8 @@ scylla_tests = [
    'tests/keys_test',
    'tests/partitioner_test',
    'tests/frozen_mutation_test',
+    'tests/serialized_action_test',
+    'tests/clustering_ranges_walker_test',
    'tests/perf/perf_mutation',
    'tests/lsa_async_eviction_test',
    'tests/lsa_sync_eviction_test',
@@ -236,6 +238,7 @@ scylla_tests = [
    'tests/view_schema_test',
    'tests/counter_test',
    'tests/cell_locker_test',
+    'tests/loading_cache_test',
 ]

 apps = [
@@ -290,6 +293,8 @@ add_tristate(arg_parser, name = 'hwloc', dest = 'hwloc', help = 'hwloc support')
 add_tristate(arg_parser, name = 'xen', dest = 'xen', help = 'Xen support')
 arg_parser.add_argument('--enable-gcc6-concepts', dest='gcc6_concepts', action='store_true', default=False,
                        help='enable experimental support for C++ Concepts as implemented in GCC 6')
+arg_parser.add_argument('--enable-alloc-failure-injector', dest='alloc_failure_injector', action='store_true', default=False,
+                        help='enable allocation failure injection')
 args = arg_parser.parse_args()

 defines = []
@@ -640,7 +645,7 @@ for t in tests_not_using_seastar_test_framework:
 for t in scylla_tests:
    deps[t] = [t + '.cc']
    if t not in tests_not_using_seastar_test_framework:
-        deps[t] += scylla_tests_dependencies 
+        deps[t] += scylla_tests_dependencies
        deps[t] += scylla_tests_seastar_deps
    else:
        deps[t] += scylla_core + api + idls + ['tests/cql_test_env.cc']
@@ -726,6 +731,9 @@ if not try_compile(compiler=args.cxx, source='''\
    print('Installed boost version too old.  Please update {}.'.format(pkgname("boost-devel")))
    sys.exit(1)

+
+has_sanitize_address_use_after_scope = try_compile(compiler=args.cxx, flags=['-fsanitize-address-use-after-scope'], source='int f() {}')
+
 defines = ' '.join(['-D' + d for d in defines])

 globals().update(vars(args))
@@ -760,6 +768,8 @@ if args.staticboost:
    seastar_flags += ['--static-boost']
 if args.gcc6_concepts:
    seastar_flags += ['--enable-gcc6-concepts']
+if args.alloc_failure_injector:
+    seastar_flags += ['--enable-alloc-failure-injector']

 seastar_cflags = args.user_cflags + " -march=nehalem"
 seastar_flags += ['--compiler', args.cxx, '--c-compiler', args.cc, '--cflags=%s' % (seastar_cflags)]
@@ -857,7 +867,7 @@ with open(buildfile, 'w') as f:
        f.write(textwrap.dedent('''\
            cxxflags_{mode} = -I. -I $builddir/{mode}/gen -I seastar -I seastar/build/{mode}/gen
            rule cxx.{mode}
-              command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags $cxxflags_{mode} -c -o $out $in
+              command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags $cxxflags_{mode} $obj_cxxflags -c -o $out $in
              description = CXX $out
              depfile = $out.d
            rule link.{mode}
@@ -875,7 +885,16 @@ with open(buildfile, 'w') as f:
                command = thrift -gen cpp:cob_style -out $builddir/{mode}/gen $in
                description = THRIFT $in
            rule antlr3.{mode}
-                command = sed -e '/^#if 0/,/^#endif/d' $in > $builddir/{mode}/gen/$in && antlr3 $builddir/{mode}/gen/$in && sed -i 's/^\\( *\)\\(ImplTraits::CommonTokenType\\* [a-zA-Z0-9_]* = NULL;\\)$$/\\1const \\2/' build/{mode}/gen/${{stem}}Parser.cpp
+                # We replace many local `ExceptionBaseType* ex` variables with a single function-scope one.
+                # Because we add such a variable to every function, and because `ExceptionBaseType` is not a global
+                # name, we also add a global typedef to avoid compilation errors. 
+                command = sed -e '/^#if 0/,/^#endif/d' $in > $builddir/{mode}/gen/$in $
+                     && antlr3 $builddir/{mode}/gen/$in $
+                     && sed -i -e 's/^\\( *\)\\(ImplTraits::CommonTokenType\\* [a-zA-Z0-9_]* = NULL;\\)$$/\\1const \\2/' $
+                        -e '1i using ExceptionBaseType = int;' $
+                        -e 's/^{{/{{ ExceptionBaseType\* ex = nullptr;/; $
+                            s/ExceptionBaseType\* ex = new/ex = new/' $
+                        build/{mode}/gen/${{stem}}Parser.cpp
                description = ANTLR3 $in
            ''').format(mode = mode, **modeval))
        f.write('build {mode}: phony {artifacts}\n'.format(mode = mode,
@@ -918,7 +937,7 @@ with open(buildfile, 'w') as f:
                if binary.startswith('tests/'):
                    local_libs = '$libs'
                    if binary not in tests_not_using_seastar_test_framework or binary in pure_boost_tests:
-                        local_libs += ' ' + maybe_static(args.staticboost, '-lboost_unit_test_framework') 
+                        local_libs += ' ' + maybe_static(args.staticboost, '-lboost_unit_test_framework')
                    if has_thrift:
                        local_libs += ' ' + thrift_libs + ' ' + maybe_static(args.staticboost, '-lboost_system')
                    # Our code's debugging information is huge, and multiplied
@@ -992,6 +1011,9 @@ with open(buildfile, 'w') as f:
            for cc in grammar.sources('$builddir/{}/gen'.format(mode)):
                obj = cc.replace('.cpp', '.o')
                f.write('build {}: cxx.{} {} || {}\n'.format(obj, mode, cc, ' '.join(serializers)))
+                if cc.endswith('Parser.cpp') and has_sanitize_address_use_after_scope:
+                    # Parsers end up using huge amounts of stack space and overflowing their stack 
+                    f.write('  obj_cxxflags = -fno-sanitize-address-use-after-scope\n')
        f.write('build seastar/build/{mode}/libseastar.a seastar/build/{mode}/apps/iotune/iotune seastar/build/{mode}/gen/http/request_parser.hh seastar/build/{mode}/gen/http/http_response_parser.hh: ninja {seastar_deps}\n'
                .format(**locals()))
        f.write('  pool = seastar_pool\n')
--- a/counters.cc
+++ b/counters.cc
@@ -29,6 +29,15 @@ counter_id counter_id::local()
    return counter_id(service::get_local_storage_service().get_local_id());
 }

+bool counter_id::less_compare_1_7_4::operator()(const counter_id& a, const counter_id& b) const
+{
+    if (a._most_significant != b._most_significant) {
+        return a._most_significant < b._most_significant;
+    } else {
+        return a._least_significant < b._least_significant;
+    }
+}
+
 std::ostream& operator<<(std::ostream& os, const counter_id& id) {
    return os << id.to_uuid();
 }
@@ -42,6 +51,33 @@ std::ostream& operator<<(std::ostream& os, counter_cell_view ccv) {
    return os << "{counter_cell timestamp: " << ccv.timestamp() << " shards: {" << ::join(", ", ccv.shards()) << "}}";
 }

+void counter_cell_builder::do_sort_and_remove_duplicates()
+{
+    boost::range::sort(_shards, [] (auto& a, auto& b) { return a.id() < b.id(); });
+
+    std::vector<counter_shard> new_shards;
+    new_shards.reserve(_shards.size());
+    for (auto& cs : _shards) {
+        if (new_shards.empty() || new_shards.back().id() != cs.id()) {
+            new_shards.emplace_back(cs);
+        } else {
+            new_shards.back().apply(cs);
+        }
+    }
+    _shards = std::move(new_shards);
+    _sorted = true;
+}
+
+std::vector<counter_shard> counter_cell_view::shards_compatible_with_1_7_4() const
+{
+    auto sorted_shards = boost::copy_range<std::vector<counter_shard>>(shards());
+    counter_id::less_compare_1_7_4 cmp;
+    boost::range::sort(sorted_shards, [&] (auto& a, auto& b) {
+        return cmp(a.id(), b.id());
+    });
+    return sorted_shards;
+}
+
 static bool apply_in_place(atomic_cell_or_collection& dst, atomic_cell_or_collection& src)
 {
    auto dst_ccmv = counter_cell_mutable_view(dst.as_mutable_atomic_cell());
--- a/counters.hh
+++ b/counters.hh
@@ -36,6 +36,10 @@ class counter_id {
    int64_t _least_significant;
    int64_t _most_significant;
 public:
+    static_assert(std::is_same<decltype(std::declval<utils::UUID>().get_least_significant_bits()), int64_t>::value
+            &&  std::is_same<decltype(std::declval<utils::UUID>().get_most_significant_bits()), int64_t>::value,
+        "utils::UUID is expected to work with two signed 64-bit integers");
+
    counter_id() = default;
    explicit counter_id(utils::UUID uuid) noexcept
        : _least_significant(uuid.get_least_significant_bits())
@@ -49,12 +53,20 @@ public:
    bool operator<(const counter_id& other) const {
        return to_uuid() < other.to_uuid();
    }
+    bool operator>(const counter_id& other) const {
+        return other.to_uuid() < to_uuid();
+    }
    bool operator==(const counter_id& other) const {
        return to_uuid() == other.to_uuid();
    }
    bool operator!=(const counter_id& other) const {
        return !(*this == other);
    }
+public:
+    // (Wrong) Counter ID ordering used by Scylla 1.7.4 and earlier.
+    struct less_compare_1_7_4 {
+        bool operator()(const counter_id& a, const counter_id& b) const;
+    };
 public:
    static counter_id local();

@@ -139,6 +151,22 @@ private:
    static void write(const T& value, bytes::iterator& out) {
        out = std::copy_n(reinterpret_cast<const signed char*>(&value), sizeof(T), out);
    }
+private:
+    // Shared logic for applying counter_shards and counter_shard_views.
+    // T is either counter_shard or basic_counter_shard_view<U>.
+    template<typename T>
+    GCC6_CONCEPT(requires requires(T shard) {
+        { shard.value() } -> int64_t;
+        { shard.logical_clock() } -> int64_t;
+    })
+    counter_shard& do_apply(T&& other) noexcept {
+        auto other_clock = other.logical_clock();
+        if (_logical_clock < other_clock) {
+            _logical_clock = other_clock;
+            _value = other.value();
+        }
+        return *this;
+    }
 public:
    counter_shard(counter_id id, int64_t value, int64_t logical_clock) noexcept
        : _id(id)
@@ -163,12 +191,11 @@ public:
    }

    counter_shard& apply(counter_shard_view other) noexcept {
-        auto other_clock = other.logical_clock();
-        if (_logical_clock < other_clock) {
-            _logical_clock = other_clock;
-            _value = other.value();
-        }
-        return *this;
+        return do_apply(other);
+    }
+
+    counter_shard& apply(const counter_shard& other) noexcept {
+        return do_apply(other);
    }

    static size_t serialized_size() {
@@ -183,6 +210,9 @@ public:

 class counter_cell_builder {
    std::vector<counter_shard> _shards;
+    bool _sorted = true;
+private:
+    void do_sort_and_remove_duplicates();
 public:
    counter_cell_builder() = default;
    counter_cell_builder(size_t shard_count) {
@@ -193,6 +223,21 @@ public:
        _shards.emplace_back(cs);
    }

+    void add_maybe_unsorted_shard(const counter_shard& cs) {
+        add_shard(cs);
+        if (_sorted && _shards.size() > 1) {
+            auto current = _shards.rbegin();
+            auto previous = std::next(current);
+            _sorted = current->id() > previous->id();
+        }
+    }
+
+    void sort_and_remove_duplicates() {
+        if (!_sorted) {
+            do_sort_and_remove_duplicates();
+        }
+    }
+
    size_t serialized_size() const {
        return _shards.size() * counter_shard::serialized_size();
    }
@@ -339,6 +384,9 @@ public:
 struct counter_cell_view : basic_counter_cell_view<bytes_view> {
    using basic_counter_cell_view::basic_counter_cell_view;

+    // Returns counter shards in an order that is compatible with Scylla 1.7.4.
+    std::vector<counter_shard> shards_compatible_with_1_7_4() const;
+
    // Reversibly applies two counter cells, at least one of them must be live.
    // Returns true iff dst was modified.
    static bool apply_reversibly(atomic_cell_or_collection& dst, atomic_cell_or_collection& src);
--- a/cpu_controller.hh
+++ b/cpu_controller.hh
@@ -0,0 +1,89 @@
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+#include <seastar/core/thread.hh>
+#include <seastar/core/timer.hh>
+#include <chrono>
+
+// Simple proportional controller to adjust shares of memtable/streaming flushes.
+//
+// Goal is to flush as fast as we can, but not so fast that we steal all the CPU from incoming
+// requests, and at the same time minimize user-visible fluctuations in the flush quota.
+//
+// What that translates to is we'll try to keep virtual dirty's first derivative at 0 (IOW, we keep
+// virtual dirty constant), which means that the rate of incoming writes is equal to the rate of
+// flushed bytes.
+//
+// The exact point at which the controller stops determines the desired flush CPU usage. As we
+// approach the hard dirty limit, we need to be more aggressive. We will therefore define two
+// thresholds, and increase the constant as we cross them.
+//
+//  1) the soft limit line
+//  2) halfway between soft limit and dirty limit
+//
+// The constants q1 and q2 are used to determine the proportional factor at each stage.
+//
+// Below the soft limit, we are in no particular hurry to flush, since it means we're set to
+// complete flushing before a new memtable is ready. The quota is dirty * q1, and q1 is set to a
+// low number.
+//
+// The first half of the virtual dirty region is where we expect to be usually, so we have a low
+// slope corresponding to a sluggish response between q1 * soft_limit and q2.
+//
+// In the second half, we're getting close to the hard dirty limit so we increase the slope and
+// become more responsive, up to a maximum quota of qmax.
+//
+// For now we'll just set them in the structure not to complicate the constructor. But q1, q2 and
+// qmax can easily become parameters if we find another user.
+class flush_cpu_controller {
+    static constexpr float hard_dirty_limit = 0.50;
+    static constexpr float q1 = 0.01;
+    static constexpr float q2 = 0.2;
+    static constexpr float qmax = 1;
+
+    float _current_quota = 0.0f;
+    float _goal;
+    std::function<float()> _current_dirty;
+    std::chrono::milliseconds _interval;
+    timer<> _update_timer;
+
+    seastar::thread_scheduling_group _scheduling_group;
+    seastar::thread_scheduling_group *_current_scheduling_group = nullptr;
+
+    void adjust();
+public:
+    seastar::thread_scheduling_group* scheduling_group() {
+        return _current_scheduling_group;
+    }
+    float current_quota() const {
+        return _current_quota;
+    }
+
+    struct disabled {
+        seastar::thread_scheduling_group *backup;
+    };
+    flush_cpu_controller(disabled d) : _scheduling_group(std::chrono::nanoseconds(0), 0), _current_scheduling_group(d.backup) {}
+    flush_cpu_controller(std::chrono::milliseconds interval, float soft_limit, std::function<float()> current_dirty);
+    flush_cpu_controller(flush_cpu_controller&&) = default;
+};
+
+
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -1550,6 +1550,8 @@ basic_unreserved_keyword returns [sstring str]
        | K_DISTINCT
        | K_CONTAINS
        | K_STATIC
+        | K_FROZEN
+        | K_TUPLE
        | K_FUNCTION
        | K_AGGREGATE
        | K_SFUNC
--- a/cql3/functions/functions.cc
+++ b/cql3/functions/functions.cc
@@ -75,6 +75,10 @@ functions::init() {
    declare(aggregate_fcts::make_max_function<double>());
    declare(aggregate_fcts::make_min_function<double>());

+    declare(aggregate_fcts::make_count_function<sstring>());
+    declare(aggregate_fcts::make_max_function<sstring>());
+    declare(aggregate_fcts::make_min_function<sstring>());
+
    //FIXME:
    //declare(aggregate_fcts::make_count_function<bytes>());
    //declare(aggregate_fcts::make_max_function<bytes>());
--- a/cql3/prepared_statements_cache.hh
+++ b/cql3/prepared_statements_cache.hh
@@ -0,0 +1,171 @@
+/*
+ * Copyright (C) 2017 ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "utils/loading_cache.hh"
+#include "cql3/statements/prepared_statement.hh"
+
+namespace cql3 {
+
+using prepared_cache_entry = std::unique_ptr<statements::prepared_statement>;
+
+struct prepared_cache_entry_size {
+    size_t operator()(const prepared_cache_entry& val) {
+        // TODO: improve the size approximation
+        return 10000;
+    }
+};
+
+typedef bytes cql_prepared_id_type;
+typedef int32_t thrift_prepared_id_type;
+
+/// \brief The key of the prepared statements cache
+///
+/// We are going to store the CQL and Thrift prepared statements in the same cache, therefore we need to generate a key
+/// that is going to be unique in both cases. Thrift uses int32_t as a prepared statement ID, CQL uses an MD5 digest.
+///
+/// We are going to use an std::pair<CQL_PREP_ID_TYPE, int64_t> as a key. For CQL statements we will use {CQL_PREP_ID, std::numeric_limits<int64_t>::max()} as a key
+/// and for Thrift - {CQL_PREP_ID_TYPE(0), THRIFT_PREP_ID}. This way CQL and Thrift keys' values will never collide.
+class prepared_cache_key_type {
+public:
+    using cache_key_type = std::pair<cql_prepared_id_type, int64_t>;
+
+private:
+    cache_key_type _key;
+
+public:
+    prepared_cache_key_type() = default;
+    explicit prepared_cache_key_type(cql_prepared_id_type cql_id) : _key(std::move(cql_id), std::numeric_limits<int64_t>::max()) {}
+    explicit prepared_cache_key_type(thrift_prepared_id_type thrift_id) : _key(cql_prepared_id_type(), thrift_id) {}
+
+    cache_key_type& key() { return _key; }
+    const cache_key_type& key() const { return _key; }
+
+    static const cql_prepared_id_type& cql_id(const prepared_cache_key_type& key) {
+        return key.key().first;
+    }
+    static thrift_prepared_id_type thrift_id(const prepared_cache_key_type& key) {
+        return key.key().second;
+    }
+};
+
+class prepared_statements_cache {
+public:
+    struct stats {
+        uint64_t prepared_cache_evictions = 0;
+    };
+
+    static stats& shard_stats() {
+        static thread_local stats _stats;
+        return _stats;
+    }
+
+    struct prepared_cache_stats_updater {
+        static void inc_hits() noexcept {}
+        static void inc_misses() noexcept {}
+        static void inc_blocks() noexcept {}
+        static void inc_evictions() noexcept {
+            ++shard_stats().prepared_cache_evictions;
+        }
+    };
+
+private:
+    using cache_key_type = typename prepared_cache_key_type::cache_key_type;
+    using cache_type = utils::loading_cache<cache_key_type, prepared_cache_entry, utils::loading_cache_reload_enabled::no, prepared_cache_entry_size, utils::tuple_hash, std::equal_to<cache_key_type>, prepared_cache_stats_updater>;
+    using cache_value_ptr = typename cache_type::value_ptr;
+    using cache_iterator = typename cache_type::iterator;
+    using checked_weak_ptr = typename statements::prepared_statement::checked_weak_ptr;
+    struct value_extractor_fn {
+        checked_weak_ptr operator()(prepared_cache_entry& e) const {
+            return e->checked_weak_from_this();
+        }
+    };
+
+    static const std::chrono::minutes entry_expiry;
+
+public:
+    using key_type = prepared_cache_key_type;
+    using value_type = checked_weak_ptr;
+    using statement_is_too_big = typename cache_type::entry_is_too_big;
+    /// \note both iterator::reference and iterator::value_type are checked_weak_ptr
+    using iterator = boost::transform_iterator<value_extractor_fn, cache_iterator>;
+
+private:
+    cache_type _cache;
+    value_extractor_fn _value_extractor_fn;
+
+public:
+    prepared_statements_cache(logging::logger& logger)
+        : _cache(memory::stats().total_memory() / 256, entry_expiry, logger)
+    {}
+
+    template <typename LoadFunc>
+    future<value_type> get(const key_type& key, LoadFunc&& load) {
+        return _cache.get_ptr(key.key(), [load = std::forward<LoadFunc>(load)] (const cache_key_type&) { return load(); }).then([] (cache_value_ptr v_ptr) {
+            return make_ready_future<value_type>((*v_ptr)->checked_weak_from_this());
+        });
+    }
+
+    iterator find(const key_type& key) {
+        return boost::make_transform_iterator(_cache.find(key.key()), _value_extractor_fn);
+    }
+
+    iterator end() {
+        return boost::make_transform_iterator(_cache.end(), _value_extractor_fn);
+    }
+
+    iterator begin() {
+        return boost::make_transform_iterator(_cache.begin(), _value_extractor_fn);
+    }
+
+    template <typename Pred>
+    void remove_if(Pred&& pred) {
+        static_assert(std::is_same<bool, std::result_of_t<Pred(::shared_ptr<cql_statement>)>>::value, "Bad Pred signature");
+
+        _cache.remove_if([&pred] (const prepared_cache_entry& e) {
+            return pred(e->statement);
+        });
+    }
+
+    size_t size() const {
+        return _cache.size();
+    }
+
+    size_t memory_footprint() const {
+        return _cache.memory_footprint();
+    }
+};
+}
+
+namespace std { // for prepared_statements_cache log printouts
+inline std::ostream& operator<<(std::ostream& os, const typename cql3::prepared_cache_key_type::cache_key_type& p) {
+    os << "{cql_id: " << p.first << ", thrift_id: " << p.second << "}";
+    return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const cql3::prepared_cache_key_type& p) {
+    os << p.key();
+    return os;
+}
+}
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -57,11 +57,14 @@ using namespace statements;
 using namespace cql_transport::messages;

 logging::logger log("query_processor");
+logging::logger prep_cache_log("prepared_statements_cache");

 distributed<query_processor> _the_query_processor;

 const sstring query_processor::CQL_VERSION = "3.3.1";

+const std::chrono::minutes prepared_statements_cache::entry_expiry = std::chrono::minutes(60);
+
 class query_processor::internal_state {
    service::query_state _qs;
 public:
@@ -95,6 +98,7 @@ query_processor::query_processor(distributed<service::storage_proxy>& proxy,
    , _proxy(proxy)
    , _db(db)
    , _internal_state(new internal_state())
+    , _prepared_cache(prep_cache_log)
 {
    namespace sm = seastar::metrics;

@@ -130,6 +134,15 @@ query_processor::query_processor(distributed<service::storage_proxy>& proxy,

        sm::make_derive("batches_unlogged_from_logged", _cql_stats.batches_unlogged_from_logged,
                        sm::description("Counts a total number of LOGGED batches that were executed as UNLOGGED batches.")),
+
+        sm::make_derive("prepared_cache_evictions", [] { return prepared_statements_cache::shard_stats().prepared_cache_evictions; },
+                        sm::description("Counts a number of prepared statements cache entries evictions.")),
+
+        sm::make_gauge("prepared_cache_size", [this] { return _prepared_cache.size(); },
+                        sm::description("A number of entries in the prepared statements cache.")),
+
+        sm::make_gauge("prepared_cache_memory_footprint", [this] { return _prepared_cache.memory_footprint(); },
+                        sm::description("Size (in bytes) of the prepared statements cache.")),
    });

    service::get_local_migration_manager().register_listener(_migration_subscriber.get());
@@ -197,31 +210,21 @@ query_processor::process_statement(::shared_ptr<cql_statement> statement,
 }

 future<::shared_ptr<cql_transport::messages::result_message::prepared>>
-query_processor::prepare(const std::experimental::string_view& query_string, service::query_state& query_state)
+query_processor::prepare(sstring query_string, service::query_state& query_state)
 {
    auto& client_state = query_state.get_client_state();
-    return prepare(query_string, client_state, client_state.is_thrift());
+    return prepare(std::move(query_string), client_state, client_state.is_thrift());
 }

 future<::shared_ptr<cql_transport::messages::result_message::prepared>>
-query_processor::prepare(const std::experimental::string_view& query_string,
-                         const service::client_state& client_state,
-                         bool for_thrift)
+query_processor::prepare(sstring query_string, const service::client_state& client_state, bool for_thrift)
 {
-    auto existing = get_stored_prepared_statement(query_string, client_state.get_raw_keyspace(), for_thrift);
-    if (existing) {
-        return make_ready_future<::shared_ptr<cql_transport::messages::result_message::prepared>>(existing);
+    using namespace cql_transport::messages;
+    if (for_thrift) {
+        return prepare_one<result_message::prepared::thrift>(std::move(query_string), client_state, compute_thrift_id, prepared_cache_key_type::thrift_id);
+    } else {
+        return prepare_one<result_message::prepared::cql>(std::move(query_string), client_state, compute_id, prepared_cache_key_type::cql_id);
    }
-
-    return futurize<::shared_ptr<cql_transport::messages::result_message::prepared>>::apply([this, &query_string, &client_state, for_thrift] {
-        auto prepared = get_statement(query_string, client_state);
-        auto bound_terms = prepared->statement->get_bound_terms();
-        if (bound_terms > std::numeric_limits<uint16_t>::max()) {
-            throw exceptions::invalid_request_exception(sprint("Too many markers(?). %d markers exceed the allowed maximum of %d", bound_terms, std::numeric_limits<uint16_t>::max()));
-        }
-        assert(bound_terms == prepared->bound_names.size());
-        return store_prepared_statement(query_string, client_state.get_raw_keyspace(), std::move(prepared), for_thrift);
-    });
 }

 ::shared_ptr<cql_transport::messages::result_message::prepared>
@@ -229,50 +232,11 @@ query_processor::get_stored_prepared_statement(const std::experimental::string_v
                                               const sstring& keyspace,
                                               bool for_thrift)
 {
+    using namespace cql_transport::messages;
    if (for_thrift) {
-        auto statement_id = compute_thrift_id(query_string, keyspace);
-        auto it = _thrift_prepared_statements.find(statement_id);
-        if (it == _thrift_prepared_statements.end()) {
-            return ::shared_ptr<result_message::prepared>();
-        }
-        return ::make_shared<result_message::prepared::thrift>(statement_id, it->second->checked_weak_from_this());
+        return get_stored_prepared_statement_one<result_message::prepared::thrift>(query_string, keyspace, compute_thrift_id, prepared_cache_key_type::thrift_id);
    } else {
-        auto statement_id = compute_id(query_string, keyspace);
-        auto it = _prepared_statements.find(statement_id);
-        if (it == _prepared_statements.end()) {
-            return ::shared_ptr<result_message::prepared>();
-        }
-        return ::make_shared<result_message::prepared::cql>(statement_id, it->second->checked_weak_from_this());
-    }
-}
-
-future<::shared_ptr<cql_transport::messages::result_message::prepared>>
-query_processor::store_prepared_statement(const std::experimental::string_view& query_string,
-                                          const sstring& keyspace,
-                                          std::unique_ptr<statements::prepared_statement> prepared,
-                                          bool for_thrift)
-{
-#if 0
-    // Concatenate the current keyspace so we don't mix prepared statements between keyspace (#5352).
-    // (if the keyspace is null, queryString has to have a fully-qualified keyspace so it's fine.
-    long statementSize = measure(prepared.statement);
-    // don't execute the statement if it's bigger than the allowed threshold
-    if (statementSize > MAX_CACHE_PREPARED_MEMORY)
-        throw new InvalidRequestException(String.format("Prepared statement of size %d bytes is larger than allowed maximum of %d bytes.",
-                                                        statementSize,
-                                                        MAX_CACHE_PREPARED_MEMORY));
-#endif
-    prepared->raw_cql_statement = query_string.data();
-    if (for_thrift) {
-        auto statement_id = compute_thrift_id(query_string, keyspace);
-        auto msg = ::make_shared<result_message::prepared::thrift>(statement_id, prepared->checked_weak_from_this());
-        _thrift_prepared_statements.emplace(statement_id, std::move(prepared));
-        return make_ready_future<::shared_ptr<result_message::prepared>>(std::move(msg));
-    } else {
-        auto statement_id = compute_id(query_string, keyspace);
-        auto msg = ::make_shared<result_message::prepared::cql>(statement_id, prepared->checked_weak_from_this());
-        _prepared_statements.emplace(statement_id, std::move(prepared));
-        return make_ready_future<::shared_ptr<result_message::prepared>>(std::move(msg));
+        return get_stored_prepared_statement_one<result_message::prepared::cql>(query_string, keyspace, compute_id, prepared_cache_key_type::cql_id);
    }
 }

@@ -289,19 +253,19 @@ static sstring hash_target(const std::experimental::string_view& query_string, c
    return keyspace + query_string.to_string();
 }

-bytes query_processor::compute_id(const std::experimental::string_view& query_string, const sstring& keyspace)
+prepared_cache_key_type query_processor::compute_id(const std::experimental::string_view& query_string, const sstring& keyspace)
 {
-    return md5_calculate(hash_target(query_string, keyspace));
+    return prepared_cache_key_type(md5_calculate(hash_target(query_string, keyspace)));
 }

-int32_t query_processor::compute_thrift_id(const std::experimental::string_view& query_string, const sstring& keyspace)
+prepared_cache_key_type query_processor::compute_thrift_id(const std::experimental::string_view& query_string, const sstring& keyspace)
 {
    auto target = hash_target(query_string, keyspace);
    uint32_t h = 0;
    for (auto&& c : hash_target(query_string, keyspace)) {
        h = 31*h + c;
    }
-    return static_cast<int32_t>(h);
+    return prepared_cache_key_type(static_cast<int32_t>(h));
 }

 std::unique_ptr<prepared_statement>
@@ -527,7 +491,7 @@ void query_processor::migration_subscriber::on_drop_view(const sstring& ks_name,

 void query_processor::migration_subscriber::remove_invalid_prepared_statements(sstring ks_name, std::experimental::optional<sstring> cf_name)
 {
-    _qp->invalidate_prepared_statements([&] (::shared_ptr<cql_statement> stmt) {
+    _qp->_prepared_cache.remove_if([&] (::shared_ptr<cql_statement> stmt) {
        return this->should_invalidate(ks_name, cf_name, stmt);
    });
 }
--- a/cql3/query_processor.hh
+++ b/cql3/query_processor.hh
@@ -57,6 +57,7 @@
 #include "statements/prepared_statement.hh"
 #include "transport/messages/result_message.hh"
 #include "untyped_result_set.hh"
+#include "prepared_statements_cache.hh"

 namespace cql3 {

@@ -64,9 +65,32 @@ namespace statements {
 class batch_statement;
 }

+class prepared_statement_is_too_big : public std::exception {
+public:
+    static constexpr int max_query_prefix = 100;
+
+private:
+    sstring _msg;
+
+public:
+    prepared_statement_is_too_big(const sstring& query_string)
+        : _msg(seastar::format("Prepared statement is too big: {}", query_string.substr(0, max_query_prefix)))
+    {
+        // mark that we clipped the query string
+        if (query_string.size() > max_query_prefix) {
+            _msg += "...";
+        }
+    }
+
+    virtual const char* what() const noexcept override {
+        return _msg.c_str();
+    }
+};
+
 class query_processor {
 public:
    class migration_subscriber;
+
 private:
    std::unique_ptr<migration_subscriber> _migration_subscriber;
    distributed<service::storage_proxy>& _proxy;
@@ -127,9 +151,7 @@ private:
        }
    };
 #endif
-
-    std::unordered_map<bytes, std::unique_ptr<statements::prepared_statement>> _prepared_statements;
-    std::unordered_map<int32_t, std::unique_ptr<statements::prepared_statement>> _thrift_prepared_statements;
+    prepared_statements_cache _prepared_cache;
    std::unordered_map<sstring, std::unique_ptr<statements::prepared_statement>> _internal_statements;
 #if 0

@@ -221,21 +243,14 @@ private:
    }
 #endif
 public:
-    statements::prepared_statement::checked_weak_ptr get_prepared(const bytes& id) {
-        auto it = _prepared_statements.find(id);
-        if (it == _prepared_statements.end()) {
+    statements::prepared_statement::checked_weak_ptr get_prepared(const prepared_cache_key_type& key) {
+        auto it = _prepared_cache.find(key);
+        if (it == _prepared_cache.end()) {
            return statements::prepared_statement::checked_weak_ptr();
        }
-        return it->second->checked_weak_from_this();
+        return *it;
    }

-    statements::prepared_statement::checked_weak_ptr get_prepared_for_thrift(int32_t id) {
-        auto it = _thrift_prepared_statements.find(id);
-        if (it == _thrift_prepared_statements.end()) {
-            return statements::prepared_statement::checked_weak_ptr();
-        }
-        return it->second->checked_weak_from_this();
-    }
 #if 0
    public static void validateKey(ByteBuffer key) throws InvalidRequestException
    {
@@ -435,42 +450,61 @@ public:
 #endif

    future<::shared_ptr<cql_transport::messages::result_message::prepared>>
-    prepare(const std::experimental::string_view& query_string, service::query_state& query_state);
+    prepare(sstring query_string, service::query_state& query_state);

    future<::shared_ptr<cql_transport::messages::result_message::prepared>>
-    prepare(const std::experimental::string_view& query_string, const service::client_state& client_state, bool for_thrift);
+    prepare(sstring query_string, const service::client_state& client_state, bool for_thrift);

-    static bytes compute_id(const std::experimental::string_view& query_string, const sstring& keyspace);
-    static int32_t compute_thrift_id(const std::experimental::string_view& query_string, const sstring& keyspace);
+    static prepared_cache_key_type compute_id(const std::experimental::string_view& query_string, const sstring& keyspace);
+    static prepared_cache_key_type compute_thrift_id(const std::experimental::string_view& query_string, const sstring& keyspace);

 private:
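+    /// \brief Prepares \p query_string via the prepared statements cache and wraps the result in a ResultMsgType message.
+    /// \tparam ResultMsgType type of the returned result message (CQL or Thrift)
+    /// \tparam PreparedKeyGenerator a function that generates the prepared statement cache key for a given query and keyspace
+    /// \tparam IdGetter a function that returns the corresponding prepared statement ID (CQL or Thrift) for a given prepared statement cache key
+    /// \param query_string text of the statement to prepare; also stored as the prepared statement's raw_cql_statement
+    /// \param client_state client state; supplies the raw keyspace for key generation and is passed to get_statement()
+    /// \param id_gen prepared ID generator, called before the first deferring point
+    /// \param id_getter prepared ID getter, passed to deferred context by reference. The caller must ensure its liveness.
+    /// \return a future holding the prepared result message, or failing with prepared_statement_is_too_big if the cache reports statement_is_too_big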
+    template <typename ResultMsgType, typename PreparedKeyGenerator, typename IdGetter>
+    future<::shared_ptr<cql_transport::messages::result_message::prepared>>
+    prepare_one(sstring query_string, const service::client_state& client_state, PreparedKeyGenerator&& id_gen, IdGetter&& id_getter) {
+        return do_with(id_gen(query_string, client_state.get_raw_keyspace()), std::move(query_string), [this, &client_state, &id_getter] (const prepared_cache_key_type& key, const sstring& query_string) {
+            return _prepared_cache.get(key, [this, &query_string, &client_state] {
+                auto prepared = get_statement(query_string, client_state);
+                auto bound_terms = prepared->statement->get_bound_terms();
+                if (bound_terms > std::numeric_limits<uint16_t>::max()) {
+                    throw exceptions::invalid_request_exception(sprint("Too many markers(?). %d markers exceed the allowed maximum of %d", bound_terms, std::numeric_limits<uint16_t>::max()));
+                }
+                assert(bound_terms == prepared->bound_names.size());
+                prepared->raw_cql_statement = query_string;
+                return make_ready_future<std::unique_ptr<statements::prepared_statement>>(std::move(prepared));
+            }).then([&key, &id_getter] (auto prep_ptr) {
+                return make_ready_future<::shared_ptr<cql_transport::messages::result_message::prepared>>(::make_shared<ResultMsgType>(id_getter(key), std::move(prep_ptr)));
+            }).handle_exception_type([&query_string] (typename prepared_statements_cache::statement_is_too_big&) {
+                return make_exception_future<::shared_ptr<cql_transport::messages::result_message::prepared>>(prepared_statement_is_too_big(query_string));
+            });
+        });
+    };
+
+    template <typename ResultMsgType, typename KeyGenerator, typename IdGetter>
+    ::shared_ptr<cql_transport::messages::result_message::prepared>
+    get_stored_prepared_statement_one(const std::experimental::string_view& query_string, const sstring& keyspace, KeyGenerator&& key_gen, IdGetter&& id_getter)
+    {
+        auto cache_key = key_gen(query_string, keyspace);
+        auto it = _prepared_cache.find(cache_key);
+        if (it == _prepared_cache.end()) {
+            return ::shared_ptr<cql_transport::messages::result_message::prepared>();
+        }
+
+        return ::make_shared<ResultMsgType>(id_getter(cache_key), *it);
+    }
+
    ::shared_ptr<cql_transport::messages::result_message::prepared>
    get_stored_prepared_statement(const std::experimental::string_view& query_string, const sstring& keyspace, bool for_thrift);

-    future<::shared_ptr<cql_transport::messages::result_message::prepared>>
-    store_prepared_statement(const std::experimental::string_view& query_string, const sstring& keyspace, std::unique_ptr<statements::prepared_statement> prepared, bool for_thrift);
-
-    // Erases the statements for which filter returns true.
-    template <typename Pred>
-    void invalidate_prepared_statements(Pred filter) {
-        static_assert(std::is_same<bool, std::result_of_t<Pred(::shared_ptr<cql_statement>)>>::value,
-                      "bad Pred signature");
-        for (auto it = _prepared_statements.begin(); it != _prepared_statements.end(); ) {
-            if (filter(it->second->statement)) {
-                it = _prepared_statements.erase(it);
-            } else {
-                ++it;
-            }
-        }
-        for (auto it = _thrift_prepared_statements.begin(); it != _thrift_prepared_statements.end(); ) {
-            if (filter(it->second->statement)) {
-                it = _thrift_prepared_statements.erase(it);
-            } else {
-                ++it;
-            }
-        }
-    }
-
 #if 0
    public ResultMessage processPrepared(CQLStatement statement, QueryState queryState, QueryOptions options)
    throws RequestExecutionException, RequestValidationException
--- a/cql3/restrictions/single_column_primary_key_restrictions.hh
+++ b/cql3/restrictions/single_column_primary_key_restrictions.hh
@@ -101,6 +101,10 @@ public:
        return boost::algorithm::all_of(_restrictions->restrictions(), [b] (auto&& r) { return r.second->has_bound(b); });
    }

+    virtual bool is_inclusive(statements::bound b) const override { // true iff every contained restriction reports is_inclusive(b)
+        return boost::algorithm::all_of(_restrictions->restrictions(), [b] (auto&& r) { return r.second->is_inclusive(b); }); // all_of yields true for an empty range
+    }
+
    virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override {
        return _restrictions->uses_function(ks_name, function_name);
    }
@@ -120,7 +124,7 @@ public:
                if (restriction->is_slice()) {
                    throw exceptions::invalid_request_exception(sprint(
                        "PRIMARY KEY column \"%s\" cannot be restricted (preceding column \"%s\" is restricted by a non-EQ relation)",
-                        _restrictions->next_column(new_column)->name_as_text(), new_column.name_as_text()));
+                        last_column.name_as_text(), new_column.name_as_text()));
                }
            }

--- a/cql3/statements/alter_keyspace_statement.cc
+++ b/cql3/statements/alter_keyspace_statement.cc
@@ -63,7 +63,7 @@ void cql3::statements::alter_keyspace_statement::validate(distributed<service::s
        service::get_local_storage_proxy().get_db().local().find_keyspace(_name); // throws on failure
        auto tmp = _name;
        std::transform(tmp.begin(), tmp.end(), tmp.begin(), ::tolower);
-        if (tmp == db::system_keyspace::NAME) {
+        if (is_system_keyspace(tmp)) {
            throw exceptions::invalid_request_exception("Cannot alter system keyspace");
        }

--- a/cql3/statements/cf_prop_defs.cc
+++ b/cql3/statements/cf_prop_defs.cc
@@ -41,6 +41,8 @@

 #include "cql3/statements/cf_prop_defs.hh"

+#include <boost/algorithm/string/predicate.hpp>
+
 namespace cql3 {

 namespace statements {
@@ -65,6 +67,8 @@ const sstring cf_prop_defs::KW_CRC_CHECK_CHANCE = "crc_check_chance";

 const sstring cf_prop_defs::COMPACTION_STRATEGY_CLASS_KEY = "class";

+const sstring cf_prop_defs::COMPACTION_ENABLED_KEY = "enabled";
+
 void cf_prop_defs::validate() {
    // Skip validation if the comapction strategy class is already set as it means we've alreayd
    // prepared (and redoing it would set strategyClass back to null, which we don't want)
@@ -188,6 +192,13 @@ void cf_prop_defs::apply_to_builder(schema_builder& builder) {
    builder.set_min_compaction_threshold(min_compaction_threshold);
    builder.set_max_compaction_threshold(max_compaction_threshold);

+    if (has_property(KW_COMPACTION)) {
+        if (get_compaction_options().count(COMPACTION_ENABLED_KEY)) {
+            auto enabled = boost::algorithm::iequals(get_compaction_options().at(COMPACTION_ENABLED_KEY), "true");
+            builder.set_compaction_enabled(enabled);
+        }
+    }
+
    builder.set_default_time_to_live(gc_clock::duration(get_int(KW_DEFAULT_TIME_TO_LIVE, DEFAULT_DEFAULT_TIME_TO_LIVE)));

    if (has_property(KW_SPECULATIVE_RETRY)) {
--- a/cql3/statements/cf_prop_defs.hh
+++ b/cql3/statements/cf_prop_defs.hh
@@ -73,6 +73,7 @@ public:
    static const sstring KW_CRC_CHECK_CHANCE;

    static const sstring COMPACTION_STRATEGY_CLASS_KEY;
+    static const sstring COMPACTION_ENABLED_KEY;

    // FIXME: In origin the following consts are in CFMetaData.
    static constexpr int32_t DEFAULT_DEFAULT_TIME_TO_LIVE = 0;
--- a/cql3/statements/create_keyspace_statement.cc
+++ b/cql3/statements/create_keyspace_statement.cc
@@ -72,7 +72,7 @@ void create_keyspace_statement::validate(distributed<service::storage_proxy>&, c
    std::string name;
    name.resize(_name.length());
    std::transform(_name.begin(), _name.end(), name.begin(), ::tolower);
-    if (name == db::system_keyspace::NAME) {
+    if (is_system_keyspace(name)) {
        throw exceptions::invalid_request_exception("system keyspace is not user-modifiable");
    }
    // keyspace name
--- a/cql3/statements/create_user_statement.cc
+++ b/cql3/statements/create_user_statement.cc
@@ -75,7 +75,7 @@ cql3::statements::create_user_statement::execute(distributed<service::storage_pr
                throw exceptions::invalid_request_exception(sprint("User %s already exists", _username));
            }
            if (exists && _if_not_exists) {
-                make_ready_future<::shared_ptr<cql_transport::messages::result_message>>();
+                return make_ready_future<::shared_ptr<cql_transport::messages::result_message>>();
            }
            return auth::authenticator::get().create(_username, _opts->options()).then([this] {
                return auth::auth::insert_user(_username, _superuser).then([] {
--- a/cql3/statements/delete_statement.cc
+++ b/cql3/statements/delete_statement.cc
@@ -106,6 +106,9 @@ delete_statement::prepare_internal(database& db, schema_ptr schema, shared_ptr<v
            || !stmt->restrictions()->get_clustering_columns_restrictions()->has_bound(bound::END)) {
        throw exceptions::invalid_request_exception("A range deletion operation needs to specify both bounds");
    }
+    if (!schema->is_compound() && stmt->restrictions()->get_clustering_columns_restrictions()->is_slice()) {
+        throw exceptions::invalid_request_exception("Range deletions on \"compact storage\" schemas are not supported");
+    }
    return stmt;
 }

--- a/database.cc
+++ b/database.cc
@@ -65,13 +65,13 @@
 #include <core/fstream.hh>
 #include <seastar/core/enum.hh>
 #include "utils/latency.hh"
-#include "utils/flush_queue.hh"
 #include "schema_registry.hh"
 #include "service/priority_manager.hh"
 #include "cell_locking.hh"
 #include <seastar/core/execution_stage.hh>
 #include "view_info.hh"
 #include "memtable-sstable.hh"
+#include "db/schema_tables.hh"

 #include "checked-file-impl.hh"
 #include "disk-error-handler.hh"
@@ -84,28 +84,10 @@ static const std::unordered_set<sstring> system_keyspaces = {
                db::system_keyspace::NAME, db::schema_tables::NAME
 };

-static bool is_system_keyspace(const sstring& name) {
+bool is_system_keyspace(const sstring& name) { // exact-match lookup in the file-local system_keyspaces set; no longer static, so it has external linkage
    return system_keyspaces.find(name) != system_keyspaces.end();
 }

-// Slight extension to the flush_queue type.
-class column_family::memtable_flush_queue : public utils::flush_queue<db::replay_position> {
-public:
-    template<typename Func, typename Post>
-    auto run_cf_flush(db::replay_position rp, Func&& func, Post&& post) {
-        // special case: empty rp, yet still data.
-        // We generate a few memtables with no valid, "high_rp", yet
-        // still containing data -> actual flush.
-        // And to make matters worse, we can initiate a flush of N such
-        // tables at the same time.
-        // Just queue them at the end of the queue and treat them as such.
-        if (rp == db::replay_position() && !empty()) {
-            rp = highest_key();
-        }
-        return run_with_ordered_post_op(rp, std::forward<Func>(func), std::forward<Post>(post));
-    }
-};
-
 // Used for tests where the CF exists without a database object. We need to pass a valid
 // dirty_memory manager in that case.
 thread_local dirty_memory_manager default_dirty_memory_manager;
@@ -147,7 +129,6 @@ column_family::column_family(schema_ptr schema, config config, db::commitlog* cl
    , _cache(_schema, sstables_as_snapshot_source(), global_cache_tracker())
    , _commitlog(cl)
    , _compaction_manager(compaction_manager)
-    , _flush_queue(std::make_unique<memtable_flush_queue>())
    , _counter_cell_locks(std::make_unique<cell_locker>(_schema, cl_stats))
 {
    if (!_config.enable_disk_writes) {
@@ -190,7 +171,6 @@ column_family::sstables_as_mutation_source() {
 snapshot_source
 column_family::sstables_as_snapshot_source() {
    return snapshot_source([this] () {
-        // FIXME: Will keep sstables on disk until next memtable flush. Make compaction force cache refresh.
        auto sst_set = _sstables;
        return mutation_source([this, sst_set = std::move(sst_set)] (schema_ptr s,
                const dht::partition_range& r,
@@ -779,6 +759,9 @@ column_family::open_sstable(sstables::foreign_sstable_open_info info, sstring di
 }

 void column_family::load_sstable(sstables::shared_sstable& sst, bool reset_level) {
+    if (schema()->is_counter() && !sst->has_scylla_component()) {
+        throw std::runtime_error("Loading non-Scylla SSTables containing counters is not supported. Use sstableloader instead.");
+    }
    auto shards = sst->get_shards_for_this_sstable();
    if (belongs_to_other_shard(shards)) {
        // If we're here, this sstable is shared by this and other
@@ -890,18 +873,21 @@ column_family::seal_active_streaming_memtable_immediate() {
            //
            // Lastly, we don't have any commitlog RP to update, and we don't need to deal manipulate the
            // memtable list, since this memtable was not available for reading up until this point.
-            return write_memtable_to_sstable(*old, newtab, incremental_backups_enabled(), priority).then([this, newtab, old] {
+            return write_memtable_to_sstable(*old, newtab, incremental_backups_enabled(), priority, false, _config.background_writer_scheduling_group).then([this, newtab, old] {
                return newtab->open_data();
            }).then([this, old, newtab] () {
-                add_sstable(newtab, {engine().cpu_id()});
-                trigger_compaction();
-                // Cache synchronization must be started atomically with add_sstable()
-                if (_config.enable_cache) {
-                    return _cache.update_invalidating(*old);
-                } else {
-                    return old->clear_gently();
-                }
-            }).handle_exception([old] (auto ep) {
+                return with_semaphore(_cache_update_sem, 1, [this, newtab, old] {
+                    add_sstable(newtab, {engine().cpu_id()});
+                    trigger_compaction();
+                    // Cache synchronization must be started atomically with add_sstable()
+                    if (_config.enable_cache) {
+                        return _cache.update_invalidating(*old);
+                    } else {
+                        return old->clear_gently();
+                    }
+                });
+            }).handle_exception([old, newtab] (auto ep) {
+                newtab->mark_for_deletion();
                dblog.error("failed to write streamed sstable: {}", ep);
                return make_exception_future<>(ep);
            });
@@ -937,9 +923,10 @@ future<> column_family::seal_active_streaming_memtable_big(streaming_memtable_bi
                newtab->set_unshared();

                auto&& priority = service::get_local_streaming_write_priority();
-                return write_memtable_to_sstable(*old, newtab, incremental_backups_enabled(), priority, true).then([this, newtab, old, &smb] {
+                return write_memtable_to_sstable(*old, newtab, incremental_backups_enabled(), priority, true, _config.background_writer_scheduling_group).then([this, newtab, old, &smb] {
                    smb.sstables.emplace_back(newtab);
-                }).handle_exception([] (auto ep) {
+                }).handle_exception([newtab] (auto ep) {
+                    newtab->mark_for_deletion();
                    dblog.error("failed to write streamed sstable: {}", ep);
                    return make_exception_future<>(ep);
                });
@@ -955,34 +942,32 @@ column_family::seal_active_memtable(memtable_list::flush_behavior ignored) {

    if (old->empty()) {
        dblog.debug("Memtable is empty");
-        return make_ready_future<>();
+        return _flush_barrier.advance_and_await();
    }
    _memtables->add_memtable();
+    _stats.memtable_switch_count++;
+    auto previous_flush = _flush_barrier.advance_and_await();
+    auto op = _flush_barrier.start();

-    assert(_highest_flushed_rp < old->replay_position()
-    || (_highest_flushed_rp == db::replay_position() && old->replay_position() == db::replay_position())
-    );
-    _highest_flushed_rp = old->replay_position();
+    auto memtable_size = old->occupancy().total_space();

-    return _flush_queue->run_cf_flush(old->replay_position(), [old, this] {
-      auto memtable_size = old->occupancy().total_space();
+    _stats.pending_flushes++;
+    _config.cf_stats->pending_memtables_flushes_count++;
+    _config.cf_stats->pending_memtables_flushes_bytes += memtable_size;

-      _config.cf_stats->pending_memtables_flushes_count++;
-      _config.cf_stats->pending_memtables_flushes_bytes += memtable_size;
-
-      return repeat([this, old] {
+    return repeat([this, old] {
        return with_lock(_sstables_lock.for_read(), [this, old] {
-            _flush_queue->check_open_gate();
            return try_flush_memtable_to_sstable(old);
        });
-      }).then([this, memtable_size] {
+    }).then([this, memtable_size, old, op = std::move(op), previous_flush = std::move(previous_flush)] () mutable {
+        _stats.pending_flushes--;
        _config.cf_stats->pending_memtables_flushes_count--;
        _config.cf_stats->pending_memtables_flushes_bytes -= memtable_size;
-      });
-    }, [old, this] {
+
        if (_commitlog) {
            _commitlog->discard_completed_segments(_schema->id(), old->rp_set());
        }
+        return previous_flush.finally([op = std::move(op)] { });
    });
    // FIXME: release commit log
    // FIXME: provide back-pressure to upper layers
@@ -1011,7 +996,7 @@ column_family::try_flush_memtable_to_sstable(lw_shared_ptr<memtable> old) {
    // The code as is guarantees that we'll never partially backup a
    // single sstable, so that is enough of a guarantee.
    auto&& priority = service::get_local_memtable_flush_priority();
-    return write_memtable_to_sstable(*old, newtab, incremental_backups_enabled(), priority).then([this, newtab, old] {
+    return write_memtable_to_sstable(*old, newtab, incremental_backups_enabled(), priority, false, _config.memtable_scheduling_group).then([this, newtab, old] {
        return newtab->open_data();
    }).then_wrapped([this, old, newtab] (future<> ret) {
        dblog.debug("Flushing to {} done", newtab->get_filename());
@@ -1067,9 +1052,7 @@ column_family::stop() {
    return when_all(_memtables->request_flush(), _streaming_memtables->request_flush()).discard_result().finally([this] {
        return _compaction_manager.remove(this).then([this] {
            // Nest, instead of using when_all, so we don't lose any exceptions.
-            return _flush_queue->close().then([this] {
-                return _streaming_flush_gate.close();
-            });
+            return _streaming_flush_gate.close();
        }).then([this] {
            return _sstable_deletion_gate.close();
        });
@@ -1123,7 +1106,10 @@ distributed_loader::flush_upload_dir(distributed<database>& db, sstring ks_name,
                    auto gen = cf.calculate_generation_for_new_table();

                    // Read toc content as it will be needed for moving and deleting a sstable.
-                    return sst->read_toc().then([sst] {
+                    return sst->read_toc().then([sst, s = cf.schema()] {
+                        if (s->is_counter() && !sst->has_scylla_component()) {
+                            return make_exception_future<>(std::runtime_error("Loading non-Scylla SSTables containing counters is not supported. Use sstableloader instead."));
+                        }
                        return sst->mutate_sstable_level(0);
                    }).then([&cf, sst, gen] {
                        return sst->create_links(cf._config.datadir, gen);
@@ -1208,20 +1194,22 @@ void column_family::set_metrics() {
    auto cf = column_family_label(_schema->cf_name());
    auto ks = keyspace_label(_schema->ks_name());
    namespace ms = seastar::metrics;
-    _metrics.add_group("column_family", {
-            ms::make_histogram("read_latency", ms::description("Read latency histogram"), [this] {return _stats.estimated_read.get_histogram();})(cf)(ks),
-            ms::make_histogram("write_latency", ms::description("Write latency histogram"), [this] {return _stats.estimated_write.get_histogram();})(cf)(ks),
-            ms::make_derive("memtable_switch", ms::description("Number of times flush has resulted in the memtable being switched out"), _stats.memtable_switch_count)(cf)(ks),
-            ms::make_gauge("pending_taks", ms::description("Estimated number of tasks pending for this column family"), _stats.pending_flushes)(cf)(ks),
-            ms::make_gauge("live_disk_space", ms::description("Live disk space used"), _stats.live_disk_space_used)(cf)(ks),
-            ms::make_gauge("total_disk_space", ms::description("Total disk space used"), _stats.total_disk_space_used)(cf)(ks),
-            ms::make_gauge("live_sstable", ms::description("Live sstable count"), _stats.live_sstable_count)(cf)(ks),
-            ms::make_gauge("pending_compaction", ms::description("Estimated number of compactions pending for this column family"), _stats.pending_compactions)(cf)(ks)
-    });
-    if (_schema->ks_name() != db::system_keyspace::NAME) {
+    if (_config.enable_metrics_reporting) {
        _metrics.add_group("column_family", {
-                ms::make_gauge("cache_hit_rate", ms::description("Cache hit rate"), [this] {return float(_global_cache_hit_rate);})(cf)(ks)
+                ms::make_derive("memtable_switch", ms::description("Number of times flush has resulted in the memtable being switched out"), _stats.memtable_switch_count)(cf)(ks),
+                ms::make_gauge("pending_tasks", ms::description("Estimated number of tasks pending for this column family"), _stats.pending_flushes)(cf)(ks),
+                ms::make_gauge("live_disk_space", ms::description("Live disk space used"), _stats.live_disk_space_used)(cf)(ks),
+                ms::make_gauge("total_disk_space", ms::description("Total disk space used"), _stats.total_disk_space_used)(cf)(ks),
+                ms::make_gauge("live_sstable", ms::description("Live sstable count"), _stats.live_sstable_count)(cf)(ks),
+                ms::make_gauge("pending_compaction", ms::description("Estimated number of compactions pending for this column family"), _stats.pending_compactions)(cf)(ks)
        });
+        if (_schema->ks_name() != db::system_keyspace::NAME && _schema->ks_name() != db::schema_tables::v3::NAME && _schema->ks_name() != "system_traces") {
+            _metrics.add_group("column_family", {
+                    ms::make_histogram("read_latency", ms::description("Read latency histogram"), [this] {return _stats.estimated_read.get_histogram(std::chrono::microseconds(100));})(cf)(ks),
+                    ms::make_histogram("write_latency", ms::description("Write latency histogram"), [this] {return _stats.estimated_write.get_histogram(std::chrono::microseconds(100));})(cf)(ks),
+                    ms::make_gauge("cache_hit_rate", ms::description("Cache hit rate"), [this] {return float(_global_cache_hit_rate);})(cf)(ks)
+            });
+        }
    }
 }

@@ -1311,6 +1299,10 @@ column_family::rebuild_sstable_list(const std::vector<sstables::shared_sstable>&
            } catch (sstables::atomic_deletion_cancelled& adc) {
                dblog.debug("Failed to delete sstables after compaction: {}", adc);
            }
+        }).then([this] {
+            // Refresh the underlying data source in the row cache so that it does not keep
+            // holding references to sstable files which were previously deleted.
+            _cache.refresh_snapshot();
        });
    });
 }
@@ -1366,7 +1358,7 @@ column_family::compact_sstables(sstables::compaction_descriptor descriptor, bool
                return sst;
        };
        return sstables::compact_sstables(*sstables_to_compact, *this, create_sstable, descriptor.max_sstable_bytes, descriptor.level,
-                cleanup).then([this, sstables_to_compact] (auto new_sstables) {
+                cleanup, _config.background_writer_scheduling_group).then([this, sstables_to_compact] (auto new_sstables) {
            _compaction_strategy.notify_completion(*sstables_to_compact, new_sstables);
            return this->rebuild_sstable_list(new_sstables, *sstables_to_compact);
        });
@@ -1374,7 +1366,7 @@ column_family::compact_sstables(sstables::compaction_descriptor descriptor, bool
 }

 static bool needs_cleanup(const lw_shared_ptr<sstables::sstable>& sst,
-                   const lw_shared_ptr<dht::token_range_vector>& owned_ranges,
+                   const dht::token_range_vector& owned_ranges,
                   schema_ptr s) {
    auto first = sst->get_first_partition_key();
    auto last = sst->get_last_partition_key();
@@ -1383,7 +1375,7 @@ static bool needs_cleanup(const lw_shared_ptr<sstables::sstable>& sst,
    dht::token_range sst_token_range = dht::token_range::make(first_token, last_token);

    // return true iff sst partition range isn't fully contained in any of the owned ranges.
-    for (auto& r : *owned_ranges) {
+    for (auto& r : owned_ranges) {
        if (r.contains(sst_token_range, dht::token_comparator())) {
            return false;
        }
@@ -1393,11 +1385,10 @@ static bool needs_cleanup(const lw_shared_ptr<sstables::sstable>& sst,

 future<> column_family::cleanup_sstables(sstables::compaction_descriptor descriptor) {
    dht::token_range_vector r = service::get_local_storage_service().get_local_ranges(_schema->ks_name());
-    auto owned_ranges = make_lw_shared<dht::token_range_vector>(std::move(r));
-    auto sstables_to_cleanup = make_lw_shared<std::vector<sstables::shared_sstable>>(std::move(descriptor.sstables));

-    return do_for_each(*sstables_to_cleanup, [this, owned_ranges = std::move(owned_ranges), sstables_to_cleanup] (auto& sst) {
-        if (!owned_ranges->empty() && !needs_cleanup(sst, owned_ranges, _schema)) {
+  return do_with(std::move(descriptor.sstables), std::move(r), [this] (auto& sstables, auto& owned_ranges) {
+    return do_for_each(sstables, [this, &owned_ranges] (auto& sst) {
+        if (!owned_ranges.empty() && !needs_cleanup(sst, owned_ranges, _schema)) {
           return make_ready_future<>();
        }

@@ -1411,6 +1402,7 @@ future<> column_family::cleanup_sstables(sstables::compaction_descriptor descrip
            return this->compact_sstables(sstables::compaction_descriptor({ sst }, sst->get_sstable_level()), true);
        });
    });
+  });
 }

 // FIXME: this is just an example, should be changed to something more general
@@ -1673,12 +1665,12 @@ template <typename Func>
 static future<> invoke_all_resharding_jobs(global_column_family_ptr cf, std::vector<sstables::resharding_descriptor> jobs, Func&& func) {
    return parallel_for_each(std::move(jobs), [cf, func] (sstables::resharding_descriptor& job) mutable {
        return forward_sstables_to(job.reshard_at, std::move(job.sstables), cf,
-                [func, level = job.level, max_sstable_bytes = job.max_sstable_bytes] (auto sstables) {
-            // used to ensure that only one reshard operation will run per shard.
-            static thread_local semaphore sem(1);
-            return with_semaphore(sem, 1, [func, sstables = std::move(sstables), level, max_sstable_bytes] () mutable {
+                [cf, func, level = job.level, max_sstable_bytes = job.max_sstable_bytes] (auto sstables) {
+            // compaction manager ensures that only one reshard operation will run per shard.
+            auto job = [func, sstables = std::move(sstables), level, max_sstable_bytes] () mutable {
                return func(std::move(sstables), level, max_sstable_bytes);
-            });
+            };
+            return cf->get_compaction_manager().run_resharding_job(&*cf, std::move(job));
        });
    });
 }
@@ -1733,7 +1725,7 @@ void distributed_loader::reshard(distributed<database>& db, sstring ks_name, sst
                        gc_clock::now(), default_io_error_handler_gen());
                    return sst;
                };
-                auto f = sstables::reshard_sstables(sstables, *cf, creator, max_sstable_bytes, level);
+                auto f = sstables::reshard_sstables(sstables, *cf, creator, max_sstable_bytes, level, cf->background_writer_scheduling_group());

                return f.then([&cf, sstables = std::move(sstables)] (std::vector<sstables::shared_sstable> new_sstables) mutable {
                    // an input sstable may belong to shard 1 and 2 and only have data which
@@ -1776,14 +1768,6 @@ void distributed_loader::reshard(distributed<database>& db, sstring ks_name, sst
                            });
                        }
                    });
-                }).then_wrapped([] (future<> f) {
-                    try {
-                        f.get();
-                    } catch (sstables::compaction_stop_exception& e) {
-                        dblog.info("resharding was abruptly stopped, reason: {}", e.what());
-                    } catch (...) {
-                        dblog.error("resharding failed: {}", std::current_exception());
-                    }
                });
            }).get();
        });
@@ -1805,15 +1789,17 @@ future<> distributed_loader::load_new_sstables(distributed<database>& db, sstrin
    }).then([&db, ks, cf] {
        return db.invoke_on_all([ks = std::move(ks), cfname = std::move(cf)] (database& db) {
            auto& cf = db.find_column_family(ks, cfname);
-            // atomically load all opened sstables into column family.
-            for (auto& sst : cf._sstables_opened_but_not_loaded) {
-                cf.load_sstable(sst, true);
-            }
-            cf._sstables_opened_but_not_loaded.clear();
-            cf.trigger_compaction();
-            // Drop entire cache for this column family because it may be populated
-            // with stale data.
-            return cf.get_row_cache().invalidate();
+            return with_semaphore(cf._cache_update_sem, 1, [&cf] {
+                // atomically load all opened sstables into column family.
+                for (auto& sst : cf._sstables_opened_but_not_loaded) {
+                    cf.load_sstable(sst, true);
+                }
+                cf._sstables_opened_but_not_loaded.clear();
+                cf.trigger_compaction();
+                // Drop entire cache for this column family because it may be populated
+                // with stale data.
+                return cf.get_row_cache().invalidate();
+            });
        });
    }).then([&db, ks, cf] () mutable {
        return smp::submit_to(0, [&db, ks = std::move(ks), cf = std::move(cf)] () mutable {
@@ -1989,6 +1975,15 @@ future<> distributed_loader::populate_column_family(distributed<database>& db, s

 }

+inline // Builds the memtable-flush CPU controller: adaptive when cfg.auto_adjust_flush_quota() is set, otherwise disabled.
+flush_cpu_controller
+make_flush_cpu_controller(db::config& cfg, seastar::thread_scheduling_group* backup, std::function<float()> fn) { // fn reports the current dirty level; float matches the controller constructor's parameter
+    if (cfg.auto_adjust_flush_quota()) { // adaptive mode: the controller samples fn every 250ms
+        return flush_cpu_controller(250ms, cfg.virtual_dirty_soft_limit(), std::move(fn));
+    }
+    return flush_cpu_controller(flush_cpu_controller::disabled{backup}); // disabled mode: only `backup` is handed over, fn is not used
+}
+
 utils::UUID database::empty_version = utils::UUID_gen::get_name_UUID(bytes{});

 database::database() : database(db::config())
@@ -2002,6 +1997,10 @@ database::database(const db::config& cfg)
    , _system_dirty_memory_manager(*this, 10 << 20, cfg.virtual_dirty_soft_limit())
    , _dirty_memory_manager(*this, memory::stats().total_memory() * 0.45, cfg.virtual_dirty_soft_limit())
    , _streaming_dirty_memory_manager(*this, memory::stats().total_memory() * 0.10, cfg.virtual_dirty_soft_limit())
+    , _background_writer_scheduling_group(1ms, _cfg->background_writer_scheduling_quota())
+    , _memtable_cpu_controller(make_flush_cpu_controller(*_cfg, &_background_writer_scheduling_group, [this, limit = 2.0f * _dirty_memory_manager.throttle_threshold()] {
+        return (_dirty_memory_manager.virtual_dirty_memory()) / limit;
+    }))
    , _version(empty_version)
    , _enable_incremental_backups(cfg.incremental_backups())
 {
@@ -2011,6 +2010,32 @@ database::database(const db::config& cfg)
    dblog.info("Row: max_vector_size: {}, internal_count: {}", size_t(row::max_vector_size), size_t(row::internal_count));
 }

+void flush_cpu_controller::adjust() { // Recomputes _current_quota from the sampled dirty level using three linear segments.
+    auto mid = _goal + (hard_dirty_limit - _goal) / 2; // halfway between _goal and hard_dirty_limit; NOTE(review): assumes 0 < _goal < hard_dirty_limit so no denominator below is zero - confirm
+
+    auto dirty = _current_dirty();
+    if (dirty < _goal) { // segment 1: linear from 0 (dirty == 0) to q1 (dirty == _goal)
+        _current_quota = dirty * q1 / _goal;
+    } else if ((dirty >= _goal) && (dirty < mid)) { // segment 2: linear from q1 (at _goal) to q2 (at mid)
+        _current_quota = q1 + (dirty - _goal) * (q2 - q1)/(mid - _goal);
+    } else { // segment 3: linear from q2 (at mid) to qmax (at hard_dirty_limit); keeps extrapolating past it, no clamp
+        _current_quota = q2 + (dirty - mid) * (qmax - q2) / (hard_dirty_limit - mid);
+    }
+
+    dblog.trace("dirty {}, goal {}, mid {} quota {}", dirty, _goal, mid, _current_quota);
+    _scheduling_group.update_usage(_current_quota); // hand the freshly computed value to this controller's scheduling group
+}
+
+flush_cpu_controller::flush_cpu_controller(std::chrono::milliseconds interval, float soft_limit, std::function<float()> current_dirty) // Periodic controller: calls adjust() every `interval`.
+    : _goal(soft_limit / 2) // target dirty level is half of soft_limit
+    , _current_dirty(std::move(current_dirty)) // invoked by adjust() to sample the dirty level
+    , _interval(interval)
+    , _update_timer([this] { adjust(); }) // callback captures `this` - NOTE(review): the object must stay at a fixed address while the timer is armed (make_flush_cpu_controller returns it by value); confirm
+    , _scheduling_group(1ms, 0.0f) // NOTE(review): presumably (period, initial usage) - confirm against thread_scheduling_group
+    , _current_scheduling_group(&_scheduling_group) // points at this object's own group
+{
+    _update_timer.arm_periodic(_interval); // start the periodic adjust() calls
+}

 void
 dirty_memory_manager::setup_collectd(sstring namestr) {
@@ -2108,6 +2133,14 @@ database::setup_metrics() {
        sm::make_gauge("queued_reads", [this] { return _read_concurrency_sem.waiters(); },
                       sm::description("Holds the number of currently queued read operations.")),

+        sm::make_gauge("active_reads_streaming", [this] { return max_streaming_concurrent_reads() - _streaming_concurrency_sem.current(); },
+                       sm::description(seastar::format("Holds the number of currently active read operations issued on behalf of streaming "
+                                                       "If this value gets close to {} we are likely to start dropping new read requests. "
+                                                       "In that case sstable_read_queue_overloads is going to get a non-zero value.", max_streaming_concurrent_reads()))),
+
+        sm::make_gauge("queued_reads_streaming", [this] { return _streaming_concurrency_sem.waiters(); },
+                       sm::description("Holds the number of currently queued read operations on behalf of streaming.")),
+
        sm::make_gauge("active_reads_system_keyspace", [this] { return max_system_concurrent_reads() - _system_read_concurrency_sem.current(); },
                       sm::description(seastar::format("Holds the number of currently active read operations from \"system\" keyspace tables. "
                                                       "If this vlaue gets close to {} we are likely to start dropping new read requests. "
@@ -2119,6 +2152,9 @@ database::setup_metrics() {
        sm::make_gauge("total_result_bytes", [this] { return get_result_memory_limiter().total_used_memory(); },
                       sm::description("Holds the current amount of memory used for results.")),

+        sm::make_gauge("cpu_flush_quota", [this] { return _memtable_cpu_controller.current_quota(); },
+                             sm::description("The current quota for memtable CPU scheduling group")),
+
        sm::make_derive("short_data_queries", _stats->short_data_queries,
                       sm::description("The rate of data queries (data or digest reads) that returned less rows than requested due to result size limiting.")),

@@ -2330,7 +2366,7 @@ database::init_commitlog() {
                _commitlog->discard_completed_segments(id);
                return;
            }
-            _column_families[id]->flush(pos);
+            _column_families[id]->flush();
        }).release(); // we have longer life time than CL. Ignore reg anchor
    });
 }
@@ -2444,12 +2480,12 @@ void database::remove(const column_family& cf) {
    }
 }

-future<> database::drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func tsf) {
+future<> database::drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func tsf, bool snapshot) {
    auto uuid = find_uuid(ks_name, cf_name);
    auto cf = _column_families.at(uuid);
    remove(*cf);
    auto& ks = find_keyspace(ks_name);
-    return truncate(ks, *cf, std::move(tsf)).then([this, cf] {
+    return truncate(ks, *cf, std::move(tsf), snapshot).then([this, cf] {
        return cf->stop();
    }).then([this, cf] {
        return make_ready_future<>();
@@ -2589,6 +2625,9 @@ keyspace::make_column_family_config(const schema& s, const db::config& db_config
    cfg.streaming_read_concurrency_config = _config.streaming_read_concurrency_config;
    cfg.cf_stats = _config.cf_stats;
    cfg.enable_incremental_backups = _config.enable_incremental_backups;
+    cfg.background_writer_scheduling_group = _config.background_writer_scheduling_group;
+    cfg.memtable_scheduling_group = _config.memtable_scheduling_group;
+    cfg.enable_metrics_reporting = db_config.enable_keyspace_column_family_metrics();

    return cfg;
 }
@@ -3035,7 +3074,7 @@ void column_family::apply_streaming_big_mutation(schema_ptr m_schema, utils::UUI
 void
 column_family::check_valid_rp(const db::replay_position& rp) const {
    if (rp != db::replay_position() && rp < _lowest_allowed_rp) {
-        throw replay_position_reordered_exception();
+        throw mutation_reordered_with_truncate_exception();
    }
 }

@@ -3079,10 +3118,6 @@ lw_shared_ptr<memtable> memtable_list::new_memtable() {
 }

 future<> dirty_memory_manager::flush_one(memtable_list& mtlist, semaphore_units<> permit) {
-    if (mtlist.back()->empty()) {
-        return make_ready_future<>();
-    }
-
    auto* region = &(mtlist.back()->region());
    auto schema = mtlist.back()->schema();

@@ -3185,25 +3220,24 @@ future<mutation> database::apply_counter_update(schema_ptr s, const frozen_mutat
    }
 }

+// Resolves to a ready future for mutation_reordered_with_truncate_exception; rethrows anything else.
+static future<> maybe_handle_reorder(std::exception_ptr exp) {
+    try {
+        std::rethrow_exception(exp);
+    } catch (const mutation_reordered_with_truncate_exception&) {
+        // This mutation raced with a truncate, so we can just drop it.
+        dblog.debug("replay_position reordering detected");
+        return make_ready_future<>();
+    }
+}
+
 future<> database::apply_with_commitlog(column_family& cf, const mutation& m, timeout_clock::time_point timeout) {
    if (cf.commitlog() != nullptr) {
        return do_with(freeze(m), [this, &m, &cf, timeout] (frozen_mutation& fm) {
            commitlog_entry_writer cew(m.schema(), fm);
            return cf.commitlog()->add_entry(m.schema()->id(), cew, timeout);
        }).then([this, &m, &cf, timeout] (db::rp_handle h) {
-            return apply_in_memory(m, cf, std::move(h), timeout).handle_exception([this, &cf, &m, timeout] (auto ep) {
-                try {
-                    std::rethrow_exception(ep);
-                } catch (replay_position_reordered_exception&) {
-                    // expensive, but we're assuming this is super rare.
-                    // if we failed to apply the mutation due to future re-ordering
-                    // (which should be the ever only reason for rp mismatch in CF)
-                    // let's just try again, add the mutation to the CL once more,
-                    // and assume success in inevitable eventually.
-                    dblog.debug("replay_position reordering detected");
-                    return this->apply_with_commitlog(cf, m, timeout);
-                }
-            });
+            return apply_in_memory(m, cf, std::move(h), timeout).handle_exception(maybe_handle_reorder);
        });
    }
    return apply_in_memory(m, cf, {}, timeout);
@@ -3214,19 +3248,7 @@ future<> database::apply_with_commitlog(schema_ptr s, column_family& cf, utils::
    if (cl != nullptr) {
        commitlog_entry_writer cew(s, m);
        return cf.commitlog()->add_entry(uuid, cew, timeout).then([&m, this, s, timeout, cl](db::rp_handle h) {
-            return this->apply_in_memory(m, s, std::move(h), timeout).handle_exception([this, s, &m, timeout] (auto ep) {
-                try {
-                    std::rethrow_exception(ep);
-                } catch (replay_position_reordered_exception&) {
-                    // expensive, but we're assuming this is super rare.
-                    // if we failed to apply the mutation due to future re-ordering
-                    // (which should be the ever only reason for rp mismatch in CF)
-                    // let's just try again, add the mutation to the CL once more,
-                    // and assume success in inevitable eventually.
-                    dblog.debug("replay_position reordering detected");
-                    return this->apply(s, m, timeout);
-                }
-            });
+            return this->apply_in_memory(m, s, std::move(h), timeout).handle_exception(maybe_handle_reorder);
        });
    }
    return apply_in_memory(m, std::move(s), {}, timeout);
@@ -3317,10 +3339,17 @@ database::make_keyspace_config(const keyspace_metadata& ksm) {
        ++_stats->sstable_read_queue_overloaded;
        throw std::runtime_error("sstable inactive read queue overloaded");
    };
-    cfg.streaming_read_concurrency_config = cfg.read_concurrency_config;
-    cfg.streaming_read_concurrency_config.timeout = {};
+    // No timeouts or queue length limits - a failure here can kill an entire repair.
+    // Trust the caller to limit concurrency.
+    cfg.streaming_read_concurrency_config.sem = &_streaming_concurrency_sem;
    cfg.cf_stats = &_cf_stats;
    cfg.enable_incremental_backups = _enable_incremental_backups;
+
+    if (_cfg->background_writer_scheduling_quota() < 1.0f) {
+        cfg.background_writer_scheduling_group = &_background_writer_scheduling_group;
+        cfg.memtable_scheduling_group = _memtable_cpu_controller.scheduling_group();
+    }
+    cfg.enable_metrics_reporting = _cfg->enable_keyspace_column_family_metrics();
    return cfg;
 }

@@ -3444,10 +3473,10 @@ future<> database::truncate(sstring ksname, sstring cfname, timestamp_func tsf)
    return truncate(ks, cf, std::move(tsf));
 }

-future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_func tsf)
+future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_func tsf, bool with_snapshot)
 {
    const auto durable = ks.metadata()->durable_writes();
-    const auto auto_snapshot = get_config().auto_snapshot();
+    const auto auto_snapshot = with_snapshot && get_config().auto_snapshot();

    // Force mutations coming in to re-acquire higher rp:s
    // This creates a "soft" ordering, in that we will guarantee that
@@ -3774,35 +3803,6 @@ future<std::unordered_map<sstring, column_family::snapshot_details>> column_fami
 }

 future<> column_family::flush() {
-    _stats.pending_flushes++;
-
-    // highest_flushed_rp is only updated when we flush. If the memtable is currently alive, then
-    // the most up2date replay position is the one that's in there now. Otherwise, if the memtable
-    // hasn't received any writes yet, that's the one from the last flush we made.
-    auto desired_rp = _memtables->back()->empty() ? _highest_flushed_rp : _memtables->back()->replay_position();
-    return _memtables->request_flush().finally([this, desired_rp] {
-        _stats.pending_flushes--;
-        // In origin memtable_switch_count is incremented inside
-        // ColumnFamilyMeetrics Flush.run
-        _stats.memtable_switch_count++;
-        // wait for all up until us.
-        return _flush_queue->wait_for_pending(desired_rp);
-    });
-}
-
-future<> column_family::flush(const db::replay_position& pos) {
-    // Technically possible if we've already issued the
-    // sstable write, but it is not done yet.
-    if (pos < _highest_flushed_rp) {
-        return make_ready_future<>();
-    }
-
-    // TODO: Origin looks at "secondary" memtables
-    // It also consideres "minReplayPosition", which is simply where
-    // the CL "started" (the first ever RP in this run).
-    // We ignore this for now and just say that if we're asked for
-    // a CF and it exists, we pretty much have to have data that needs
-    // flushing. Let's do it.
    return _memtables->request_flush();
 }

@@ -3824,12 +3824,14 @@ future<> column_family::flush_streaming_mutations(utils::UUID plan_id, dht::part
            return _streaming_memtables->seal_active_memtable(memtable_list::flush_behavior::delayed).then([this] {
                return _streaming_flush_phaser.advance_and_await();
            }).then([this, sstables = std::move(sstables), ranges = std::move(ranges)] () mutable {
-                for (auto&& sst : sstables) {
-                    // seal_active_streaming_memtable_big() ensures sst is unshared.
-                    this->add_sstable(sst, {engine().cpu_id()});
-                }
-                this->trigger_compaction();
-                return _cache.invalidate(std::move(ranges));
+                return with_semaphore(_cache_update_sem, 1, [this, sstables = std::move(sstables), ranges = std::move(ranges)] () mutable {
+                    for (auto&& sst : sstables) {
+                        // seal_active_streaming_memtable_big() ensures sst is unshared.
+                        this->add_sstable(sst, {engine().cpu_id()});
+                    }
+                    this->trigger_compaction();
+                    return _cache.invalidate(std::move(ranges));
+                });
            });
        });
    });
@@ -4092,9 +4094,9 @@ column_family::cache_hit_rate column_family::get_hit_rate(gms::inet_address addr
    if (it == _cluster_cache_hit_rates.end()) {
        // no data yet, get it from the gossiper
        auto& gossiper = gms::get_local_gossiper();
-        auto eps = gossiper.get_endpoint_state_for_endpoint(addr);
+        auto* eps = gossiper.get_endpoint_state_for_endpoint_ptr(addr);
        if (eps) {
-            auto state = eps->get_application_state(gms::application_state::CACHE_HITRATES);
+            auto* state = eps->get_application_state_ptr(gms::application_state::CACHE_HITRATES);
            float f = -1.0f; // missing state means old node
            if (state) {
                sstring me = sprint("%s.%s", _schema->ks_name(), _schema->cf_name());
@@ -4119,11 +4121,12 @@ void column_family::drop_hit_rate(gms::inet_address addr) {
 }

 future<>
-write_memtable_to_sstable(memtable& mt, sstables::shared_sstable sst, bool backup, const io_priority_class& pc, bool leave_unsealed) {
+write_memtable_to_sstable(memtable& mt, sstables::shared_sstable sst, bool backup, const io_priority_class& pc, bool leave_unsealed, seastar::thread_scheduling_group *tsg) {
    sstables::sstable_writer_config cfg;
    cfg.replay_position = mt.replay_position();
    cfg.backup = backup;
    cfg.leave_unsealed = leave_unsealed;
+    cfg.thread_scheduling_group = tsg;
    return sst->write_components(mt.make_flush_reader(mt.schema(), pc), mt.partition_count(), mt.schema(), cfg, pc);
 }

--- a/database.hh
+++ b/database.hh
@@ -77,6 +77,8 @@
 #include <boost/intrusive/parent_from_member.hpp>
 #include "db/view/view.hh"
 #include "lister.hh"
+#include "utils/phased_barrier.hh"
+#include "cpu_controller.hh"

 class cell_locker;
 class cell_locker_stats;
@@ -114,7 +116,7 @@ void make(database& db, bool durable, bool volatile_testing_only);
 }
 }

-class replay_position_reordered_exception : public std::exception {};
+class mutation_reordered_with_truncate_exception : public std::exception {};

 using shared_memtable = lw_shared_ptr<memtable>;
 class memtable_list;
@@ -429,6 +431,9 @@ public:
        restricted_mutation_reader_config read_concurrency_config;
        restricted_mutation_reader_config streaming_read_concurrency_config;
        ::cf_stats* cf_stats = nullptr;
+        seastar::thread_scheduling_group* background_writer_scheduling_group = nullptr;
+        seastar::thread_scheduling_group* memtable_scheduling_group = nullptr;
+        bool enable_metrics_reporting = false;
    };
    struct no_commitlog {};
    struct stats {
@@ -538,7 +543,6 @@ private:
    mutable row_cache _cache; // Cache covers only sstables.
    std::experimental::optional<int64_t> _sstable_generation = {};

-    db::replay_position _highest_flushed_rp;
    db::replay_position _highest_rp;
    db::replay_position _lowest_allowed_rp;

@@ -546,15 +550,7 @@ private:
    db::commitlog* _commitlog;
    compaction_manager& _compaction_manager;
    int _compaction_disabled = 0;
-    class memtable_flush_queue;
-    std::unique_ptr<memtable_flush_queue> _flush_queue;
-    // Because streaming mutations bypass the commitlog, there is
-    // no need for the complications of the flush queue. Besides, it
-    // is easier to just use a common gate than it is to modify the flush_queue
-    // to work both with and without a replay position.
-    //
-    // Last but not least, we seldom need to guarantee any ordering here: as long
-    // as all data is waited for, we're good.
+    utils::phased_barrier _flush_barrier;
    seastar::gate _streaming_flush_gate;
    std::vector<view_ptr> _views;
    semaphore _cache_update_sem{1};
@@ -753,7 +749,6 @@ public:
    void start();
    future<> stop();
    future<> flush();
-    future<> flush(const db::replay_position&);
    future<> flush_streaming_mutations(utils::UUID plan_id, dht::partition_range_vector ranges = dht::partition_range_vector{});
    future<> fail_streaming_mutations(utils::UUID plan_id);
    future<> clear(); // discards memtable(s) without flushing them to disk.
@@ -864,6 +859,10 @@ public:
        return _config.cf_stats;
    }

+    seastar::thread_scheduling_group* background_writer_scheduling_group() {
+        return _config.background_writer_scheduling_group;
+    }
+
    compaction_manager& get_compaction_manager() const {
        return _compaction_manager;
    }
@@ -1072,6 +1071,9 @@ public:
        restricted_mutation_reader_config read_concurrency_config;
        restricted_mutation_reader_config streaming_read_concurrency_config;
        ::cf_stats* cf_stats = nullptr;
+        seastar::thread_scheduling_group* background_writer_scheduling_group = nullptr;
+        seastar::thread_scheduling_group* memtable_scheduling_group = nullptr;
+        bool enable_metrics_reporting = false;
    };
 private:
    std::unique_ptr<locator::abstract_replication_strategy> _replication_strategy;
@@ -1154,6 +1156,7 @@ public:
 private:
    ::cf_stats _cf_stats;
    static constexpr size_t max_concurrent_reads() { return 100; }
+    static constexpr size_t max_streaming_concurrent_reads() { return 10; } // They're rather heavyweight, so limit more
    static constexpr size_t max_system_concurrent_reads() { return 10; }
    static constexpr size_t max_concurrent_sstable_loads() { return 3; }
    struct db_stats {
@@ -1177,7 +1180,11 @@ private:
    dirty_memory_manager _dirty_memory_manager;
    dirty_memory_manager _streaming_dirty_memory_manager;

+    seastar::thread_scheduling_group _background_writer_scheduling_group;
+    flush_cpu_controller _memtable_cpu_controller;
+
    semaphore _read_concurrency_sem{max_concurrent_reads()};
+    semaphore _streaming_concurrency_sem{max_streaming_concurrent_reads()};
    restricted_mutation_reader_config _read_concurrency_config;
    semaphore _system_read_concurrency_sem{max_system_concurrent_reads()};
    restricted_mutation_reader_config _system_read_concurrency_config;
@@ -1332,10 +1339,10 @@ public:

    /** Truncates the given column family */
    future<> truncate(sstring ksname, sstring cfname, timestamp_func);
-    future<> truncate(const keyspace& ks, column_family& cf, timestamp_func);
+    future<> truncate(const keyspace& ks, column_family& cf, timestamp_func, bool with_snapshot = true);

    bool update_column_family(schema_ptr s);
-    future<> drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func);
+    future<> drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func, bool with_snapshot = true);
    void remove(const column_family&);

    const logalloc::region_group& dirty_memory_region_group() const {
--- a/db/batchlog_manager.hh
+++ b/db/batchlog_manager.hh
@@ -84,9 +84,6 @@ public:
    // to be per shard and does no dispatching beyond delegating the the
    // shard qp (which is what you feed here).
    batchlog_manager(cql3::query_processor&);
-    batchlog_manager(distributed<cql3::query_processor>& qp)
-        : batchlog_manager(qp.local())
-    {}

    future<> start();
    future<> stop();
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -511,6 +511,7 @@ public:
        if (shutdown) {
            auto me = shared_from_this();
            return _gate.close().then([me] {
+                me->_closed = true;
                return me->sync().finally([me] {
                    // When we get here, nothing should add ops,
                    // and we should have waited out all pending.
@@ -1319,6 +1320,7 @@ future<> db::commitlog::segment_manager::shutdown() {
                return _gate.close().then(std::bind(&segment_manager::sync_all_segments, this, true));
            });
        }).finally([this] {
+            discard_unused_segments();
            // Now that the gate is closed and requests completed we are sure nobody else will pop()
            return clear_reserve_segments().finally([this] {
                return std::move(_reserve_replenisher).then_wrapped([this] (auto f) {
--- a/db/config.hh
+++ b/db/config.hh
@@ -166,6 +166,12 @@ public:
     */

 #define _make_config_values(val)                \
+    val(background_writer_scheduling_quota, double, 1.0, Used, \
+            "max cpu usage ratio (between 0 and 1) for compaction process. Not intended for setting in normal operations. Setting it to 1 or higher will disable it, recommended operational setting is 0.5." \
+    )   \
+    val(auto_adjust_flush_quota, bool, false, Used, \
+            "true: auto-adjust quota for flush processes. false: put everyone together in the static background writer group - if background writer group is enabled. Not intended for setting in normal operations" \
+    )   \
    /* Initialization properties */             \
    /* The minimal properties needed for configuring a cluster. */  \
    val(cluster_name, sstring, "Test Cluster", Used,   \
@@ -330,7 +336,7 @@ public:
    val(sstable_preemptive_open_interval_in_mb, uint32_t, 50, Unused,     \
            "When compacting, the replacement opens SSTables before they are completely written and uses in place of the prior SSTables for any range previously written. This setting helps to smoothly transfer reads between the SSTables by reducing page cache churn and keeps hot rows hot."  \
    )                                                   \
-    val(defragment_memory_on_idle, bool, true, Used, "Set to true to defragment memory when the cpu is idle.  This reduces the amount of work Scylla performs when processing client requests.") \
+    val(defragment_memory_on_idle, bool, false, Used, "When set to true, will defragment memory when the cpu is idle.  This reduces the amount of work Scylla performs when processing client requests.") \
    /* Memtable settings */ \
    val(memtable_allocation_type, sstring, "heap_buffers", Invalid,     \
            "Specify the way Cassandra allocates and manages memtable memory. See Off-heap memtables in Cassandra 2.1. Options are:\n"  \
@@ -754,6 +760,9 @@ public:
    val(replace_address_first_boot, sstring, "", Used, "Like replace_address option, but if the node has been bootstrapped successfully it will be ignored. Same as -Dcassandra.replace_address_first_boot.") \
    val(override_decommission, bool, false, Used, "Set true to force a decommissioned node to join the cluster") \
    val(ring_delay_ms, uint32_t, 30 * 1000, Used, "Time a node waits to hear from other nodes before joining the ring in milliseconds. Same as -Dcassandra.ring_delay_ms in cassandra.") \
+    val(shadow_round_ms, uint32_t, 300 * 1000, Used, "The maximum gossip shadow round time. Can be used to reduce the gossip feature check time during node boot up.") \
+    val(fd_max_interval_ms, uint32_t, 2 * 1000, Used, "The maximum failure_detector interval time in milliseconds. Interval larger than the maximum will be ignored. Larger cluster may need to increase the default.") \
+    val(fd_initial_value_ms, uint32_t, 2 * 1000, Used, "The initial failure_detector interval time in milliseconds.") \
    val(shutdown_announce_in_ms, uint32_t, 2 * 1000, Used, "Time a node waits after sending gossip shutdown message in milliseconds. Same as -Dcassandra.shutdown_announce_in_ms in cassandra.") \
    val(developer_mode, bool, false, Used, "Relax environment checks. Setting to true can reduce performance and reliability significantly.") \
    val(skip_wait_for_gossip_to_settle, int32_t, -1, Used, "An integer to configure the wait for gossip to settle. -1: wait normally, 0: do not wait at all, n: wait for at most n polls. Same as -Dcassandra.skip_wait_for_gossip_to_settle in cassandra.") \
@@ -765,6 +774,7 @@ public:
    val(abort_on_lsa_bad_alloc, bool, false, Used, "Abort when allocation in LSA region fails") \
    val(murmur3_partitioner_ignore_msb_bits, unsigned, 0, Used, "Number of most siginificant token bits to ignore in murmur3 partitioner; increase for very large clusters") \
    val(virtual_dirty_soft_limit, double, 0.6, Used, "Soft limit of virtual dirty memory expressed as a portion of the hard limit") \
+    val(enable_keyspace_column_family_metrics, bool, false, Used, "Enable per keyspace and per column family metrics reporting") \
    /* done! */

 #define _make_value_member(name, type, deflt, status, desc, ...)    \
--- a/db/consistency_level.hh
+++ b/db/consistency_level.hh
@@ -162,6 +162,14 @@ inline void assure_sufficient_live_nodes(
        const PendingRange& pending_endpoints = std::array<gms::inet_address, 0>()) {
    size_t need = block_for(ks, cl);

+    auto adjust_live_for_error = [] (size_t live, size_t pending) {
+        // DowngradingConsistencyRetryPolicy uses the live replica count from the Unavailable
+        // exception to adjust the CL for a retry. When a pending node is present, the CL is
+        // increased by 1 internally, so the reported number of live nodes has to be adjusted
+        // to take this into account.
+        return pending <= live ? live - pending : 0;
+    };
+
    switch (cl) {
    case consistency_level::ANY:
        // local hint is acceptable, and local node is always live
@@ -176,7 +184,7 @@ inline void assure_sufficient_live_nodes(
        size_t pending = count_local_endpoints(pending_endpoints);
        if (local_live < need + pending) {
            cl_logger.debug("Local replicas {} are insufficient to satisfy LOCAL_QUORUM requirement of needed {} and pending {}", live_endpoints, local_live, pending);
-            throw exceptions::unavailable_exception(cl, need, local_live);
+            throw exceptions::unavailable_exception(cl, need, adjust_live_for_error(local_live, pending));
        }
        break;
    }
@@ -190,7 +198,7 @@ inline void assure_sufficient_live_nodes(
        size_t pending = pending_endpoints.size();
        if (live < need + pending) {
            cl_logger.debug("Live nodes {} do not satisfy ConsistencyLevel ({} required, {} pending)", live, need, pending);
-            throw exceptions::unavailable_exception(cl, need, live);
+            throw exceptions::unavailable_exception(cl, need, adjust_live_for_error(live, pending));
        }
        break;
    }
--- a/db/legacy_schema_migrator.cc
+++ b/db/legacy_schema_migrator.cc
@@ -66,8 +66,8 @@ class migrator {
 public:
    static const std::unordered_set<sstring> legacy_schema_tables;

-    migrator(cql3::query_processor& qp)
-                    : _qp(qp) {
+    migrator(sharded<service::storage_proxy>& sp, cql3::query_processor& qp)
+                    : _sp(sp), _qp(qp) {
    }
    migrator(migrator&&) = default;

@@ -147,15 +147,18 @@ public:
        auto cq = fmt_query(fmt, db::system_keyspace::legacy::COLUMNS);
        auto zq = fmt_query(fmt, db::system_keyspace::legacy::TRIGGERS);

-        typedef std::tuple<future<result_set_type>, future<result_set_type>, future<result_set_type>> result_tuple;
+        typedef std::tuple<future<result_set_type>, future<result_set_type>, future<result_set_type>, future<db::schema_tables::legacy::schema_mutations>> result_tuple;

        return when_all(_qp.execute_internal(tq, { dst.name, cf_name }),
                        _qp.execute_internal(cq, { dst.name, cf_name }),
-                        _qp.execute_internal(zq, { dst.name, cf_name })).then([this, &dst, cf_name, timestamp](result_tuple&& t) {
+                        _qp.execute_internal(zq, { dst.name, cf_name }),
+                        db::schema_tables::legacy::read_table_mutations(_sp, dst.name, cf_name, db::system_keyspace::legacy::column_families()))
+                    .then([this, &dst, cf_name, timestamp](result_tuple&& t) {

            result_set_type tables = std::get<0>(t).get0();
            result_set_type columns = std::get<1>(t).get0();
            result_set_type triggers = std::get<2>(t).get0();
+            db::schema_tables::legacy::schema_mutations sm = std::get<3>(t).get0();

            row_type& td = tables->one();

@@ -165,6 +168,8 @@ public:

            schema_builder builder(dst.name, cf_name, id);

+            builder.with_version(sm.digest());
+
            cf_type cf = sstring_to_cf_type(td.get_or("type", sstring("standard")));
            if (cf == cf_type::super) {
                fail(unimplemented::cause::SUPER);
@@ -183,6 +188,7 @@ public:
                if (default_validator->is_counter()) {
                    builder.set_is_counter(true);
                }
+                builder.set_default_validation_class(default_validator);
            }

            /*
@@ -191,10 +197,8 @@ public:
             * but we can trust is_dense value of false.
             */
            auto is_dense = td.get_opt<bool>("is_dense");
-            if (is_dense && !*is_dense) {
-                builder.set_is_dense(false);
-            } else {
-                auto calulated_is_dense = [&] {
+            if (!is_dense || *is_dense) {
+                is_dense = [&] {
                    /*
                     * As said above, this method is only here because we need to deal with thrift upgrades.
                     * Once a CF has been "upgraded", i.e. we've rebuilt and save its CQL3 metadata at least once,
@@ -252,40 +256,48 @@ public:
                        return comparator.compare(off, end - off, utf8_type->name()) == 0;
                    };

-                    if (regular) {
-                        auto name = regular->get_or("column_name", bytes());
-                        // This is a lame attempt at determining if this was in fact a compact_value column
-                        if (!max_cl_idx || (!name.empty() && name != to_bytes("value"))
-                                        || db::schema_tables::parse_type(regular->get_as<sstring>("type")) != default_validator) {
-                            return false;
-                        }
-                        // Ok, we will assume this was in fact a (scylla-created) compact value.
-                    }
-
                    if (max_cl_idx) {
                        auto n = std::count(comparator.begin(), comparator.end(), ','); // num comp - 1
                        return *max_cl_idx == n;
                    }

+                    if (regular) {
+                        return false;
+                    }
+
                    return !is_cql3_only_pk_comparator(comparator);

                }();

-                builder.set_is_dense(calulated_is_dense);
-
                // now, if switched to sparse, remove redundant compact_value column and the last clustering column,
                // directly copying CASSANDRA-11502 logic. See CASSANDRA-11315.

-                filter_sparse = !calulated_is_dense && is_dense.value_or(true);
+                filter_sparse = !*is_dense;
            }
+            builder.set_is_dense(*is_dense);
+
+            auto is_cql = !*is_dense && is_compound;
+            auto is_static_compact = !*is_dense && !is_compound;
+
+            // org.apache.cassandra.schema.LegacySchemaMigrator#isEmptyCompactValueColumn
+            auto is_empty_compact_value = [](const cql3::untyped_result_set::row& column_row) {
+                auto kind_str = column_row.get_as<sstring>("type");
+                // Cassandra only checks for "compact_value", but Scylla generates "regular" instead (#2586)
+                return (kind_str == "compact_value" || kind_str == "regular")
+                       && column_row.get_as<sstring>("column_name").empty();
+            };

            for (auto& row : *columns) {
                auto kind_str = row.get_as<sstring>("type");
                auto kind = db::schema_tables::deserialize_kind(kind_str);
                auto component_index = kind > column_kind::clustering_key ? 0 : column_id(row.get_or("component_index", 0));
-                auto name = row.get_or("column_name", bytes());
+                auto name = row.get_or<sstring>("column_name", sstring());
                auto validator = db::schema_tables::parse_type(row.get_as<sstring>("validator"));

+                if (is_empty_compact_value(row)) {
+                    continue;
+                }
+
                if (filter_sparse) {
                    if (kind_str == "compact_value") {
                        continue;
@@ -329,7 +341,7 @@ public:
                            type = "VALUES";
                        }
                    }
-                    auto column = cql3::util::maybe_quote(utf8_type->to_string(name));
+                    auto column = cql3::util::maybe_quote(name);
                    options["target"] = validator->is_collection()
                                    ? type + "(" + column + ")"
                                    : column;
@@ -339,7 +351,26 @@ public:
                    builder.with_index(index_metadata(index_name, options, *index_kind));
                }

-                builder.with_column(std::move(name), std::move(validator), kind, component_index);
+                data_type column_name_type = [&] {
+                    if (is_static_compact && kind == column_kind::regular_column) {
+                        return db::schema_tables::parse_type(comparator);
+                    }
+                    return utf8_type;
+                }();
+                auto column_name = [&] {
+                    try {
+                        return column_name_type->from_string(name);
+                    } catch (const marshal_exception&) {
+                        // #2597: Scylla < 2.0 writes names in serialized form, try to recover
+                        column_name_type->validate(to_bytes_view(name));
+                        return to_bytes(name);
+                    }
+                }();
+                builder.with_column(std::move(column_name), std::move(validator), kind, component_index);
+            }
+
+            if (is_static_compact) {
+                builder.set_regular_column_name_type(db::schema_tables::parse_type(comparator));
            }

            if (td.has("read_repair_chance")) {
@@ -414,8 +445,6 @@ public:
                throw unsupported_feature("triggers");
            }

-            // TODO: table upgrades as in origin converter.
-
            dst.tables.emplace_back(table{timestamp, builder.build() });
        });
    }
@@ -517,21 +546,13 @@ public:
        });
    }

-    future<> unload_legacy_tables() {
-        return _qp.db().invoke_on_all([](database& db) {
-            for (auto& cfname : legacy_schema_tables) {
-                auto& cf = db.find_column_family(db::system_keyspace::NAME, cfname);
-                db.remove(cf);
-            }
-        });
-    }
-
-    future<> truncate_legacy_tables() {
-        mlogger.info("Truncating legacy schema tables");
-        return do_with(utils::make_joinpoint([] { return db_clock::now();}),[this](auto& tsf) {
-            return _qp.db().invoke_on_all([&tsf](database& db) {
-                return parallel_for_each(legacy_schema_tables, [&db, &tsf](const sstring& cfname) {
-                    return db.truncate(db::system_keyspace::NAME, cfname, [&tsf] { return tsf.value(); });
+    future<> drop_legacy_tables() {
+        mlogger.info("Dropping legacy schema tables");
+        return parallel_for_each(legacy_schema_tables, [this](const sstring& cfname) {
+            return do_with(utils::make_joinpoint([] { return db_clock::now();}),[this, cfname](auto& tsf) {
+                auto with_snapshot = !_keyspaces.empty();
+                return _qp.db().invoke_on_all([&tsf, cfname, with_snapshot](database& db) {
+                    return db.drop_column_family(db::system_keyspace::NAME, cfname, [&tsf] { return tsf.value(); }, with_snapshot);
                });
            });
        });
@@ -590,18 +611,15 @@ public:

    future<> migrate() {
        return read_all_keyspaces().then([this]() {
-            if (_keyspaces.empty()) {
-                return unload_legacy_tables();
-            }
            // write metadata to the new schema tables
            return store_keyspaces_in_new_schema_tables().then(std::bind(&migrator::migrate_indexes, this))
                                                .then(std::bind(&migrator::flush_schemas, this))
-                                                .then(std::bind(&migrator::truncate_legacy_tables, this))
-                                                .then(std::bind(&migrator::unload_legacy_tables, this))
+                                                .then(std::bind(&migrator::drop_legacy_tables, this))
                                                .then([] { mlogger.info("Completed migration of legacy schema tables"); });
        });
    }

+    sharded<service::storage_proxy>& _sp;
    cql3::query_processor& _qp;
    std::vector<keyspace> _keyspaces;
 };
@@ -620,7 +638,7 @@ const std::unordered_set<sstring> migrator::legacy_schema_tables = {
 }

 future<>
-db::legacy_schema_migrator::migrate(cql3::query_processor& qp) {
-    return do_with(migrator(qp), std::bind(&migrator::migrate, std::placeholders::_1));
+db::legacy_schema_migrator::migrate(sharded<service::storage_proxy>& sp, cql3::query_processor& qp) {
+    return do_with(migrator(sp, qp), std::bind(&migrator::migrate, std::placeholders::_1));
 }

--- a/db/legacy_schema_migrator.hh
+++ b/db/legacy_schema_migrator.hh
@@ -48,10 +48,14 @@ namespace cql3 {
 class query_processor;
 }

+namespace service {
+class storage_proxy;
+}
+
 namespace db {
 namespace legacy_schema_migrator {

-future<> migrate(cql3::query_processor&);
+future<> migrate(sharded<service::storage_proxy>&, cql3::query_processor&);

 }
 }
--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -64,7 +64,11 @@
 #include "db/config.hh"
 #include "md5_hasher.hh"

+#include <seastar/util/noncopyable_function.hh>
+
+#include <boost/algorithm/string/predicate.hpp>
 #include <boost/range/algorithm/copy.hpp>
+#include <boost/range/algorithm/transform.hpp>
 #include <boost/range/adaptor/map.hpp>
 #include <boost/range/join.hpp>

@@ -82,6 +86,8 @@ namespace schema_tables {

 logging::logger slogger("schema_tables");

+const sstring version = "3";
+
 struct push_back_and_return {
    std::vector<mutation> muts;

@@ -123,7 +129,11 @@ static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
    std::map<qualified_name, schema_mutations>&& views_before,
    std::map<qualified_name, schema_mutations>&& views_after);

-static void merge_types(distributed<service::storage_proxy>& proxy,
+struct user_types_to_drop final {
+    seastar::noncopyable_function<void()> drop; // deferred removal of dropped user types; do_merge_schema() runs it after merge_tables_and_views()
+};
+
+static user_types_to_drop merge_types(distributed<service::storage_proxy>& proxy,
    schema_result&& before,
    schema_result&& after);

@@ -149,8 +159,8 @@ static void add_index_to_schema_mutation(schema_ptr table,
                const index_metadata& index, api::timestamp_type timestamp,
                mutation& mutation);

-static void drop_column_from_schema_mutation(schema_ptr,
-                const column_definition&, long timestamp,
+static void drop_column_from_schema_mutation(schema_ptr schema_table, schema_ptr table,
+                const sstring& column_name, long timestamp,
                std::vector<mutation>&);

 static void drop_index_from_schema_mutation(schema_ptr table,
@@ -165,13 +175,12 @@ static void prepare_builder_from_table_row(schema_builder&, const query::result_

 using namespace v3;

-std::vector<const char*> ALL { KEYSPACES, TABLES, COLUMNS, DROPPED_COLUMNS, TRIGGERS, VIEWS, TYPES, FUNCTIONS, AGGREGATES, INDEXES };
+std::vector<const char*> ALL { KEYSPACES, TABLES, SCYLLA_TABLES, COLUMNS, DROPPED_COLUMNS, TRIGGERS, VIEWS, TYPES, FUNCTIONS, AGGREGATES, INDEXES };

 using days = std::chrono::duration<int, std::ratio<24 * 3600>>;

-/** add entries to system.schema_* for the hardcoded system definitions */
-future<> save_system_keyspace_schema() {
-    auto& ks = db::qctx->db().find_keyspace(NAME);
+future<> save_system_schema(const sstring & ksname) {
+    auto& ks = db::qctx->db().find_keyspace(ksname);
    auto ksm = ks.metadata();

    // delete old, possibly obsolete entries in schema tables
@@ -185,6 +194,11 @@ future<> save_system_keyspace_schema() {
    });
 }

+/** add entries to system_schema.* for the hardcoded system definitions */
+future<> save_system_keyspace_schema() {
+    return save_system_schema(NAME);
+}
+
 namespace v3 {

 static constexpr auto schema_gc_grace = std::chrono::duration_cast<std::chrono::seconds>(days(7)).count();
@@ -256,6 +270,21 @@ schema_ptr tables() {
    return schema;
 }

+// Schema of system_schema.scylla_tables, which holds Scylla-specific table metadata (the per-table "version" UUID).
+schema_ptr scylla_tables() {
+    static thread_local auto schema = [] {
+        auto id = generate_legacy_id(NAME, SCYLLA_TABLES);
+        return schema_builder(NAME, SCYLLA_TABLES, stdx::make_optional(id))
+            .with_column("keyspace_name", utf8_type, column_kind::partition_key)
+            .with_column("table_name", utf8_type, column_kind::clustering_key)
+            .with_column("version", uuid_type)
+            .set_gc_grace_seconds(schema_gc_grace)
+            .with_version(generate_schema_version(id))
+            .build();
+    }();
+    return schema;
+}
+
 schema_ptr columns() {
    static thread_local auto schema = [] {
        schema_builder builder(make_lw_shared(::schema(generate_legacy_id(NAME, COLUMNS), NAME, COLUMNS,
@@ -519,7 +548,7 @@ future<utils::UUID> calculate_schema_digest(distributed<service::storage_proxy>&
            for (auto&& p : rs->partitions()) {
                auto mut = p.mut().unfreeze(s);
                auto partition_key = value_cast<sstring>(utf8_type->deserialize(mut.key().get_component(*s, 0)));
-                if (partition_key == NAME) {
+                if (is_system_keyspace(partition_key)) {
                    continue;
                }
                mutations.emplace_back(std::move(mut));
@@ -552,7 +581,7 @@ future<std::vector<frozen_mutation>> convert_schema_to_mutations(distributed<ser
            for (auto&& p : rs->partitions()) {
                auto mut = p.mut().unfreeze(s);
                auto partition_key = value_cast<sstring>(utf8_type->deserialize(mut.key().get_component(*s, 0)));
-                if (partition_key == NAME) {
+                if (is_system_keyspace(partition_key)) {
                    continue;
                }
                results.emplace_back(std::move(p.mut()));
@@ -727,6 +756,33 @@ read_tables_for_keyspaces(distributed<service::storage_proxy>& proxy, const std:
    return result;
 }

+mutation compact_for_schema_digest(const mutation& m) {
+    // Cassandra skips tombstones when calculating the schema digest
+    // to avoid disagreements caused by tombstone GC.
+    // See https://issues.apache.org/jira/browse/CASSANDRA-6862.
+    // We achieve a similar effect by running compact_for_compaction() on a copy of m.
+    mutation m_compacted(m);
+    m_compacted.partition().compact_for_compaction(*m.schema(), always_gc, gc_clock::time_point::max());
+    return m_compacted;
+}
+
+// Applies deletion of the "version" column to a system_schema.scylla_tables mutation; mutations of other tables are left untouched.
+static void delete_schema_version(mutation& m) {
+    if (m.column_family_id() != scylla_tables()->id()) {
+        return; // not a scylla_tables mutation, nothing to delete
+    }
+    const column_definition& version_col = *scylla_tables()->get_column_definition(to_bytes("version"));
+    for (auto&& row : m.partition().clustered_rows()) {
+        auto&& cells = row.row().cells();
+        auto&& cell = cells.find_cell(version_col.id);
+        api::timestamp_type t = api::new_timestamp();
+        if (cell) {
+            t = std::max(t, cell->as_atomic_cell().timestamp()); // never older than the cell already present in the mutation
+        }
+        cells.apply(version_col, atomic_cell::make_dead(t, gc_clock::now()));
+    }
+}
+
 static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations, bool do_flush)
 {
   return seastar::async([&proxy, mutations = std::move(mutations), do_flush] () mutable {
@@ -737,6 +793,9 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std:
       for (auto&& mutation : mutations) {
           keyspaces.emplace(value_cast<sstring>(utf8_type->deserialize(mutation.key().get_component(*s, 0))));
           column_families.emplace(mutation.column_family_id());
+           // We must force recalculation of schema version after the merge, since the resulting
+           // schema may be a mix of the old and new schemas.
+           delete_schema_version(mutation);
       }

       // current state of the schema
@@ -749,6 +808,15 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std:
       /*auto& old_aggregates = */read_schema_for_keyspaces(proxy, AGGREGATES, keyspaces).get0();
 #endif

+       // Incoming mutations have the version field deleted. Delete here as well so that
+       // schemas which are otherwise equal don't appear as differing.
+       for (auto&& e : old_column_families) {
+           schema_mutations& sm = e.second;
+           if (sm.scylla_tables()) {
+               delete_schema_version(*sm.scylla_tables());
+           }
+       }
+
       proxy.local().mutate_locally(std::move(mutations)).get0();

       if (do_flush) {
@@ -771,7 +839,7 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std:
 #endif

       std::set<sstring> keyspaces_to_drop = merge_keyspaces(proxy, std::move(old_keyspaces), std::move(new_keyspaces)).get0();
-       merge_types(proxy, std::move(old_types), std::move(new_types));
+       auto types_to_drop = merge_types(proxy, std::move(old_types), std::move(new_types));
       merge_tables_and_views(proxy,
            std::move(old_column_families), std::move(new_column_families),
            std::move(old_views), std::move(new_views));
@@ -779,6 +847,8 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std:
       mergeFunctions(oldFunctions, newFunctions);
       mergeAggregates(oldAggregates, newAggregates);
 #endif
+       types_to_drop.drop();
+
       proxy.local().get_db().invoke_on_all([keyspaces_to_drop = std::move(keyspaces_to_drop)] (database& db) {
           // it is safe to drop a keyspace only when all nested ColumnFamilies where deleted
           return do_for_each(keyspaces_to_drop, [&db] (auto keyspace_to_drop) {
@@ -935,30 +1005,37 @@ static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
    }).get();
 }

-static inline void collect_types(std::set<sstring>& keys, schema_result& result, std::vector<user_type>& to)
+struct naked_user_type {
+    const sstring keyspace; // keyspace the user type belongs to
+    const sstring qualified_name; // type name in the form passed to parse_type() by merge_types()
+};
+
+static inline void collect_types(std::set<sstring>& keys, schema_result& result, std::vector<naked_user_type>& to)
 {
    for (auto&& key : keys) {
        auto&& value = result[key];
        auto types = create_types_from_schema_partition(schema_result_value_type{key, std::move(value)});
-        std::move(types.begin(), types.end(), std::back_inserter(to));
+        boost::transform(types, std::back_inserter(to), [] (const user_type& type) { // keep only keyspace + name of each type
+            return naked_user_type{type->_keyspace, type->name()}; // plain copies: the pointee is const, so std::move could not move anything
+        });
    }
 }

- // see the comments for merge_keyspaces()
-static void merge_types(distributed<service::storage_proxy>& proxy, schema_result&& before, schema_result&& after)
+// see the comments for merge_keyspaces()
+static user_types_to_drop merge_types(distributed<service::storage_proxy>& proxy, schema_result&& before, schema_result&& after)
 {
-    std::vector<user_type> created, altered, dropped;
+    std::vector<naked_user_type> created, altered, dropped;

    auto diff = difference(before, after, indirect_equal_to<lw_shared_ptr<query::result_set>>());

    collect_types(diff.entries_only_on_left, before, dropped); // Keyspaces with no more types
    collect_types(diff.entries_only_on_right, after, created); // New keyspaces with types

-    for (auto&& key : diff.entries_differing) {
+    for (auto&& keyspace : diff.entries_differing) {
        // The user types of this keyspace differ, so diff the current types with the updated ones
-        auto current_types = proxy.local().get_db().local().find_keyspace(key).metadata()->user_types()->get_all_types();
+        auto current_types = proxy.local().get_db().local().find_keyspace(keyspace).metadata()->user_types()->get_all_types();
        decltype(current_types) updated_types;
-        auto ts = create_types_from_schema_partition(schema_result_value_type{key, std::move(after[key])});
+        auto ts = create_types_from_schema_partition(schema_result_value_type{keyspace, std::move(after[keyspace])});
        updated_types.reserve(ts.size());
        for (auto&& type : ts) {
            updated_types[type->_name] = std::move(type);
@@ -966,36 +1043,46 @@ static void merge_types(distributed<service::storage_proxy>& proxy, schema_resul

        auto delta = difference(current_types, updated_types, indirect_equal_to<user_type>());

-        for (auto&& key : delta.entries_only_on_left) {
-            dropped.emplace_back(current_types[key]);
+        for (auto&& type_name : delta.entries_only_on_left) {
+            dropped.emplace_back(naked_user_type{keyspace, current_types[type_name]->name()});
        }
-        for (auto&& key : delta.entries_only_on_right) {
-            created.emplace_back(std::move(updated_types[key]));
+        for (auto&& type_name : delta.entries_only_on_right) {
+            created.emplace_back(naked_user_type{keyspace, updated_types[type_name]->name()});
        }
-        for (auto&& key : delta.entries_differing) {
-            altered.emplace_back(std::move(updated_types[key]));
+        for (auto&& type_name : delta.entries_differing) {
+            altered.emplace_back(naked_user_type{keyspace, updated_types[type_name]->name()});
        }
    }

-    proxy.local().get_db().invoke_on_all([&created, &dropped, &altered] (database& db) {
+    // Create and update user types before any tables/views are created that potentially
+    // use those types. Similarly, defer dropping until after tables/views that may use
+    // some of these user types are dropped.
+
+    proxy.local().get_db().invoke_on_all([&created, &altered] (database& db) {
        return seastar::async([&] {
            for (auto&& type : created) {
-                auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type->name()));
+                auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type.qualified_name));
                db.find_keyspace(user_type->_keyspace).add_user_type(user_type);
                service::get_local_migration_manager().notify_create_user_type(user_type).get();
            }
-            for (auto&& type : dropped) {
-                auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type->name()));
-                db.find_keyspace(user_type->_keyspace).remove_user_type(user_type);
-                service::get_local_migration_manager().notify_drop_user_type(user_type).get();
-            }
            for (auto&& type : altered) {
-                auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type->name()));
+                auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type.qualified_name));
                db.find_keyspace(user_type->_keyspace).add_user_type(user_type);
                service::get_local_migration_manager().notify_update_user_type(user_type).get();
            }
        });
    }).get();
+
+    return user_types_to_drop{[&proxy, dropped = std::move(dropped)] {
+        proxy.local().get_db().invoke_on_all([dropped = std::move(dropped)](database& db) {
+            return do_for_each(dropped, [&db](auto& user_type_to_drop) {
+                auto user_type = dynamic_pointer_cast<const user_type_impl>(
+                        parse_type(std::move(user_type_to_drop.qualified_name)));
+                db.find_keyspace(user_type->_keyspace).remove_user_type(user_type);
+                return service::get_local_migration_manager().notify_drop_user_type(user_type);
+            });
+        }).get();
+    }};
 }

 #if 0
@@ -1387,7 +1474,7 @@ static void add_table_params_to_mutations(mutation& m, const clustering_key& cke

    {
        auto map = table->compaction_strategy_options();
-        map["class"] = sstables::compaction_strategy::name(table->compaction_strategy());
+        map["class"] = sstables::compaction_strategy::name(table->configured_compaction_strategy());
        store_map(m, ckey, "compaction", timestamp, map);
    }

@@ -1461,6 +1548,15 @@ static void add_dropped_column_to_schema_mutation(schema_ptr table, const sstrin
    m.set_clustered_cell(ckey, "type", expand_user_type(column.type)->as_cql3_type()->to_string(), timestamp);
 }

+mutation make_scylla_tables_mutation(schema_ptr table, api::timestamp_type timestamp) { // row of system_schema.scylla_tables recording table's schema version
+    schema_ptr s = scylla_tables(); // build the keys against the schema the mutation actually targets
+    auto pkey = partition_key::from_singular(*s, table->ks_name());
+    auto ckey = clustering_key::from_singular(*s, table->cf_name());
+    mutation m(pkey, s);
+    m.set_clustered_cell(ckey, "version", utils::UUID(table->version()), timestamp);
+    return m;
+}
+
 static schema_mutations make_table_mutations(schema_ptr table, api::timestamp_type timestamp, bool with_columns_and_triggers)
 {
    // When adding new schema properties, don't set cells for default values so that
@@ -1474,6 +1570,8 @@ static schema_mutations make_table_mutations(schema_ptr table, api::timestamp_ty
    auto ckey = clustering_key::from_singular(*s, table->cf_name());
    m.set_clustered_cell(ckey, "id", table->id(), timestamp);

+    auto scylla_tables_mutation = make_scylla_tables_mutation(table, timestamp);
+
    {
        list_type_impl::native_type flags;
        if (table->is_super()) {
@@ -1499,7 +1597,7 @@ static schema_mutations make_table_mutations(schema_ptr table, api::timestamp_ty
    mutation indices_mutation(pkey, indexes());

    if (with_columns_and_triggers) {
-        for (auto&& column : table->all_columns()) {
+        for (auto&& column : table->v3().all_columns()) {
            add_column_to_schema_mutation(table, column, timestamp, columns_mutation);
        }
        for (auto&& index : table->indices()) {
@@ -1512,7 +1610,8 @@ static schema_mutations make_table_mutations(schema_ptr table, api::timestamp_ty
        }
    }

-    return schema_mutations{std::move(m), std::move(columns_mutation), std::move(indices_mutation), std::move(dropped_columns_mutation)};
+    return schema_mutations{std::move(m), std::move(columns_mutation), std::move(indices_mutation), std::move(dropped_columns_mutation),
+                            std::move(scylla_tables_mutation)};
 }

 void add_table_or_view_to_schema_mutation(schema_ptr s, api::timestamp_type timestamp, bool with_columns, std::vector<mutation>& mutations)
@@ -1561,23 +1660,23 @@ static void make_update_columns_mutations(schema_ptr old_table,
        std::vector<mutation>& mutations) {
    mutation columns_mutation(partition_key::from_singular(*columns(), old_table->ks_name()), columns());

-    auto diff = difference(old_table->columns_by_name(), new_table->columns_by_name());
+    auto diff = difference(old_table->v3().columns_by_name(), new_table->v3().columns_by_name());

    // columns that are no longer needed
    for (auto&& name : diff.entries_only_on_left) {
        // Thrift only knows about the REGULAR ColumnDefinition type, so don't consider other type
        // are being deleted just because they are not here.
-        const column_definition& column = *old_table->columns_by_name().at(name);
+        const column_definition& column = *old_table->v3().columns_by_name().at(name);
        if (from_thrift && !column.is_regular()) {
            continue;
        }

-        drop_column_from_schema_mutation(old_table, column, timestamp, mutations);
+        drop_column_from_schema_mutation(columns(), old_table, column.name_as_text(), timestamp, mutations);
    }

    // newly added columns and old columns with updated attributes
    for (auto&& name : boost::range::join(diff.entries_differing, diff.entries_only_on_right)) {
-        const column_definition& column = *new_table->columns_by_name().at(name);
+        const column_definition& column = *new_table->v3().columns_by_name().at(name);
        add_column_to_schema_mutation(new_table, column, timestamp, columns_mutation);
    }

@@ -1588,7 +1687,7 @@ static void make_update_columns_mutations(schema_ptr old_table,

    // newly dropped columns
    // columns added then dropped again
-    for (auto& name : dc_diff.entries_only_on_right) {
+    for (auto& name : boost::range::join(dc_diff.entries_differing, dc_diff.entries_only_on_right)) {
        add_drop_column_to_mutations(new_table, name, new_table->dropped_columns().at(name), timestamp, mutations);
    }
 }
@@ -1626,12 +1725,20 @@ static void make_drop_table_or_view_mutations(schema_ptr schema_table,
            api::timestamp_type timestamp,
            std::vector<mutation>& mutations) {
    auto pkey = partition_key::from_singular(*schema_table, table_or_view->ks_name());
-    mutation m{std::move(pkey), schema_table};
+    mutation m{pkey, schema_table};
    auto ckey = clustering_key::from_singular(*schema_table, table_or_view->cf_name());
-    m.partition().apply_delete(*schema_table, std::move(ckey), tombstone(timestamp, gc_clock::now()));
+    m.partition().apply_delete(*schema_table, ckey, tombstone(timestamp, gc_clock::now()));
    mutations.emplace_back(m);
-    for (auto &column : table_or_view->all_columns()) {
-        drop_column_from_schema_mutation(table_or_view, column, timestamp, mutations);
+    for (auto& column : table_or_view->v3().all_columns()) {
+        drop_column_from_schema_mutation(columns(), table_or_view, column.name_as_text(), timestamp, mutations);
+    }
+    for (auto& column : table_or_view->dropped_columns() | boost::adaptors::map_keys) {
+        drop_column_from_schema_mutation(dropped_columns(), table_or_view, column, timestamp, mutations);
+    }
+    {
+        mutation m{pkey, scylla_tables()};
+        m.partition().apply_delete(*scylla_tables(), ckey, tombstone(timestamp, gc_clock::now()));
+        mutations.emplace_back(m);
    }
 }

@@ -1655,17 +1762,14 @@ future<std::vector<mutation>> make_drop_table_mutations(lw_shared_ptr<keyspace_m

 static future<schema_mutations> read_table_mutations(distributed<service::storage_proxy>& proxy, const qualified_name& table, schema_ptr s)
 {
-    return read_schema_partition_for_table(proxy, s, table.keyspace_name, table.table_name)
-        .then([&proxy, table] (mutation cf_m) {
-            return read_schema_partition_for_table(proxy, columns(), table.keyspace_name, table.table_name)
-                .then([&proxy, table, cf_m = std::move(cf_m)] (mutation col_m) {
-                return read_schema_partition_for_table(proxy, dropped_columns(), table.keyspace_name, table.table_name)
-                    .then([&proxy, table, cf_m = std::move(cf_m), col_m = std::move(col_m)] (mutation dropped_m) {
-                        return read_schema_partition_for_table(proxy, indexes(), table.keyspace_name, table.table_name)
-                            .then([cf_m = std::move(cf_m), col_m = std::move(col_m), dropped_m = std::move(dropped_m)] (mutation idx_m) {
-                                return schema_mutations{std::move(cf_m), std::move(col_m), std::move(idx_m), std::move(dropped_m)};
-                        });
-                    });
+    return when_all_succeed(
+        read_schema_partition_for_table(proxy, s, table.keyspace_name, table.table_name),
+        read_schema_partition_for_table(proxy, columns(), table.keyspace_name, table.table_name),
+        read_schema_partition_for_table(proxy, dropped_columns(), table.keyspace_name, table.table_name),
+        read_schema_partition_for_table(proxy, indexes(), table.keyspace_name, table.table_name),
+        read_schema_partition_for_table(proxy, scylla_tables(), table.keyspace_name, table.table_name)).then(
+            [] (mutation cf_m, mutation col_m, mutation dropped_m, mutation idx_m, mutation st_m) {
+                return schema_mutations{std::move(cf_m), std::move(col_m), std::move(idx_m), std::move(dropped_m), std::move(st_m)};
            });
 #if 0
        // FIXME:
@@ -1680,7 +1784,6 @@ static future<schema_mutations> read_table_mutations(distributed<service::storag
        throw new RuntimeException(e);
    }
 #endif
-    });
 }

 future<schema_ptr> create_table_from_name(distributed<service::storage_proxy>& proxy, const sstring& keyspace, const sstring& table)
@@ -1771,7 +1874,7 @@ static void prepare_builder_from_table_row(schema_builder& builder, const query:
            builder.set_min_compaction_threshold(std::stoi(map["min_threshold"]));
        }
        if (map.count("enabled")) {
-            // TODO: enable/disable?
+            builder.set_compaction_enabled(boost::algorithm::iequals(map["enabled"], "true"));
        }

        builder.set_compaction_strategy_options(map);
@@ -1870,13 +1973,12 @@ schema_ptr create_table_from_mutations(schema_mutations sm, std::experimental::o

    prepare_builder_from_table_row(builder, table_row);

-    for (auto&& cdef : column_defs) {
-        builder.with_column(cdef);
-    }
+    v3_columns columns(std::move(column_defs), is_dense, is_compound);
+    columns.apply_to(builder);

    std::vector<index_metadata> index_defs;
    if (sm.indices_mutation()) {
-        index_defs = create_indices_from_index_rows(query::result_set(sm.indices_mutation().value()), ks_name, cf_name);
+        index_defs = create_indices_from_index_rows(query::result_set(*sm.indices_mutation()), ks_name, cf_name);
    }
    for (auto&& index : index_defs) {
        builder.with_index(index);
@@ -1909,7 +2011,8 @@ static void add_column_to_schema_mutation(schema_ptr table,
                                   api::timestamp_type timestamp,
                                   mutation& m)
 {
-    auto ckey = clustering_key::from_exploded(*m.schema(), {utf8_type->decompose(table->cf_name()), column.name()});
+    auto ckey = clustering_key::from_exploded(*m.schema(), {utf8_type->decompose(table->cf_name()),
+                                                            utf8_type->decompose(column.name_as_text())});

    auto order = "NONE";
    if (column.is_clustering_key()) {
@@ -2003,13 +2106,19 @@ static void drop_index_from_schema_mutation(schema_ptr table, const index_metada
    mutations.push_back(std::move(m));
 }

-static void drop_column_from_schema_mutation(schema_ptr table, const column_definition& column, long timestamp, std::vector<mutation>& mutations) {
-    schema_ptr s = columns();
-    auto pkey = partition_key::from_singular(*s, table->ks_name());
-    auto ckey = clustering_key::from_exploded(*s, {utf8_type->decompose(table->cf_name()), column.name()});
+static void drop_column_from_schema_mutation(
+        schema_ptr schema_table,
+        schema_ptr table,
+        const sstring& column_name,
+        long timestamp,
+        std::vector<mutation>& mutations)
+{
+    auto pkey = partition_key::from_singular(*schema_table, table->ks_name());
+    auto ckey = clustering_key::from_exploded(*schema_table, {utf8_type->decompose(table->cf_name()),
+                                                              utf8_type->decompose(column_name)});

-    mutation m{pkey, s};
-    m.partition().apply_delete(*s, ckey, tombstone(timestamp, gc_clock::now()));
+    mutation m{pkey, schema_table};
+    m.partition().apply_delete(*schema_table, ckey, tombstone(timestamp, gc_clock::now()));
    mutations.emplace_back(m);
 }

@@ -2153,7 +2262,7 @@ static schema_mutations make_view_mutations(view_ptr view, api::timestamp_type t
    mutation indices_mutation(pkey, indexes());

    if (with_columns) {
-        for (auto&& column : view->all_columns()) {
+        for (auto&& column : view->v3().all_columns()) {
            add_column_to_schema_mutation(view, column, timestamp, columns_mutation);
        }

@@ -2165,7 +2274,10 @@ static schema_mutations make_view_mutations(view_ptr view, api::timestamp_type t
        }
    }

-    return schema_mutations{std::move(m), std::move(columns_mutation), std::move(indices_mutation), std::move(dropped_columns_mutation)};
+    auto scylla_tables_mutation = make_scylla_tables_mutation(view, timestamp);
+
+    return schema_mutations{std::move(m), std::move(columns_mutation), std::move(indices_mutation), std::move(dropped_columns_mutation),
+                            std::move(scylla_tables_mutation)};
 }

 schema_mutations make_schema_mutations(schema_ptr s, api::timestamp_type timestamp, bool with_columns)
@@ -2459,10 +2571,33 @@ data_type parse_type(sstring str)

 std::vector<schema_ptr> all_tables() {
    return {
-        keyspaces(), tables(), columns(), dropped_columns(), triggers(),
+        keyspaces(), tables(), scylla_tables(), columns(), dropped_columns(), triggers(),
        views(), indexes(), types(), functions(), aggregates(),
    };
 }

+namespace legacy {
+
+table_schema_version schema_mutations::digest() const { // hashes the table mutation, then the columns mutation
+    md5_hasher h;
+    db::schema_tables::feed_hash_for_schema_digest(h, _columnfamilies);
+    db::schema_tables::feed_hash_for_schema_digest(h, _columns);
+    return utils::UUID_gen::get_name_UUID(h.finalize());
+}
+
+future<schema_mutations> read_table_mutations(distributed<service::storage_proxy>& proxy,
+    sstring keyspace_name, sstring table_name, schema_ptr s)
+{
+    return read_schema_partition_for_table(proxy, s, keyspace_name, table_name) // first: the table's partition from schema table s
+        .then([&proxy, keyspace_name, table_name] (mutation cf_m) {
+            return read_schema_partition_for_table(proxy, db::system_keyspace::legacy::columns(), keyspace_name, table_name) // then: its rows in the legacy columns table
+                .then([cf_m = std::move(cf_m)] (mutation col_m) {
+                    return schema_mutations{std::move(cf_m), std::move(col_m)};
+                });
+        });
+}
+
+} // namespace legacy
+
 } // namespace schema_tables
 } // namespace schema
--- a/db/schema_tables.hh
+++ b/db/schema_tables.hh
@@ -64,6 +64,7 @@ namespace v3 {
 static constexpr auto NAME = "system_schema";
 static constexpr auto KEYSPACES = "keyspaces";
 static constexpr auto TABLES = "tables";
+static constexpr auto SCYLLA_TABLES = "scylla_tables";
 static constexpr auto COLUMNS = "columns";
 static constexpr auto DROPPED_COLUMNS = "dropped_columns";
 static constexpr auto TRIGGERS = "triggers";
@@ -77,16 +78,43 @@ schema_ptr columns();
 schema_ptr dropped_columns();
 schema_ptr indexes();
 schema_ptr tables();
+schema_ptr scylla_tables();
 schema_ptr views();

 }

+namespace legacy {
+
+class schema_mutations { // holds the two mutations describing one table in the legacy schema layout
+    mutation _columnfamilies; // taken from the first constructor argument
+    mutation _columns; // taken from the second constructor argument
+public:
+    schema_mutations(mutation columnfamilies, mutation columns)
+        : _columnfamilies(std::move(columnfamilies))
+        , _columns(std::move(columns))
+    { }
+    table_schema_version digest() const; // schema version derived from both mutations
+};
+
+future<schema_mutations> read_table_mutations(distributed<service::storage_proxy>& proxy,
+    sstring keyspace_name, sstring table_name, schema_ptr s);
+
+}
+
 using namespace v3;

+// Change on non-backwards compatible changes of schema mutations.
+// Replication of schema between nodes with different version is inhibited.
+extern const sstring version;
+
 extern std::vector<const char*> ALL;

 std::vector<schema_ptr> all_tables();

+// saves/creates "ks" + all tables etc, while first deleting all old schema entries (will be rewritten)
+future<> save_system_schema(const sstring & ks);
+
+// saves/creates "system_schema" keyspace
 future<> save_system_keyspace_schema();

 future<utils::UUID> calculate_schema_digest(distributed<service::storage_proxy>& proxy);
@@ -137,6 +165,7 @@ view_ptr create_view_from_mutations(schema_mutations, std::experimental::optiona
 future<std::vector<view_ptr>> create_views_from_schema_partition(distributed<service::storage_proxy>& proxy, const schema_result::mapped_type& result);

 schema_mutations make_schema_mutations(schema_ptr s, api::timestamp_type timestamp, bool with_columns);
+mutation make_scylla_tables_mutation(schema_ptr, api::timestamp_type timestamp);

 void add_table_or_view_to_schema_mutation(schema_ptr view, api::timestamp_type timestamp, bool with_columns, std::vector<mutation>& mutations);

@@ -153,15 +182,11 @@ data_type parse_type(sstring str);
 sstring serialize_index_kind(index_metadata_kind kind);
 index_metadata_kind deserialize_index_kind(sstring kind);

+mutation compact_for_schema_digest(const mutation& m);
+
 template<typename Hasher>
 void feed_hash_for_schema_digest(Hasher& h, const mutation& m) {
-    // Cassandra is skipping tombstones from digest calculation
-    // to avoid disagreements due to tombstone GC.
-    // See https://issues.apache.org/jira/browse/CASSANDRA-6862.
-    // We achieve similar effect with compact_for_compaction().
-    mutation m_compacted(m);
-    m_compacted.partition().compact_for_compaction(*m.schema(), always_gc, gc_clock::time_point::max());
-    feed_hash(h, m_compacted);
+    feed_hash(h, compact_for_schema_digest(m));
 }

 } // namespace schema_tables
--- a/db/system_keyspace.cc
+++ b/db/system_keyspace.cc
@@ -1044,6 +1044,9 @@ future<> setup(distributed<database>& db, distributed<cql3::query_processor>& qp
        return check_health();
    }).then([] {
        return db::schema_tables::save_system_keyspace_schema();
+    }).then([] {
+        // #2514 - make sure "system" is written to system_schema.keyspaces.
+        return db::schema_tables::save_system_schema(NAME);
    }).then([] {
        return netw::get_messaging_service().invoke_on_all([] (auto& ms){
            return ms.init_local_preferred_ip_cache();
--- a/db/system_keyspace.hh
+++ b/db/system_keyspace.hh
@@ -62,6 +62,8 @@ namespace cql3 {
    class query_processor;
 }

+bool is_system_keyspace(const sstring& ks_name);
+
 namespace db {
 namespace system_keyspace {

@@ -120,6 +122,18 @@ extern schema_ptr hints();
 extern schema_ptr batchlog();
 extern schema_ptr built_indexes(); // TODO (from Cassandra): make private

+namespace legacy {
+
+schema_ptr keyspaces();
+schema_ptr column_families();
+schema_ptr columns();
+schema_ptr triggers();
+schema_ptr usertypes();
+schema_ptr functions();
+schema_ptr aggregates();
+
+}
+
 table_schema_version generate_schema_version(utils::UUID table_id);

 // Only for testing.
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -194,13 +194,13 @@ public:
            : _view(std::move(view))
            , _view_info(*_view->view_info())
            , _base(std::move(base))
-            , _updates(8, partition_key::hashing(*_base), partition_key::equality(*_base)) {
+            , _updates(8, partition_key::hashing(*_view), partition_key::equality(*_view)) {
    }

    void move_to(std::vector<mutation>& mutations) && {
        auto& partitioner = dht::global_partitioner();
        std::transform(_updates.begin(), _updates.end(), std::back_inserter(mutations), [&, this] (auto&& m) {
-            return mutation(_view, partitioner.decorate_key(*_base, std::move(m.first)), std::move(m.second));
+            return mutation(_view, partitioner.decorate_key(*_view, std::move(m.first)), std::move(m.second));
        });
    }

--- a/dht/boot_strapper.cc
+++ b/dht/boot_strapper.cc
@@ -59,14 +59,11 @@ future<> boot_strapper::bootstrap() {
        streamer->add_ranges(keyspace_name, ranges);
    }

-    return streamer->fetch_async().then_wrapped([streamer] (auto&& f) {
-        try {
-            auto state = f.get0();
-        } catch (...) {
-            throw std::runtime_error(sprint("Error during boostrap: %s", std::current_exception()));
-        }
+    return streamer->stream_async().then([streamer] () { // the captured streamer stays referenced until the continuation has run
        service::get_local_storage_service().finish_bootstrapping();
-        return make_ready_future<>();
+    }).handle_exception([streamer] (std::exception_ptr eptr) { // log the failure, then hand the same exception back to the caller
+        blogger.warn("Error during bootstrap: {}", eptr);
+        return make_exception_future<>(std::move(eptr));
    });
 }

--- a/dht/i_partitioner.cc
+++ b/dht/i_partitioner.cc
@@ -260,6 +260,27 @@ unsigned shard_of(const token& t) {
    return global_partitioner().shard_of(t);
 }

+stdx::optional<dht::token_range>
+selective_token_range_sharder::next() { // yields the next non-empty piece of _range, or an empty optional once exhausted
+    if (_done) {
+        return {};
+    }
+    while (_range.overlaps(dht::token_range(_start_boundary, {}), dht::token_comparator()) // some of _range still lies at/after the cursor
+            && !(_start_boundary && _start_boundary->value() == maximum_token())) { // and the cursor has not reached maximum_token()
+        auto end_token = _partitioner.token_for_next_shard(_start_token, _next_shard); // exclusive end of the candidate (see `false` below)
+        auto candidate = dht::token_range(std::move(_start_boundary), range_bound<dht::token>(end_token, false));
+        auto intersection = _range.intersection(std::move(candidate), dht::token_comparator()); // clip the candidate to the requested range
+        _start_token = _partitioner.token_for_next_shard(end_token, _shard); // advance the cursor before returning, so the next call resumes here
+        _start_boundary = range_bound<dht::token>(_start_token);
+        if (intersection) {
+            return *intersection;
+        }
+    }
+
+    _done = true;
+    return {};
+}
+
 stdx::optional<ring_position_range_and_shard>
 ring_position_range_sharder::next(const schema& s) {
    if (_done) {
@@ -462,14 +483,13 @@ int ring_position_comparator::operator()(ring_position_view lh, ring_position_vi
    }
 }

-int ring_position_comparator::operator()(ring_position_view lh, sstables::key_view rh) const {
-    auto rh_token = global_partitioner().get_token(rh);
-    auto token_cmp = tri_compare(*lh._token, rh_token);
+int ring_position_comparator::operator()(ring_position_view lh, sstables::decorated_key_view rh) const {
+    auto token_cmp = tri_compare(*lh._token, rh.token());
    if (token_cmp) {
        return token_cmp;
    }
    if (lh._key) {
-        auto rel = rh.tri_compare(s, *lh._key);
+        auto rel = rh.key().tri_compare(s, *lh._key);
        if (rel) {
            return -rel;
        }
@@ -477,7 +497,7 @@ int ring_position_comparator::operator()(ring_position_view lh, sstables::key_vi
    return lh._weight;
 }

-int ring_position_comparator::operator()(sstables::key_view a, ring_position_view b) const {
+int ring_position_comparator::operator()(sstables::decorated_key_view a, ring_position_view b) const {
    return -(*this)(b, a);
 }

--- a/dht/i_partitioner.hh
+++ b/dht/i_partitioner.hh
@@ -55,6 +55,7 @@
 namespace sstables {

 class key_view;
+class decorated_key_view;

 }

@@ -547,8 +548,8 @@ struct ring_position_comparator {
    const schema& s;
    ring_position_comparator(const schema& s_) : s(s_) {}
    int operator()(ring_position_view, ring_position_view) const;
-    int operator()(ring_position_view, sstables::key_view) const;
-    int operator()(sstables::key_view, ring_position_view) const;
+    int operator()(ring_position_view, sstables::decorated_key_view) const;
+    int operator()(sstables::decorated_key_view, ring_position_view) const;
 };

 // "less" comparator giving the same order as ring_position_comparator
@@ -671,6 +672,29 @@ split_ranges_to_shards(const dht::token_range_vector& ranges, const schema& s);
 std::vector<partition_range> split_range_to_single_shard(const schema& s, const dht::partition_range& pr, shard_id shard);
 std::vector<partition_range> split_range_to_single_shard(const i_partitioner& partitioner, const schema& s, const dht::partition_range& pr, shard_id shard);

+class selective_token_range_sharder { // hands out pieces of a token range one next() call at a time; NOTE(review): presumably only the pieces owned by _shard - confirm
+    const i_partitioner& _partitioner;
+    dht::token_range _range; // the range being split
+    shard_id _shard;
+    bool _done = false;
+    shard_id _next_shard; // _shard + 1, wrapping to 0 at shard_count()
+    dht::token _start_token; // token the next candidate piece starts from
+    stdx::optional<range_bound<dht::token>> _start_boundary; // bound form of the cursor; initially the range's own (possibly absent) start
+public:
+    explicit selective_token_range_sharder(dht::token_range range, shard_id shard) // convenience overload using global_partitioner()
+            : selective_token_range_sharder(global_partitioner(), std::move(range), shard) {}
+    selective_token_range_sharder(const i_partitioner& partitioner, dht::token_range range, shard_id shard)
+            : _partitioner(partitioner)
+            , _range(std::move(range))
+            , _shard(shard)
+            , _next_shard(_shard + 1 == _partitioner.shard_count() ? 0 : _shard + 1)
+            , _start_token(_range.start() ? _range.start()->value() : minimum_token()) // a range without a start bound begins at minimum_token()
+            , _start_boundary(_partitioner.shard_of(_start_token) == shard ? // keep the range's own start when it already maps to `shard`
+                _range.start() : range_bound<dht::token>(_partitioner.token_for_next_shard(_start_token, shard))) {
+    }
+    stdx::optional<dht::token_range> next();
+};
+
 } // dht

 namespace std {
--- a/dht/range_streamer.cc
+++ b/dht/range_streamer.cc
@@ -193,8 +193,7 @@ range_streamer::get_all_ranges_with_strict_sources_for(const sstring& keyspace_n

        inet_address source_ip = range_sources.find(desired_range)->second;
        auto& gossiper = gms::get_local_gossiper();
-        auto source_state = gossiper.get_endpoint_state_for_endpoint(source_ip);
-        if (gossiper.is_enabled() && source_state && !source_state->is_alive()) {
+        if (gossiper.is_enabled() && !gossiper.is_alive(source_ip)) {
            throw std::runtime_error(sprint("A node required to move the data consistently is down (%s).  If you wish to move the data from a potentially inconsistent replica, restart the node with consistent_rangemovement=false", source_ip));
        }
    }
@@ -211,7 +210,36 @@ bool range_streamer::use_strict_sources_for_ranges(const sstring& keyspace_name)
           && _metadata.get_all_endpoints().size() != strat.get_replication_factor();
 }

+void range_streamer::add_tx_ranges(const sstring& keyspace_name, std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint, std::vector<sstring> column_families) {
+    // Queues per-endpoint ranges to send; a streamer that already has rx ranges refuses them.
+    if (_nr_rx_added != 0) {
+        throw std::runtime_error("Mixed sending and receiving is not supported");
+    }
+    ++_nr_tx_added;
+    _to_stream.emplace(keyspace_name, std::move(ranges_per_endpoint));
+    if (!_column_families.emplace(keyspace_name, std::move(column_families)).second) { // false: keyspace already has a column family list
+        throw std::runtime_error("Can not add column_families for the same keyspace more than once");
+    }
+}
+
+void range_streamer::add_rx_ranges(const sstring& keyspace_name, std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint, std::vector<sstring> column_families) {
+    // Queues per-endpoint ranges to request; a streamer that already has tx ranges refuses them.
+    if (_nr_tx_added != 0) {
+        throw std::runtime_error("Mixed sending and receiving is not supported");
+    }
+    ++_nr_rx_added;
+    _to_stream.emplace(keyspace_name, std::move(ranges_per_endpoint));
+    if (!_column_families.emplace(keyspace_name, std::move(column_families)).second) { // false: keyspace already has a column family list
+        throw std::runtime_error("Can not add column_families for the same keyspace more than once");
+    }
+}
+
+// TODO: add_ranges() is the legacy range_streamer interface; like add_rx_ranges(), it adds rx ranges.
 void range_streamer::add_ranges(const sstring& keyspace_name, dht::token_range_vector ranges) {
+    if (_nr_tx_added) {
+        throw std::runtime_error("Mixed sending and receiving is not supported");
+    }
+    _nr_rx_added++;
    auto ranges_for_keyspace = use_strict_sources_for_ranges(keyspace_name)
        ? get_all_ranges_with_strict_sources_for(keyspace_name, ranges)
        : get_all_ranges_with_sources_for(keyspace_name, ranges);
@@ -232,26 +260,114 @@ void range_streamer::add_ranges(const sstring& keyspace_name, dht::token_range_v
            logger.debug("{} : range {} from source {} for keyspace {}", _description, x.second, x.first, keyspace_name);
        }
    }
-    _to_fetch.emplace(keyspace_name, std::move(range_fetch_map));
+    _to_stream.emplace(keyspace_name, std::move(range_fetch_map));
 }

-future<streaming::stream_state> range_streamer::fetch_async() {
-    for (auto& fetch : _to_fetch) {
-        const auto& keyspace = fetch.first;
-        for (auto& x : fetch.second) {
-            auto& source = x.first;
-            auto& ranges = x.second;
-            /* Send messages to respective folks to stream data over to me */
-            if (logger.is_enabled(logging::log_level::debug)) {
-                logger.debug("{}ing from {} ranges {}", _description, source, ranges);
+future<> range_streamer::stream_async() { // runs do_stream_async() until it succeeds or _nr_max_retry attempts have failed
+    return seastar::async([this] {
+        int sleep_time = 60; // seconds; grows by a factor of 1.5 after every failed attempt
+        for (;;) {
+            try {
+                do_stream_async().get();
+                break;
+            } catch (...) {
+                if (++_nr_retried >= _nr_max_retry) { // out of attempts: rethrow immediately, before any blocking wait
+                    throw;
+                }
+                logger.warn("{} failed to stream. Will retry in {} seconds ...", _description, sleep_time);
+                sleep_abortable(std::chrono::seconds(sleep_time)).get();
+                sleep_time *= 1.5;
            }
-            _stream_plan.request_ranges(source, keyspace, ranges);
+        }
+    });
+}
+
+future<> range_streamer::do_stream_async() { // one streaming pass over everything still queued in _to_stream
+    auto nr_ranges_remaining = nr_ranges_to_stream();
+    logger.info("{} starts, nr_ranges_remaining={}", _description, nr_ranges_remaining);
+    auto start = lowres_clock::now();
+    return do_for_each(_to_stream, [this, start, description = _description] (auto& stream) {
+        const auto& keyspace = stream.first;
+        auto& ip_range_vec = stream.second;
+        // Fetch from or send to peer node in parallel
+        return parallel_for_each(ip_range_vec, [this, description, keyspace] (auto& ip_range) {
+            auto& source = ip_range.first;
+            auto& range_vec = ip_range.second; // ranges still pending for this peer; shrinks as batches are taken from it
+            return seastar::async([this, description, keyspace, source, &range_vec] () mutable {
+                // TODO: It would be better to use a fiber instead of a thread here, because
+                // creating a thread per peer can consume a lot of memory in a large cluster.
+                auto start_time = lowres_clock::now();
+                unsigned sp_index = 0;
+                unsigned nr_ranges_streamed = 0;
+                size_t nr_ranges_total = range_vec.size();
+                size_t nr_ranges_per_stream_plan = nr_ranges_total / 10; // batch size: a tenth of this peer's ranges; 0 means every range is streamed on its own
+                dht::token_range_vector ranges_to_stream; // current batch
+                auto do_streaming = [&] { // streams the current batch with a fresh stream_plan and waits for it to finish
+                    auto sp = stream_plan(sprint("%s-%s-index-%d", description, keyspace, sp_index++));
+                    logger.info("{} with {} for keyspace={}, {} out of {} ranges: ranges = {}",
+                            description, source, keyspace, nr_ranges_streamed, nr_ranges_total, ranges_to_stream.size());
+                    if (_nr_rx_added) { // receiving side: ask the peer for the ranges
+                        sp.request_ranges(source, keyspace, ranges_to_stream, _column_families[keyspace]);
+                    } else if (_nr_tx_added) { // sending side: push the ranges to the peer
+                        sp.transfer_ranges(source, keyspace, ranges_to_stream, _column_families[keyspace]);
+                    }
+                    sp.execute().discard_result().get();
+                    ranges_to_stream.clear(); // cleared only after success, so a failed batch can be put back below
+                };
+                try {
+                    for (auto it = range_vec.begin(); it < range_vec.end();) {
+                        ranges_to_stream.push_back(*it);
+                        it = range_vec.erase(it); // move the range from the pending list into the batch
+                        nr_ranges_streamed++;
+                        if (ranges_to_stream.size() < nr_ranges_per_stream_plan) {
+                            continue;
+                        } else {
+                            do_streaming();
+                        }
+                    }
+                    if (ranges_to_stream.size() > 0) { // flush the final, partially filled batch
+                        do_streaming();
+                    }
+                } catch (...) {
+                    for (auto& range : ranges_to_stream) { // return the failed batch to the pending list so a later pass can pick it up
+                        range_vec.push_back(range);
+                    }
+                    auto t = std::chrono::duration_cast<std::chrono::seconds>(lowres_clock::now() - start_time).count();
+                    logger.warn("{} with {} for keyspace={} failed, took {} seconds: {}", description, source, keyspace, t, std::current_exception());
+                    throw;
+                }
+                auto t = std::chrono::duration_cast<std::chrono::seconds>(lowres_clock::now() - start_time).count();
+                logger.info("{} with {} for keyspace={} succeeded, took {} seconds", description, source, keyspace, t);
+            });
+
+        });
+    }).finally([this, start] { // summary log; success is judged by whether any ranges are still pending
+        auto t = std::chrono::duration_cast<std::chrono::seconds>(lowres_clock::now() - start).count();
+        auto nr_ranges_remaining = nr_ranges_to_stream();
+        if (nr_ranges_remaining) {
+            logger.warn("{} failed, took {} seconds, nr_ranges_remaining={}", _description, t, nr_ranges_remaining);
+        } else {
+            logger.info("{} succeeded, took {} seconds, nr_ranges_remaining={}", _description, t, nr_ranges_remaining);
+        }
+    });
+}
+
+size_t range_streamer::nr_ranges_to_stream() { // number of ranges currently held in _to_stream, summed over all keyspaces and peers
+    size_t nr_ranges_remaining = 0;
+    for (auto& fetch : _to_stream) {
+        const auto& keyspace = fetch.first;
+        auto& ip_range_vec = fetch.second;
+        for (auto& ip_range : ip_range_vec) {
+            auto& source = ip_range.first;
+            auto& range_vec = ip_range.second;
+            nr_ranges_remaining += range_vec.size();
+            logger.debug("Remaining: keyspace={}, source={}, ranges={}", keyspace, source, range_vec); // per-peer detail, debug level only
        }
    }
-
-    return _stream_plan.execute();
+    return nr_ranges_remaining;
 }

+
 std::unordered_multimap<inet_address, dht::token_range>
 range_streamer::get_work_map(const std::unordered_multimap<dht::token_range, inet_address>& ranges_with_source_target,
             const sstring& keyspace) {
--- a/dht/range_streamer.hh
+++ b/dht/range_streamer.hh
@@ -119,6 +119,8 @@ public:
    }

    void add_ranges(const sstring& keyspace_name, dht::token_range_vector ranges);
+    void add_tx_ranges(const sstring& keyspace_name, std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint, std::vector<sstring> column_families = {});
+    void add_rx_ranges(const sstring& keyspace_name, std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint, std::vector<sstring> column_families = {});
 private:
    bool use_strict_sources_for_ranges(const sstring& keyspace_name);
    /**
@@ -159,16 +161,25 @@ public:
    }
 #endif
 public:
-    future<streaming::stream_state> fetch_async();
+    future<> stream_async();
+    future<> do_stream_async();
+    size_t nr_ranges_to_stream();
 private:
    distributed<database>& _db;
    token_metadata& _metadata;
    std::unordered_set<token> _tokens;
    inet_address _address;
    sstring _description;
-    std::unordered_multimap<sstring, std::unordered_map<inet_address, dht::token_range_vector>> _to_fetch;
+    std::unordered_multimap<sstring, std::unordered_map<inet_address, dht::token_range_vector>> _to_stream;
    std::unordered_set<std::unique_ptr<i_source_filter>> _source_filters;
    stream_plan _stream_plan;
+    std::unordered_map<sstring, std::vector<sstring>> _column_families;
+    // Retry the stream plan _nr_max_retry times
+    unsigned _nr_retried = 0;
+    unsigned _nr_max_retry = 5;
+    // Number of tx and rx ranges added
+    unsigned _nr_tx_added = 0;
+    unsigned _nr_rx_added = 0;
 };

 } // dht
--- a/dist/ami/build_ami.sh
+++ b/dist/ami/build_ami.sh
@@ -79,13 +79,14 @@ if [ $LOCALRPM -eq 1 ]; then
            cd ../..
            cp build/scylla-jmx/build/rpms/scylla-jmx-`cat build/scylla-jmx/build/SCYLLA-VERSION-FILE`-`cat build/scylla-jmx/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-jmx.noarch.rpm
        fi
-        if [ ! -f dist/ami/files/scylla-tools.noarch.rpm ]; then
+        if [ ! -f dist/ami/files/scylla-tools.noarch.rpm ] || [ ! -f dist/ami/files/scylla-tools-core.noarch.rpm ]; then
            cd build
            git clone --depth 1 https://github.com/scylladb/scylla-tools-java.git
            cd scylla-tools-java
            sh -x -e dist/redhat/build_rpm.sh
            cd ../..
            cp build/scylla-tools-java/build/rpms/scylla-tools-`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-tools.noarch.rpm
+            cp build/scylla-tools-java/build/rpms/scylla-tools-core-`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-tools-core.noarch.rpm
        fi
    else
        sudo apt-get install -y git
--- a/dist/ami/files/scylla-ami
+++ b/dist/ami/files/scylla-ami
--- a/dist/common/modprobe.d/scylla-raid0.conf
+++ b/dist/common/modprobe.d/scylla-raid0.conf
@@ -1 +0,0 @@
-options raid0 devices_discard_performance=Y
--- a/dist/common/scripts/node_health_check
+++ b/dist/common/scripts/node_health_check
@@ -75,13 +75,16 @@ while getopts ":hdncap:q:" opt; do
 done


-##Check if server is Fedora/Debian release##
-cat /etc/os-release | grep fedora &> /dev/null
+##Check server release (Fedora/Oracle/Debian)##
+cat /etc/os-release | grep -i fedora &> /dev/null
 if [ $? -ne 0 ]; then
-    IS_FEDORA="1"
+    cat /etc/os-release | grep -i oracle &> /dev/null
+    if [ $? -ne 0 ]; then
+        IS_FEDORA="1"
+    fi
 fi

-cat /etc/os-release | grep debian &> /dev/null
+cat /etc/os-release | grep -i debian &> /dev/null
 if [ $? -ne 0 ]; then
    IS_DEBIAN="1"
 fi
@@ -91,25 +94,24 @@ if [ "$IS_FEDORA" == "1" ] && [ "$IS_DEBIAN" == "1" ]; then
    exit 222
 fi

-##Pass criteria for script execution##
-#Check scylla service#
+##Scylla-server service status##
 echo "--------------------------------------------------"
-echo "Checking Scylla Service"
+echo "Checking Scylla-server Service"
 echo "--------------------------------------------------"

 ps -C scylla --no-headers &> /dev/null
 if [ $? -ne 0 ]; then
    SCYLLA_SERVICE="1"
-    echo "ERROR: Scylla is NOT Running"
+    echo "ERROR: Scylla-server is NOT Running"
    echo "Cannot Collect Data Model Info"
    echo "--------------------------------------------------"
 else
-    echo "Scylla Service: OK"
+    echo "Scylla-server Service: OK"
    echo "--------------------------------------------------"
 fi 


-#Check Scylla-JMX service#
+##Scylla-JMX service status##
 echo "Checking Scylla-JMX Service on Port $JMX_PORT"
 echo "--------------------------------------------------"

@@ -121,7 +123,7 @@ if [ $? -ne 0 ]; then
    echo "Use the '-p' Option to Provide the Scylla-JMX Port"
    echo "--------------------------------------------------"
 else
-    echo "JMX Service (nodetool): OK"
+    echo "Scylla-JMX Service (nodetool): OK"
    echo "--------------------------------------------------"
 fi 

@@ -152,12 +154,12 @@ mkdir -p $OUTPUT_PATH1 $OUTPUT_PATH2 $OUTPUT_PATH3 $OUTPUT_PATH4 $OUTPUT_PATH5
 #System Checks#
 echo "Collecting System Info"
 echo "--------------------------------------------------"
-cat /etc/os-release > $OUTPUT_PATH1/os-release.txt
+cp -p /etc/os-release $OUTPUT_PATH1
 uname -r > $OUTPUT_PATH1/kernel-release.txt
 lscpu > $OUTPUT_PATH1/cpu-info.txt
 vmstat -s -S M | awk '{$1=$1};1' > $OUTPUT_PATH1/vmstat.txt
 df -Th > $OUTPUT_PATH1/capacity-info.txt && echo "" >> $OUTPUT_PATH1/capacity-info.txt && sudo du -sh /var/lib/scylla/* >> $OUTPUT_PATH1/capacity-info.txt
-cat /proc/mdstat > $OUTPUT_PATH1/raid-conf.txt
+cp -p /proc/mdstat $OUTPUT_PATH1
 for f in `sudo find /sys -name scheduler`; do echo -n "$f: "; cat  $f; done > $OUTPUT_PATH1/io-sched-conf.txt && echo "" >> $OUTPUT_PATH1/io-sched-conf.txt
 for f in `sudo find /sys -name nomerges`; do echo -n "$f: "; cat  $f; done >> $OUTPUT_PATH1/io-sched-conf.txt

@@ -166,30 +168,23 @@ for f in `sudo find /sys -name nomerges`; do echo -n "$f: "; cat  $f; done >> $O
 echo "Collecting Scylla Info"
 echo "--------------------------------------------------"

+scylla --version > $OUTPUT_PATH2/scylla-version.txt
+cp -p /etc/scylla/* $OUTPUT_PATH2
+ls -ltrh /var/lib/scylla/coredump/ > $OUTPUT_PATH2/coredump-folder.txt
+
 if [ "$IS_FEDORA" == "0" ]; then
    rpm -qa | grep -i scylla > $OUTPUT_PATH2/scylla-pkgs.txt
+    cp -p /etc/sysconfig/scylla-server $OUTPUT_PATH2
 fi

 if [ "$IS_DEBIAN" == "0" ]; then
    dpkg -l | grep -i scylla > $OUTPUT_PATH2/scylla-pkgs.txt
+    cp -p /etc/default/scylla-server $OUTPUT_PATH2
 fi

-curl -s -X GET "http://localhost:10000/storage_service/scylla_release_version" > $OUTPUT_PATH2/scylla-version.txt && echo "" >> $OUTPUT_PATH2/scylla-version.txt
-cat /etc/scylla/scylla.yaml | grep -v "#" | grep -v "^[[:space:]]*$" > $OUTPUT_PATH2/scylla-yaml.txt
-
-if [ "$IS_FEDORA" == "0" ]; then
-    cat /etc/sysconfig/scylla-server | grep -v "^[[:space:]]*$" > $OUTPUT_PATH2/scylla-server.txt
-fi
-
-if [ "$IS_DEBIAN" == "0" ]; then
-    cat /etc/default/scylla-server | grep -v "^[[:space:]]*$" > $OUTPUT_PATH2/scylla-server.txt
-fi
-
-cat /etc/scylla/cassandra-rackdc.properties | grep -v "#" |grep -v "^[[:space:]]*$" > $OUTPUT_PATH2/multi-DC.txt
-ls -ltrh /var/lib/scylla/coredump/ > $OUTPUT_PATH2/coredump-folder.txt
-

 #Scylla Logs#
+echo "--------------------------------------------------"
 echo "Collecting Logs"
 echo "--------------------------------------------------"

@@ -256,7 +251,7 @@ for i in `ls -I lo /sys/class/net/`; do echo "--$i"; cat /sys/class/net/$i/queue
 for i in `ls -I lo /sys/class/net/`; do echo "--$i"; cat /sys/class/net/$i/queues/rx-*/rps_flow_cnt; echo ""; done > $OUTPUT_PATH5/rfs-conf.txt
 ps -elf | grep irqbalance > $OUTPUT_PATH5/irqbalance-conf.txt
 sudo sysctl -a > $OUTPUT_PATH5/sysctl.txt 2>&1
-sudo iptables -L > $OUTPUT_PATH5/iptables.txt
+sudo iptables -L -v > $OUTPUT_PATH5/iptables.txt
 netstat -an | grep tcp > $OUTPUT_PATH5/netstat-tcp.txt


@@ -297,7 +292,7 @@ echo "" >> $REPORT

 echo "Host Operating System" >> $REPORT
 echo "---------------------" >> $REPORT
-cat $OUTPUT_PATH1/os-release.txt >> $REPORT
+cat $OUTPUT_PATH1/os-release >> $REPORT
 echo "" >> $REPORT
 echo "" >> $REPORT

@@ -327,7 +322,7 @@ echo "" >> $REPORT

 echo "RAID Configuration" >> $REPORT
 echo "------------------" >> $REPORT
-cat $OUTPUT_PATH1/raid-conf.txt >> $REPORT
+cat $OUTPUT_PATH1/mdstat >> $REPORT
 echo "" >> $REPORT
 echo "" >> $REPORT

@@ -354,7 +349,7 @@ echo "" >> $REPORT
 echo "Configuration files" >> $REPORT
 echo "-------------------" >> $REPORT
 echo "## /etc/scylla/scylla.yaml ##" >> $REPORT
-cat $OUTPUT_PATH2/scylla-yaml.txt >> $REPORT
+cat $OUTPUT_PATH2/scylla.yaml | grep -v "#" | grep -v "^[[:space:]]*$" >> $REPORT
 echo "" >> $REPORT
 echo "" >> $REPORT

@@ -366,11 +361,11 @@ if [ "$IS_DEBIAN" == "0" ]; then
    echo "## /etc/default/scylla-server ##" >> $REPORT
 fi

-cat $OUTPUT_PATH2/scylla-server.txt >> $REPORT
+cat $OUTPUT_PATH2/scylla-server | grep -v "^[[:space:]]*$" >> $REPORT
 echo "" >> $REPORT
 echo "" >> $REPORT
 echo "## /etc/scylla/cassandra-rackdc.properties ##" >> $REPORT
-cat $OUTPUT_PATH2/multi-DC.txt >> $REPORT
+cat $OUTPUT_PATH2/cassandra-rackdc.properties | grep -v "#" |grep -v "^[[:space:]]*$" >> $REPORT
 echo "" >> $REPORT
 echo "" >> $REPORT

--- a/dist/common/scripts/scylla_cpuscaling_setup
+++ b/dist/common/scripts/scylla_cpuscaling_setup
@@ -4,6 +4,10 @@

 . /usr/lib/scylla/scylla_lib.sh

+if [ ! -f /sys/devices/system/cpu/cpufreq/policy0/scaling_governor ]; then # no scaling_governor file for policy0: nothing to configure
+    echo "This computer doesn't support CPU scaling configuration."
+    exit 0
+fi
 if is_debian_variant; then
    apt-get install -y cpufrequtils
    service cpufrequtils stop
--- a/dist/common/scripts/scylla_cpuset_setup
+++ b/dist/common/scripts/scylla_cpuset_setup
@@ -2,6 +2,8 @@
 #
 #  Copyright (C) 2016 ScyllaDB

+. /usr/lib/scylla/scylla_lib.sh
+
 print_usage() {
    echo "scylla_cpuset_setup --cpuset 1-7 --smp 7"
    echo "  --cpuset   CPUs to use (in cpuset(7) format; default: all))"
@@ -38,5 +40,6 @@ fi
 if [ "$SMP" != "" ]; then
    OUT="$OUT--smp $SMP "
 fi
+rm -f /etc/scylla.d/perftune.yaml
 OUT="$OUT\""
 echo $OUT > /etc/scylla.d/cpuset.conf
--- a/dist/common/scripts/scylla_lib.sh
+++ b/dist/common/scripts/scylla_lib.sh
@@ -38,6 +38,51 @@ ec2_is_supported_instance_type() {
    esac
 }

+#
+# get_tune_mode <NIC name>
+#
+get_tune_mode() { # prints the perftune "--mode ..." option whose CPU set equals the one in cpuset.conf, or an empty line if none does
+    local nic=$1
+
+    # if cpuset.conf doesn't exist use the default mode
+    [[ ! -e '/etc/scylla.d/cpuset.conf' ]] && return
+
+    local cur_cpuset=`cat /etc/scylla.d/cpuset.conf | cut -d "\"" -f2- | cut -d" " -f2` # second space-separated word after the first double quote
+    local mq_cpuset=`/usr/lib/scylla/perftune.py --tune net --nic "$nic" --mode mq --get-cpu-mask | /usr/lib/scylla/hex2list.py`
+    local sq_cpuset=`/usr/lib/scylla/perftune.py --tune net --nic "$nic" --mode sq --get-cpu-mask | /usr/lib/scylla/hex2list.py`
+    local sq_split_cpuset=`/usr/lib/scylla/perftune.py --tune net --nic "$nic" --mode sq_split --get-cpu-mask | /usr/lib/scylla/hex2list.py`
+    local tune_mode=""
+
+    case "$cur_cpuset" in # first matching pattern wins: mq, then sq, then sq_split
+        "$mq_cpuset")
+            tune_mode="--mode mq"
+            ;;
+        "$sq_cpuset")
+            tune_mode="--mode sq"
+            ;;
+        "$sq_split_cpuset")
+            tune_mode="--mode sq_split"
+            ;;
+    esac
+
+    # if cpuset is something different from what we expect - use the default mode
+    echo "$tune_mode"
+}
+
+#
+# create_perftune_conf [<NIC name>]
+#
+create_perftune_conf() { # writes /etc/scylla.d/perftune.yaml from perftune.py --dump-options-file, unless the file already exists
+    local nic=$1
+    [[ -z "$nic" ]] && nic='eth0' # NIC defaults to eth0 when no argument is given
+
+    # if exists - do nothing
+    [[ -e '/etc/scylla.d/perftune.yaml' ]] && return
+
+    local mode=`get_tune_mode "$nic"` # empty when no explicit mode applies
+    /usr/lib/scylla/perftune.py --tune net --nic "$nic" $mode --dump-options-file > /etc/scylla.d/perftune.yaml # $mode is left unquoted so it splits into two words or disappears when empty
+}
+
 . /etc/os-release
 if is_debian_variant || is_gentoo_variant; then
    SYSCONFIG=/etc/default
--- a/dist/common/scripts/scylla_prepare
+++ b/dist/common/scripts/scylla_prepare
@@ -22,7 +22,8 @@ elif [ "$NETWORK_MODE" = "dpdk" ]; then
    done
 else # NETWORK_MODE = posix
    if [ "$SET_NIC" = "yes" ]; then
-        /usr/lib/scylla/posix_net_conf.sh $IFNAME
+        create_perftune_conf "$IFNAME"
+        /usr/lib/scylla/posix_net_conf.sh $IFNAME --options-file /etc/scylla.d/perftune.yaml
    fi
 fi
 if [ "$ID" = "ubuntu" ]; then
--- a/dist/common/scripts/scylla_raid_setup
+++ b/dist/common/scripts/scylla_raid_setup
@@ -104,7 +104,11 @@ else
    mdadm --create --verbose --force --run $RAID --level=0 -c1024 --raid-devices=$NR_DISK $DISKS
    mkfs.xfs $RAID -f -K
 fi
-mdadm --detail --scan > /etc/mdadm.conf
+if is_debian_variant; then
+    mdadm --detail --scan > /etc/mdadm/mdadm.conf
+else
+    mdadm --detail --scan > /etc/mdadm.conf
+fi

 mkdir -p "$MOUNT_AT"
 mount -t xfs -o noatime $RAID "$MOUNT_AT"
@@ -122,3 +126,7 @@ if [ $FSTAB -ne 0 ]; then
    UUID=`blkid $RAID | awk '{print $2}'`
    echo "$UUID $MOUNT_AT xfs noatime 0 0" >> /etc/fstab
 fi
+
+if is_debian_variant; then
+    update-initramfs -u
+fi
--- a/dist/common/scripts/scylla_setup
+++ b/dist/common/scripts/scylla_setup
@@ -75,7 +75,7 @@ verify_package() {
    if is_debian_variant; then
        dpkg -s $1 > /dev/null 2>&1 &&:
    elif is_gentoo_variant; then
-        find /var/db/pkg/dev-db -type d -name "${1}-*" | egrep -q ".*"
+        find /var/db/pkg/app-admin -type d -name "${1}-*" | egrep -q ".*"
    else
        rpm -q $1 > /dev/null 2>&1 &&:
    fi
--- a/dist/common/systemd/scylla-housekeeping-daily.service.in
+++ b/dist/common/systemd/scylla-housekeeping-daily.service.in
@@ -6,7 +6,7 @@ After=network.target
 Type=simple
 User=scylla
 Group=scylla
-ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid --repo-files '/etc/yum.repos.d/scylla*.repo' -q -c /etc/scylla.d/housekeeping.cfg version --mode d
+ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files @@REPOFILES@@ version --mode d

 [Install]
 WantedBy=multi-user.target
--- a/dist/common/systemd/scylla-housekeeping-restart.service.in
+++ b/dist/common/systemd/scylla-housekeeping-restart.service.in
@@ -6,7 +6,7 @@ After=network.target
 Type=simple
 User=scylla
 Group=scylla
-ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q --repo-files '/etc/yum.repos.d/scylla*.repo' -c /etc/scylla.d/housekeeping.cfg version --mode r
+ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files @@REPOFILES@@ version --mode r

 [Install]
 WantedBy=multi-user.target
--- a/dist/common/systemd/scylla-server.service.in
+++ b/dist/common/systemd/scylla-server.service.in
@@ -1,6 +1,6 @@
 [Unit]
 Description=Scylla Server
-After=network.target
+After=network-online.target
 Wants=scylla-jmx.service
 Wants=scylla-housekeeping-restart.timer
 Wants=scylla-housekeeping-daily.timer
--- a/dist/debian/build_deb.sh
+++ b/dist/debian/build_deb.sh
@@ -129,9 +129,11 @@ sed -i -e "s/@@CODENAME@@/$TARGET/g" debian/changelog
 cp dist/debian/rules.in debian/rules
 cp dist/debian/control.in debian/control
 cp dist/debian/scylla-server.install.in debian/scylla-server.install
+cp dist/debian/scylla-conf.preinst.in debian/scylla-conf.preinst
+sed -i -e "s/@@VERSION@@/$SCYLLA_VERSION/g" debian/scylla-conf.preinst
 if [ "$TARGET" = "jessie" ]; then
    cp dist/debian/scylla-server.cron.d debian/
-    sed -i -e "s/@@REVISION@@/1/g" debian/changelog
+    sed -i -e "s/@@REVISION@@/1~$TARGET/g" debian/changelog
    sed -i -e "s/@@DH_INSTALLINIT@@//g" debian/rules
    sed -i -e "s/@@COMPILER@@/g++-5/g" debian/rules
    sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++-5, libunwind-dev/g" debian/control
@@ -145,7 +147,7 @@ if [ "$TARGET" = "jessie" ]; then
    sed -i -e "s#@@SCRIPTS_DELAY_FSTRIM@@#dist/debian/scripts/scylla_delay_fstrim usr/lib/scylla#g" debian/scylla-server.install
 elif [ "$TARGET" = "stretch" ] || [ "$TARGET" = "buster" ] || [ "$TARGET" = "sid" ]; then
    cp dist/debian/scylla-server.cron.d debian/
-    sed -i -e "s/@@REVISION@@/1/g" debian/changelog
+    sed -i -e "s/@@REVISION@@/1~$TARGET/g" debian/changelog
    sed -i -e "s/@@DH_INSTALLINIT@@//g" debian/rules
    sed -i -e "s/@@COMPILER@@/g++/g" debian/rules
    sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++, libunwind8-dev/g" debian/control
@@ -159,7 +161,7 @@ elif [ "$TARGET" = "stretch" ] || [ "$TARGET" = "buster" ] || [ "$TARGET" = "sid
    sed -i -e "s#@@SCRIPTS_DELAY_FSTRIM@@#dist/debian/scripts/scylla_delay_fstrim usr/lib/scylla#g" debian/scylla-server.install
 elif [ "$TARGET" = "trusty" ]; then
    cp dist/debian/scylla-server.cron.d debian/
-    sed -i -e "s/@@REVISION@@/0ubuntu1/g" debian/changelog
+    sed -i -e "s/@@REVISION@@/0ubuntu1~$TARGET/g" debian/changelog
    sed -i -e "s/@@DH_INSTALLINIT@@/--upstart-only/g" debian/rules
    sed -i -e "s/@@COMPILER@@/g++-5/g" debian/rules
    sed -i -e "s/@@BUILD_DEPENDS@@/g++-5, libunwind8-dev/g" debian/control
@@ -172,7 +174,7 @@ elif [ "$TARGET" = "trusty" ]; then
    sed -i -e "s#@@SCRIPTS_FSTRIM@@#dist/debian/scripts/scylla_fstrim usr/lib/scylla#g" debian/scylla-server.install
    sed -i -e "s#@@SCRIPTS_DELAY_FSTRIM@@#dist/debian/scripts/scylla_delay_fstrim usr/lib/scylla#g" debian/scylla-server.install
 elif [ "$TARGET" = "xenial" ] || [ "$TARGET" = "yakkety" ] || [ "$TARGET" = "zesty" ] || [ "$TARGET" = "artful" ]; then
-    sed -i -e "s/@@REVISION@@/0ubuntu1/g" debian/changelog
+    sed -i -e "s/@@REVISION@@/0ubuntu1~$TARGET/g" debian/changelog
    sed -i -e "s/@@DH_INSTALLINIT@@//g" debian/rules
    sed -i -e "s/@@COMPILER@@/g++/g" debian/rules
    sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++, libunwind-dev/g" debian/control
@@ -194,8 +196,10 @@ else
 fi
 cp dist/common/systemd/scylla-server.service.in debian/scylla-server.service
 sed -i -e "s#@@SYSCONFDIR@@#/etc/default#g" debian/scylla-server.service
-cp dist/common/systemd/scylla-housekeeping-daily.service debian/scylla-server.scylla-housekeeping-daily.service
-cp dist/common/systemd/scylla-housekeeping-restart.service debian/scylla-server.scylla-housekeeping-restart.service
+cp dist/common/systemd/scylla-housekeeping-daily.service.in debian/scylla-server.scylla-housekeeping-daily.service
+sed -i -e "s#@@REPOFILES@@#'/etc/apt/sources.list.d/scylla*.list'#g" debian/scylla-server.scylla-housekeeping-daily.service
+cp dist/common/systemd/scylla-housekeeping-restart.service.in debian/scylla-server.scylla-housekeeping-restart.service
+sed -i -e "s#@@REPOFILES@@#'/etc/apt/sources.list.d/scylla*.list'#g" debian/scylla-server.scylla-housekeeping-restart.service
 cp dist/common/systemd/node-exporter.service debian/scylla-server.node-exporter.service

 if [ $REBUILD -eq 1 ]; then
--- a/dist/debian/control.in
+++ b/dist/debian/control.in
@@ -40,7 +40,7 @@ Description: Scylla kernel tuning configuration
 Package: scylla
 Section: metapackages
 Architecture: any
-Depends: scylla-server, scylla-jmx, scylla-tools, scylla-kernel-conf
+Depends: scylla-server, scylla-jmx, scylla-tools, scylla-tools-core, scylla-kernel-conf
 Description: Scylla database metapackage
 Scylla is a highly scalable, eventually consistent, distributed,
 partitioned row DB.
--- a/dist/debian/debian/scylla-kernel-conf.postinst
+++ b/dist/debian/debian/scylla-kernel-conf.postinst
@@ -7,7 +7,8 @@ KVER=$(uname -r)
 if [[ $KVER =~ 3\.13\.0\-([0-9]+)-generic ]]; then
    echo "kernel $KVER detected, skip running sysctl..."
 else
-    sysctl -p/etc/sysctl.d/99-scylla-sched.conf
+    # expect failures in virtualized environments
+    sysctl -p/etc/sysctl.d/99-scylla-sched.conf || :
 fi

 #DEBHELPER#
--- a/dist/debian/debian/scylla-server.postinst
+++ b/dist/debian/debian/scylla-server.postinst
@@ -3,12 +3,22 @@
 set -e

 if [ "$1" = configure ]; then
-    adduser --system \
-            --quiet \
-            --home /var/lib/scylla \
-            --no-create-home \
-            --disabled-password \
-            --group scylla
+    getent passwd scylla || NOUSR=1
+    getent group scylla || NOGRP=1
+
+    # this handles both cases: the group does not exist, or the group already exists
+    if [ $NOUSR ]; then
+        adduser --system \
+                --quiet \
+                --home /var/lib/scylla \
+                --no-create-home \
+                --disabled-password \
+                --group scylla
+    # only the group does not exist: create it and add the user to the group
+    elif [ $NOGRP ]; then
+        addgroup --system scylla
+        adduser scylla scylla
+    fi
    chown -R scylla:scylla /var/lib/scylla
    chown -R scylla:scylla /var/lib/scylla-housekeeping
 fi
--- a/dist/debian/scylla-conf.preinst.in
+++ b/dist/debian/scylla-conf.preinst.in
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+ver=$(dpkg -l|grep scylla-server|awk '{print $3}'|sed -e "s/-.*$//")
+if [ -n "$ver" ]; then
+    ver_fmt=$(echo $ver | awk -F. '{printf "%d%02d%02d", $1,$2,$3}')
+    if [ $ver_fmt -lt 10703 ]; then
+        # for <scylla-1.2
+        if [ ! -f /usr/lib/scylla/scylla_config_get.py ]; then
+            echo
+            echo "Error: Upgrading from scylla-$ver to scylla-@@VERSION@@ is not supported."
+            echo "Please upgrade to scylla-1.7.3 or later, before upgrade to @@VERSION@@."
+            echo
+            exit 1
+        fi
+        commitlog_directory=$(/usr/lib/scylla/scylla_config_get.py -g commitlog_directory)
+        commitlog_files=$(ls $commitlog_directory | wc -l)
+        if [ $commitlog_files -ne 0 ]; then
+            echo
+            echo "Error: Upgrading from scylla-$ver to scylla-@@VERSION@@ is not supported when commitlog is not clean."
+            echo "Please upgrade to scylla-1.7.3 or later, before upgrade to @@VERSION@@."
+            echo "Also make sure $commitlog_directory is empty."
+            echo
+            exit 1
+        fi
+    fi
+fi
+
+#DEBHELPER#
--- a/dist/docker/redhat/Dockerfile
+++ b/dist/docker/redhat/Dockerfile
@@ -7,7 +7,7 @@ ENV container docker
 VOLUME [ "/sys/fs/cgroup" ]

 #install scylla
-RUN curl http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo -o /etc/yum.repos.d/scylla.repo
+RUN curl http://downloads.scylladb.com/rpm/centos/scylla-2.0.repo -o /etc/yum.repos.d/scylla.repo
 RUN yum -y install epel-release
 RUN yum -y clean expire-cache
 RUN yum -y update
--- a/dist/docker/redhat/scyllasetup.py
+++ b/dist/docker/redhat/scyllasetup.py
@@ -70,5 +70,7 @@ class ScyllaSetup:
        if self._experimental == "1":
            args += [ "--experimental=on" ]

+        args += ["--blocked-reactor-notify-ms 999999999"]
+
        with open("/etc/scylla.d/docker.conf", "w") as cqlshrc:
            cqlshrc.write("SCYLLA_DOCKER_ARGS=\"%s\"\n" % " ".join(args))
--- a/dist/redhat/build_rpm.sh
+++ b/dist/redhat/build_rpm.sh
@@ -104,9 +104,9 @@ fi


 if [ $JOBS -gt 0 ]; then
-    SRPM_OPTS="$SRPM_OPTS --define='_smp_mflags -j$JOBS'"
+    RPM_JOBS_OPTS=(--define="_smp_mflags -j$JOBS")
 fi
-sudo mock --buildsrpm --root=$TARGET --resultdir=`pwd`/build/srpms --spec=build/scylla.spec --sources=build/scylla-$VERSION.tar $SRPM_OPTS
+sudo mock --buildsrpm --root=$TARGET --resultdir=`pwd`/build/srpms --spec=build/scylla.spec --sources=build/scylla-$VERSION.tar $SRPM_OPTS "${RPM_JOBS_OPTS[@]}"
 if [ "$TARGET" = "epel-7-x86_64" ] && [ $REBUILD = 1 ]; then
    ./dist/redhat/centos_dep/build_dependency.sh
    sudo mock --init --root=$TARGET
@@ -116,4 +116,4 @@ elif [ "$TARGET" = "epel-7-x86_64" ] && [ $REBUILD = 0 ]; then
    TARGET=scylla-$TARGET
    RPM_OPTS="$RPM_OPTS --configdir=dist/redhat/mock"
 fi
-sudo mock --rebuild --root=$TARGET --resultdir=`pwd`/build/rpms $RPM_OPTS build/srpms/scylla-$VERSION*.src.rpm
+sudo mock --rebuild --root=$TARGET --resultdir=`pwd`/build/rpms $RPM_OPTS "${RPM_JOBS_OPTS[@]}" build/srpms/scylla-$VERSION*.src.rpm
--- a/dist/redhat/centos_dep/binutils.diff
+++ b/dist/redhat/centos_dep/binutils.diff
@@ -33,8 +33,8 @@
 Requires(post): coreutils
 -Requires(post): %{_sbindir}/alternatives
 -Requires(preun): %{_sbindir}/alternatives
-+Requires(post): /sbin/alternatives
-+Requires(preun): /sbin/alternatives
+Requires(post): /usr/sbin/alternatives
+Requires(preun): /usr/sbin/alternatives
 %endif
 
 # On ARM EABI systems, we do want -gnueabi to be part of the
@@ -58,13 +58,13 @@
 %if "%{build_gold}" == "both"
 %__rm -f %{_bindir}/%{?cross}ld
 -%{_sbindir}/alternatives --install %{_bindir}/%{?cross}ld %{?cross}ld \
-+/sbin/alternatives --install %{_bindir}/%{?cross}ld %{?cross}ld \
+/usr/sbin/alternatives --install %{_bindir}/%{?cross}ld %{?cross}ld \
   %{_bindir}/%{?cross}ld.bfd %{ld_bfd_priority}
 -%{_sbindir}/alternatives --install %{_bindir}/%{?cross}ld %{?cross}ld \
-+/sbin/alternatives --install %{_bindir}/%{?cross}ld %{?cross}ld \
+/usr/sbin/alternatives --install %{_bindir}/%{?cross}ld %{?cross}ld \
   %{_bindir}/%{?cross}ld.gold %{ld_gold_priority}
 -%{_sbindir}/alternatives --auto %{?cross}ld 
-+/sbin/alternatives --auto %{?cross}ld 
+/usr/sbin/alternatives --auto %{?cross}ld 
 %endif
 %if %{isnative}
 /sbin/ldconfig
@@ -74,8 +74,8 @@
 if [ $1 = 0 ]; then
 -  %{_sbindir}/alternatives --remove %{?cross}ld %{_bindir}/%{?cross}ld.bfd
 -  %{_sbindir}/alternatives --remove %{?cross}ld %{_bindir}/%{?cross}ld.gold
-+  /sbin/alternatives --remove %{?cross}ld %{_bindir}/%{?cross}ld.bfd
-+  /sbin/alternatives --remove %{?cross}ld %{_bindir}/%{?cross}ld.gold
+  /usr/sbin/alternatives --remove %{?cross}ld %{_bindir}/%{?cross}ld.bfd
+  /usr/sbin/alternatives --remove %{?cross}ld %{_bindir}/%{?cross}ld.gold
 fi
 %endif
 %if %{isnative}
--- a/dist/redhat/mock/scylla-epel-7-x86_64.cfg
+++ b/dist/redhat/mock/scylla-epel-7-x86_64.cfg
@@ -71,13 +71,13 @@ enabled=0

 [scylla-3rdparty]
 name=Scylla 3rdParty for Centos $releasever - $basearch
-baseurl=http://downloads.scylladb.com/rpm/unstable/centos/master/latest/3rdparty/7/x86_64/
+baseurl=http://downloads.scylladb.com/rpm/3rdparty/centos/scylladb-2.0/$releasever/$basearch/
 enabled=1
 gpgcheck=0

 [scylla-3rdparty-generic]
 name=Scylla 3rdParty for Centos $releasever
-baseurl=http://downloads.scylladb.com/rpm/unstable/centos/master/latest/3rdparty/7/noarch/
+baseurl=http://downloads.scylladb.com/rpm/3rdparty/centos/scylladb-2.0/$releasever/noarch/
 enabled=1
 gpgcheck=0
 """
--- a/dist/redhat/scylla.spec.in
+++ b/dist/redhat/scylla.spec.in
@@ -7,14 +7,14 @@ Group:          Applications/Databases
 License:        AGPLv3
 URL:            http://www.scylladb.com/
 Source0:        %{name}-@@VERSION@@-@@RELEASE@@.tar
-Requires:       scylla-server scylla-jmx scylla-tools scylla-kernel-conf
+Requires:       scylla-server = @@VERSION@@ scylla-jmx = @@VERSION@@ scylla-tools = @@VERSION@@ scylla-tools-core = @@VERSION@@ scylla-kernel-conf = @@VERSION@@
 Obsoletes:	scylla-server < 1.1

 %description
 Scylla is a highly scalable, eventually consistent, distributed,
 partitioned row DB.
 This package installs all required packages for ScyllaDB,  including
-scylla-server, scylla-jmx, scylla-tools.
+scylla-server, scylla-jmx, scylla-tools, scylla-tools-core.

 # this is needed to prevent python compilation error on CentOS (#2235)
 %if 0%{?rhel}
@@ -78,6 +78,10 @@ python3.4 ./configure.py --enable-dpdk --mode=release --static-stdc++ --static-b
 ninja-build %{?_smp_mflags} build/release/scylla build/release/iotune
 cp dist/common/systemd/scylla-server.service.in build/scylla-server.service
 sed -i -e "s#@@SYSCONFDIR@@#/etc/sysconfig#g" build/scylla-server.service
+cp dist/common/systemd/scylla-housekeeping-restart.service.in build/scylla-housekeeping-restart.service
+sed -i -e "s#@@REPOFILES@@#'/etc/yum.repos.d/scylla*.repo'#g" build/scylla-housekeeping-restart.service
+cp dist/common/systemd/scylla-housekeeping-daily.service.in build/scylla-housekeeping-daily.service
+sed -i -e "s#@@REPOFILES@@#'/etc/yum.repos.d/scylla*.repo'#g" build/scylla-housekeeping-daily.service

 %install
 rm -rf $RPM_BUILD_ROOT
@@ -88,9 +92,6 @@ mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/security/limits.d/
 mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/collectd.d/
 mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
 mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/scylla.d/
-%if 0%{?rhel}
-mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/
-%endif
 mkdir -p $RPM_BUILD_ROOT%{_sysctldir}/
 mkdir -p $RPM_BUILD_ROOT%{_docdir}/scylla/
 mkdir -p $RPM_BUILD_ROOT%{_unitdir}
@@ -101,9 +102,6 @@ install -m644 dist/common/limits.d/scylla.conf $RPM_BUILD_ROOT%{_sysconfdir}/sec
 install -m644 dist/common/collectd.d/scylla.conf $RPM_BUILD_ROOT%{_sysconfdir}/collectd.d/
 install -m644 dist/common/scylla.d/*.conf $RPM_BUILD_ROOT%{_sysconfdir}/scylla.d/
 install -m644 dist/common/sysctl.d/*.conf $RPM_BUILD_ROOT%{_sysctldir}/
-%if 0%{?rhel}
-install -m644 dist/common/modprobe.d/*.conf $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/
-%endif
 install -d -m755 $RPM_BUILD_ROOT%{_sysconfdir}/scylla
 install -m644 conf/scylla.yaml $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
 install -m644 conf/cassandra-rackdc.properties $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
@@ -267,18 +265,9 @@ if Scylla is the main application on your server and you wish to optimize its la
 # We cannot use the sysctl_apply rpm macro because it is not present in 7.0
 # following is a "manual" expansion
 /usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
-# Write modprobe.d params when module already loaded
-%if 0%{?rhel}
-if [ -e /sys/module/raid0/parameters/devices_discard_performance ]; then
-    echo Y > /sys/module/raid0/parameters/devices_discard_performance
-fi
-%endif

 %files kernel-conf
 %defattr(-,root,root)
-%if 0%{?rhel}
-%config(noreplace) %{_sysconfdir}/modprobe.d/*.conf
-%endif
 %{_sysctldir}/*.conf

 %changelog
--- a/gms/application_state.cc
+++ b/gms/application_state.cc
@@ -62,6 +62,7 @@ static const std::map<application_state, sstring> application_state_names = {
    {application_state::TOKENS,                 "TOKENS"},
    {application_state::SUPPORTED_FEATURES,     "SUPPORTED_FEATURES"},
    {application_state::CACHE_HITRATES,         "CACHE_HITRATES"},
+    {application_state::SCHEMA_TABLES_VERSION,  "SCHEMA_TABLES_VERSION"},
 };

 std::ostream& operator<<(std::ostream& os, const application_state& m) {
--- a/gms/application_state.hh
+++ b/gms/application_state.hh
@@ -59,8 +59,8 @@ enum class application_state {
    TOKENS,
    SUPPORTED_FEATURES,
    CACHE_HITRATES,
+    SCHEMA_TABLES_VERSION,
    // pad to allow adding new states to existing cluster
-    X3,
    X4,
    X5,
    X6,
--- a/gms/endpoint_state.cc
+++ b/gms/endpoint_state.cc
@@ -42,12 +42,12 @@

 namespace gms {

-std::experimental::optional<versioned_value> endpoint_state::get_application_state(application_state key) const {
+const versioned_value* endpoint_state::get_application_state_ptr(application_state key) const {
    auto it = _application_state.find(key);
    if (it == _application_state.end()) {
-        return {};
+        return nullptr;
    } else {
-        return _application_state.at(key);
+        return &it->second;
    }
 }

--- a/gms/endpoint_state.hh
+++ b/gms/endpoint_state.hh
@@ -43,6 +43,8 @@
 #include "gms/heart_beat_state.hh"
 #include "gms/application_state.hh"
 #include "gms/versioned_value.hh"
+#include <boost/algorithm/string/split.hpp>
+#include <boost/algorithm/string/classification.hpp>
 #include <experimental/optional>
 #include <chrono>

@@ -54,7 +56,7 @@ namespace gms {
 */
 class endpoint_state {
 public:
-    using clk = std::chrono::system_clock;
+    using clk = seastar::lowres_system_clock;
 private:
    heart_beat_state _heart_beat_state;
    std::map<application_state, versioned_value> _application_state;
@@ -89,10 +91,12 @@ public:
        , _is_alive(true) {
    }

+    // Valid only on shard 0
    heart_beat_state& get_heart_beat_state() {
        return _heart_beat_state;
    }

+    // Valid only on shard 0
    const heart_beat_state& get_heart_beat_state() const {
        return _heart_beat_state;
    }
@@ -102,7 +106,7 @@ public:
        _heart_beat_state = hbs;
    }

-    std::experimental::optional<versioned_value> get_application_state(application_state key) const;
+    const versioned_value* get_application_state_ptr(application_state key) const;

    /**
     * TODO replace this with operations that don't expose private state
@@ -117,18 +121,36 @@ public:
    }

    void add_application_state(application_state key, versioned_value value) {
-        if (_application_state.count(key)) {
-            _application_state.at(key) = value;
-        } else {
-            _application_state.emplace(key, value);
+        _application_state[key] = std::move(value);
+    }
+
+    void apply_application_state(application_state key, versioned_value&& value) {
+        auto&& e = _application_state[key];
+        if (e.version < value.version) {
+            e = std::move(value);
+        }
+    }
+
+    void apply_application_state(application_state key, const versioned_value& value) {
+        auto&& e = _application_state[key];
+        if (e.version < value.version) {
+            e = value;
+        }
+    }
+
+    void apply_application_state(const endpoint_state& es) {
+        for (auto&& e : es._application_state) {
+            apply_application_state(e.first, e.second);
        }
    }

    /* getters and setters */
    /**
     * @return System.nanoTime() when state was updated last time.
+     *
+     * Valid only on shard 0.
     */
-    clk::time_point get_update_timestamp() {
+    clk::time_point get_update_timestamp() const {
        return _update_timestamp;
    }

@@ -136,16 +158,34 @@ public:
        _update_timestamp = clk::now();
    }

-    bool is_alive() {
+    bool is_alive() const {
        return _is_alive;
    }

+    void set_alive(bool alive) {
+        _is_alive = alive;
+    }
+
    void mark_alive() {
-        _is_alive = true;
+        set_alive(true);
    }

    void mark_dead() {
-        _is_alive = false;
+        set_alive(false);
+    }
+
+    bool is_shutdown() const {
+        auto* app_state = get_application_state_ptr(application_state::STATUS);
+        if (!app_state) {
+            return false;
+        }
+        auto value = app_state->value;
+        std::vector<sstring> pieces;
+        boost::split(pieces, value, boost::is_any_of(","));
+        if (pieces.empty()) {
+            return false;
+        }
+        return pieces[0] == sstring(versioned_value::SHUTDOWN);
    }

    friend std::ostream& operator<<(std::ostream& os, const endpoint_state& x);
--- a/gms/failure_detector.cc
+++ b/gms/failure_detector.cc
@@ -36,6 +36,7 @@
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

+#include <boost/range/adaptor/map.hpp>
 #include "gms/failure_detector.hh"
 #include "gms/gossiper.hh"
 #include "gms/i_failure_detector.hh"
@@ -43,6 +44,7 @@
 #include "gms/endpoint_state.hh"
 #include "gms/application_state.hh"
 #include "gms/inet_address.hh"
+#include "service/storage_service.hh"
 #include "log.hh"
 #include <iostream>
 #include <chrono>
@@ -56,46 +58,26 @@ constexpr std::chrono::milliseconds failure_detector::DEFAULT_MAX_PAUSE;
 using clk = arrival_window::clk;

 static clk::duration get_initial_value() {
-#if 0
-    String newvalue = System.getProperty("cassandra.fd_initial_value_ms");
-    if (newvalue == null)
-    {
-        return Gossiper.intervalInMillis * 2;
-    }
-    else
-    {
-        logger.info("Overriding FD INITIAL_VALUE to {}ms", newvalue);
-        return Integer.parseInt(newvalue);
-    }
-#endif
-    warn(unimplemented::cause::GOSSIP);
-    return std::chrono::seconds(2);
+    auto& cfg = service::get_local_storage_service().db().local().get_config();
+    return std::chrono::milliseconds(cfg.fd_initial_value_ms());
 }

 clk::duration arrival_window::get_max_interval() {
-#if 0
-    sstring newvalue = System.getProperty("cassandra.fd_max_interval_ms");
-    if (newvalue == null)
-    {
-        return failure_detector.INITIAL_VALUE_NANOS;
-    }
-    else
-    {
-        logger.info("Overriding FD MAX_INTERVAL to {}ms", newvalue);
-        return TimeUnit.NANOSECONDS.convert(Integer.parseInt(newvalue), TimeUnit.MILLISECONDS);
-    }
-#endif
-    warn(unimplemented::cause::GOSSIP);
-    return get_initial_value();
+    auto& cfg = service::get_local_storage_service().db().local().get_config();
+    return std::chrono::milliseconds(cfg.fd_max_interval_ms());
+}
+
+static clk::duration get_min_interval() {
+    return gossiper::INTERVAL;
 }

 void arrival_window::add(clk::time_point value, const gms::inet_address& ep) {
    if (_tlast > clk::time_point::min()) {
        auto inter_arrival_time = value - _tlast;
-        if (inter_arrival_time <= get_max_interval()) {
+        if (inter_arrival_time <= get_max_interval() && inter_arrival_time >= get_min_interval()) {
            _arrival_intervals.add(inter_arrival_time.count());
        } else  {
-            logger.debug("failure_detector: Ignoring interval time of {} for {}", inter_arrival_time.count(), ep);
+            logger.debug("failure_detector: Ignoring interval time of {} for {}, mean={}, size={}", inter_arrival_time.count(), ep, mean(), size());
        }
    } else {
        // We use a very large initial interval since the "right" average depends on the cluster size
@@ -145,39 +127,27 @@ std::map<sstring, sstring> failure_detector::get_simple_states() {
        auto& state = entry.second;
        std::stringstream ss;
        ss << ep;
-        if (state.is_alive())
+
+        if (state.is_alive()) {
            nodes_status.emplace(sstring(ss.str()), "UP");
-        else
+        } else {
            nodes_status.emplace(sstring(ss.str()), "DOWN");
+        }
    }
    return nodes_status;
 }

 int failure_detector::get_down_endpoint_count() {
-    int count = 0;
-    for (auto& entry : get_local_gossiper().endpoint_state_map) {
-        auto& state = entry.second;
-        if (!state.is_alive()) {
-            count++;
-        }
-    }
-    return count;
+    return get_local_gossiper().endpoint_state_map.size() - get_up_endpoint_count();
 }

 int failure_detector::get_up_endpoint_count() {
-    int count = 0;
-    for (auto& entry : get_local_gossiper().endpoint_state_map) {
-        auto& state = entry.second;
-        if (state.is_alive()) {
-            count++;
-        }
-    }
-    return count;
+    return boost::count_if(get_local_gossiper().endpoint_state_map | boost::adaptors::map_values, std::mem_fn(&endpoint_state::is_alive));
 }

 sstring failure_detector::get_endpoint_state(sstring address) {
    std::stringstream ss;
-    auto eps = get_local_gossiper().get_endpoint_state_for_endpoint(inet_address(address));
+    auto* eps = get_local_gossiper().get_endpoint_state_for_endpoint_ptr(inet_address(address));
    if (eps) {
        append_endpoint_state(ss, *eps);
        return sstring(ss.str());
@@ -186,7 +156,7 @@ sstring failure_detector::get_endpoint_state(sstring address) {
    }
 }

-void failure_detector::append_endpoint_state(std::stringstream& ss, endpoint_state& state) {
+void failure_detector::append_endpoint_state(std::stringstream& ss, const endpoint_state& state) {
    ss << "  generation:" << state.get_heart_beat_state().get_generation() << "\n";
    ss << "  heartbeat:" << state.get_heart_beat_state().get_heart_beat_version() << "\n";
    for (const auto& entry : state.get_application_state_map()) {
--- a/gms/failure_detector.hh
+++ b/gms/failure_detector.hh
@@ -58,7 +58,7 @@ class endpoint_state;

 class arrival_window {
 public:
-    using clk = std::chrono::system_clock;
+    using clk = seastar::lowres_system_clock;
 private:
    clk::time_point _tlast{clk::time_point::min()};
    utils::bounded_stats_deque _arrival_intervals;
@@ -87,6 +87,8 @@ public:
    // see CASSANDRA-2597 for an explanation of the math at work here.
    double phi(clk::time_point tnow);

+    size_t size() { return _arrival_intervals.size(); }
+
    friend std::ostream& operator<<(std::ostream& os, const arrival_window& w);

 };
@@ -154,7 +156,7 @@ public:
    }

 private:
-    void append_endpoint_state(std::stringstream& ss, endpoint_state& state);
+    void append_endpoint_state(std::stringstream& ss, const endpoint_state& state);

 public:
    /**
--- a/gms/feature.hh
+++ b/gms/feature.hh
@@ -21,6 +21,8 @@

 #pragma once

+#include <seastar/core/shared_future.hh>
+
 namespace gms {

 /**
@@ -31,19 +33,16 @@ namespace gms {
 */
 class feature final {
    sstring _name;
-    bool _enabled;
+    bool _enabled = false;
+    mutable shared_promise<> _pr;
    friend class gossiper;
 public:
    explicit feature(sstring name, bool enabled = false);
+    feature() = default;
    ~feature();
-    feature()
-            : _enabled(false)
-    { }
-    feature(const feature& other)
-            : feature(other._name, other._enabled)
-    { }
+    feature(const feature& other) = delete;
    void enable();
-    feature& operator=(feature other);
+    feature& operator=(feature&& other);
    const sstring& name() const {
        return _name;
    }
@@ -53,6 +52,7 @@ public:
    friend inline std::ostream& operator<<(std::ostream& os, const feature& f) {
        return os << "{ gossip feature = " << f._name << " }";
    }
+    future<> when_enabled() const { return _pr.get_shared_future(); }
 };

 } // namespace gms
--- a/gms/gossip_digest_ack.hh
+++ b/gms/gossip_digest_ack.hh
@@ -68,7 +68,11 @@ public:
        return _digests;
    }

-    std::map<inet_address, endpoint_state> get_endpoint_state_map() const {
+    std::map<inet_address, endpoint_state>& get_endpoint_state_map() {
+        return _map;
+    }
+
+    const std::map<inet_address, endpoint_state>& get_endpoint_state_map() const {
        return _map;
    }

--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
--- a/gms/gossiper.hh
+++ b/gms/gossiper.hh
@@ -49,6 +49,7 @@
 #include "gms/application_state.hh"
 #include "gms/endpoint_state.hh"
 #include "gms/feature.hh"
+#include "utils/loading_shared_values.hh"
 #include "message/messaging_service_fwd.hh"
 #include <boost/algorithm/string.hpp>
 #include <experimental/optional>
@@ -80,9 +81,9 @@ class i_failure_detector;
 * Upon hearing a GossipShutdownMessage, this module will instantly mark the remote node as down in
 * the Failure Detector.
 */
-class gossiper : public i_failure_detection_event_listener, public seastar::async_sharded_service<gossiper> {
+class gossiper : public i_failure_detection_event_listener, public seastar::async_sharded_service<gossiper>, public seastar::peering_sharded_service<gossiper> {
 public:
-    using clk = std::chrono::system_clock;
+    using clk = seastar::lowres_system_clock;
 private:
    using messaging_verb = netw::messaging_verb;
    using messaging_service = netw::messaging_service;
@@ -105,6 +106,7 @@ private:
    std::set<inet_address> _seeds_from_config;
    sstring _cluster_name;
    semaphore _callback_running{1};
+    semaphore _apply_state_locally_semaphore{100};
 public:
    future<> timer_callback_lock() { return _callback_running.wait(); }
    void timer_callback_unlock() { _callback_running.signal(); }
@@ -118,10 +120,18 @@ public:
    void set_seeds(std::set<inet_address> _seeds);
 public:
    static clk::time_point inline now() { return clk::now(); }
+public:
+    using endpoint_locks_map = utils::loading_shared_values<inet_address, semaphore>;
+    struct endpoint_permit {
+        endpoint_locks_map::entry_ptr _ptr;
+        semaphore_units<> _units;
+    };
+    future<endpoint_permit> lock_endpoint(inet_address);
 public:
    /* map where key is the endpoint and value is the state associated with the endpoint */
    std::unordered_map<inet_address, endpoint_state> endpoint_state_map;
-    std::unordered_map<inet_address, endpoint_state> shadow_endpoint_state_map;
+    // Used for serializing changes to endpoint_state_map and running of associated change listeners.
+    endpoint_locks_map endpoint_locks;

    const std::vector<sstring> DEAD_STATES = {
        versioned_value::REMOVING_TOKEN,
@@ -192,7 +202,7 @@ private:
    std::unordered_set<inet_address> _pending_mark_alive_endpoints;

    /* unreachable member set */
-    std::map<inet_address, clk::time_point> _unreachable_endpoints;
+    std::unordered_map<inet_address, clk::time_point> _unreachable_endpoints;

    /* initial seeds for joining the cluster */
    std::set<inet_address> _seeds;
@@ -209,10 +219,19 @@ private:

    clk::time_point _last_processed_message_at = now();

-    std::map<inet_address, clk::time_point> _shadow_unreachable_endpoints;
+    std::unordered_map<inet_address, clk::time_point> _shadow_unreachable_endpoints;
    std::vector<inet_address> _shadow_live_endpoints;

    void run();
+    // Replicates given endpoint_state to all other shards.
+    // The state doesn't have to be kept alive until the returned future completes.
+    future<> replicate(inet_address, const endpoint_state&);
+    // Replicates "states" from "src" to all other shards.
+    // "src" and "states" must be kept alive until the returned future completes and must not change.
+    future<> replicate(inet_address, const std::map<application_state, versioned_value>& src, const std::vector<application_state>& states);
+    // Replicates given value to all other shards.
+    // The value must be kept alive until the returned future completes and must not change.
+    future<> replicate(inet_address, application_state key, const versioned_value& value);
 public:
    gossiper();

@@ -384,7 +403,15 @@ private:
 public:
    clk::time_point get_expire_time_for_endpoint(inet_address endpoint);

-    std::experimental::optional<endpoint_state> get_endpoint_state_for_endpoint(inet_address ep) const;
+    const endpoint_state* get_endpoint_state_for_endpoint_ptr(inet_address ep) const;
+    endpoint_state& get_endpoint_state(inet_address ep);
+
+    endpoint_state* get_endpoint_state_for_endpoint_ptr(inet_address ep);
+
+    const versioned_value* get_application_state_ptr(inet_address endpoint, application_state appstate) const;
+
+    // Use with caution, copies might be expensive (see #764)
+    stdx::optional<endpoint_state> get_endpoint_state_for_endpoint(inet_address ep) const;

    // removes ALL endpoint states; should only be called after shadow gossip
    void reset_endpoint_state_map();
@@ -393,8 +420,6 @@ public:

    bool uses_host_id(inet_address endpoint);

-    bool uses_vnodes(inet_address endpoint);
-
    utils::UUID get_host_id(inet_address endpoint);

    std::experimental::optional<endpoint_state> get_state_for_version_bigger_than(inet_address for_endpoint, int version);
@@ -404,10 +429,10 @@ public:
     */
    int compare_endpoint_startup(inet_address addr1, inet_address addr2);

-    void notify_failure_detector(std::map<inet_address, endpoint_state> remoteEpStateMap);
+    void notify_failure_detector(const std::map<inet_address, endpoint_state>& remoteEpStateMap);


-    void notify_failure_detector(inet_address endpoint, endpoint_state remote_endpoint_state);
+    void notify_failure_detector(inet_address endpoint, const endpoint_state& remote_endpoint_state);

 private:
    void mark_alive(inet_address addr, endpoint_state& local_state);
@@ -425,10 +450,10 @@ private:
    void handle_major_state_change(inet_address ep, const endpoint_state& eps);

 public:
-    bool is_alive(inet_address ep);
+    bool is_alive(inet_address ep) const;
    bool is_dead_state(const endpoint_state& eps) const;

-    future<> apply_state_locally(const std::map<inet_address, endpoint_state>& map);
+    future<> apply_state_locally(std::map<inet_address, endpoint_state> map);

 private:
    void apply_new_states(inet_address addr, endpoint_state& local_state, const endpoint_state& remote_state);
@@ -488,11 +513,11 @@ public:
    future<> do_stop_gossiping();

 public:
-    bool is_enabled();
+    bool is_enabled() const;

    void finish_shadow_round();

-    bool is_in_shadow_round();
+    bool is_in_shadow_round() const;

    void goto_shadow_round();

@@ -504,7 +529,9 @@ public:
    void dump_endpoint_state_map();
    void debug_show();
 public:
+    bool is_seed(const inet_address& endpoint) const;
    bool is_shutdown(const inet_address& endpoint) const;
+    bool is_normal(const inet_address& endpoint) const;
    bool is_silent_shutdown_state(const endpoint_state& ep_state) const;
    void mark_as_shutdown(const inet_address& endpoint);
    void force_newer_generation();
@@ -534,10 +561,10 @@ private:
 public:
    void check_knows_remote_features(sstring local_features_string) const;
    void check_knows_remote_features(sstring local_features_string, std::unordered_map<inet_address, sstring> peer_features_string) const;
+    void maybe_enable_features();
 private:
    void register_feature(feature* f);
    void unregister_feature(feature* f);
-    void maybe_enable_features();
 private:
    seastar::metrics::metric_groups _metrics;
 };
--- a/idl/frozen_schema.idl.hh
+++ b/idl/frozen_schema.idl.hh
@@ -27,8 +27,9 @@ class schema_mutations {
    canonical_mutation columnfamilies_canonical_mutation();
    canonical_mutation columns_canonical_mutation();
    bool is_view()[[version 1.6]];
-    std::experimental::optional<canonical_mutation> indices_canonical_mutation()[[version 1.9]];
-    std::experimental::optional<canonical_mutation> dropped_columns_canonical_mutation()[[version 1.9]];
+    std::experimental::optional<canonical_mutation> indices_canonical_mutation()[[version 2.0]];
+    std::experimental::optional<canonical_mutation> dropped_columns_canonical_mutation()[[version 2.0]];
+    std::experimental::optional<canonical_mutation> scylla_tables_canonical_mutation()[[version 2.0]];
 };

 class schema stub [[writable]] {
--- a/keys.hh
+++ b/keys.hh
@@ -182,6 +182,9 @@ public:
    static TopLevel from_exploded(const schema& s, const std::vector<bytes>& v) {
        return from_exploded(v);
    }
+    static TopLevel from_exploded_view(const std::vector<bytes_view>& v) {
+        return from_exploded(v);
+    }

    // We don't allow optional values, but provide this method as an efficient adaptor
    static TopLevel from_optional_exploded(const schema& s, const std::vector<bytes_opt>& v) {
--- a/locator/ec2_multi_region_snitch.cc
+++ b/locator/ec2_multi_region_snitch.cc
@@ -100,7 +100,6 @@ future<> ec2_multi_region_snitch::gossiper_starting() {
    // Note: currently gossiper "main" instance always runs on CPU0 therefore
    // this function will be executed on CPU0 only.
    //
-    ec2_snitch::gossiper_starting();

    using namespace gms;
    auto& g = get_local_gossiper();
--- a/locator/production_snitch_base.hh
+++ b/locator/production_snitch_base.hh
@@ -126,14 +126,9 @@ private:
    sstring get_endpoint_info(inet_address endpoint, gms::application_state key,
                              const sstring& default_val) {
        gms::gossiper& local_gossiper = gms::get_local_gossiper();
-        auto state = local_gossiper.get_endpoint_state_for_endpoint(endpoint);
-
-        // First, look in the gossiper::endpoint_state_map...
-        if (state) {
-            auto ep_state = state->get_application_state(key);
-            if (ep_state) {
-                return ep_state->value;
-            }
+        auto* ep_state = local_gossiper.get_application_state_ptr(endpoint, key);
+        if (ep_state) {
+            return ep_state->value;
        }

        // ...if not found - look in the SystemTable...
--- a/locator/reconnectable_snitch_helper.hh
+++ b/locator/reconnectable_snitch_helper.hh
@@ -56,7 +56,7 @@ private:

 private:

-    void reconnect(gms::inet_address public_address, gms::versioned_value local_address_value) {
+    void reconnect(gms::inet_address public_address, const gms::versioned_value& local_address_value) {
        reconnect(public_address, gms::inet_address(local_address_value.value));
    }

@@ -97,10 +97,9 @@ public:
    }

    void on_join(gms::inet_address endpoint, gms::endpoint_state ep_state) override {
-        auto internal_ip_state_opt = ep_state.get_application_state(gms::application_state::INTERNAL_IP);
-
-        if (internal_ip_state_opt) {
-            reconnect(endpoint, *internal_ip_state_opt);
+        auto* internal_ip_state = ep_state.get_application_state_ptr(gms::application_state::INTERNAL_IP);
+        if (internal_ip_state) {
+            reconnect(endpoint, *internal_ip_state);
        }
    }

@@ -111,11 +110,7 @@ public:
    }

    void on_alive(gms::inet_address endpoint, gms::endpoint_state ep_state) override {
-        auto internal_ip_state_opt = ep_state.get_application_state(gms::application_state::INTERNAL_IP);
-
-        if (internal_ip_state_opt) {
-            reconnect(endpoint, *internal_ip_state_opt);
-        }
+        on_join(std::move(endpoint), std::move(ep_state));
    }

    void on_dead(gms::inet_address endpoint, gms::endpoint_state ep_state) override {
--- a/locator/token_metadata.cc
+++ b/locator/token_metadata.cc
@@ -110,7 +110,11 @@ void token_metadata::update_normal_tokens(std::unordered_map<inet_address, std::
        inet_address endpoint = i.first;
        std::unordered_set<token>& tokens = i.second;

-        assert(!tokens.empty());
+        if (tokens.empty()) {
+            auto msg = sprint("tokens is empty in update_normal_tokens");
+            tlogger.error("{}", msg);
+            throw std::runtime_error(msg);
+        }

        for(auto it = _token_to_endpoint_map.begin(), ite = _token_to_endpoint_map.end(); it != ite;) {
            if(it->second == endpoint) {
@@ -141,7 +145,11 @@ void token_metadata::update_normal_tokens(std::unordered_map<inet_address, std::
 }

 size_t token_metadata::first_token_index(const token& start) const {
-    assert(_sorted_tokens.size() > 0);
+    if (_sorted_tokens.empty()) {
+        auto msg = sprint("sorted_tokens is empty in first_token_index!");
+        tlogger.error("{}", msg);
+        throw std::runtime_error(msg);
+    }
    auto it = std::lower_bound(_sorted_tokens.begin(), _sorted_tokens.end(), start);
    if (it == _sorted_tokens.end()) {
        return 0;
@@ -292,7 +300,11 @@ void token_metadata::add_bootstrap_tokens(std::unordered_set<token> tokens, inet
 }

 void token_metadata::remove_bootstrap_tokens(std::unordered_set<token> tokens) {
-    assert(!tokens.empty());
+    if (tokens.empty()) {
+        auto msg = sprint("tokens is empty in remove_bootstrap_tokens!");
+        tlogger.error("{}", msg);
+        throw std::runtime_error(msg);
+    }
    for (auto t : tokens) {
        _bootstrap_tokens.erase(t);
    }
@@ -320,7 +332,11 @@ void token_metadata::remove_from_moving(inet_address endpoint) {
 token token_metadata::get_predecessor(token t) {
    auto& tokens = sorted_tokens();
    auto it = std::lower_bound(tokens.begin(), tokens.end(), t);
-    assert(it != tokens.end() && *it == t);
+    if (it == tokens.end() || *it != t) {
+        auto msg = sprint("token error in get_predecessor!");
+        tlogger.error("{}", msg);
+        throw std::runtime_error(msg);
+    }
    if (it == tokens.begin()) {
        // If the token is the first element, its preprocessor is the last element
        return tokens.back();
--- a/main.cc
+++ b/main.cc
@@ -59,6 +59,8 @@ thread_local disk_error_signal_type commit_error;
 thread_local disk_error_signal_type general_disk_error;
 seastar::metrics::metric_groups app_metrics;

+using namespace std::chrono_literals;
+
 namespace bpo = boost::program_options;

 static boost::filesystem::path relative_conf_dir(boost::filesystem::path path) {
@@ -277,7 +279,10 @@ int main(int ac, char** av) {
    }
    runtime::init_uptime();
    std::setvbuf(stdout, nullptr, _IOLBF, 1000);
-    app_template app;
+    app_template::config app_cfg;
+    app_cfg.name = "Scylla";
+    app_cfg.default_task_quota = 500us;
+    app_template app(std::move(app_cfg));
    auto opt_add = app.add_options();

    auto cfg = make_lw_shared<db::config>();
@@ -529,12 +534,12 @@ int main(int ac, char** av) {
            db::get_batchlog_manager().start(std::ref(qp)).get();
            // #293 - do not stop anything
            // engine().at_exit([] { return db::get_batchlog_manager().stop(); });
-            sstables::init_metrics();
+            sstables::init_metrics().get();

            db::system_keyspace::minimal_setup(db, qp);

            // schema migration, if needed, is also done on shard 0
-            db::legacy_schema_migrator::migrate(qp.local()).get();
+            db::legacy_schema_migrator::migrate(proxy, qp.local()).get();

            supervisor::notify("loading sstables");

@@ -625,13 +630,13 @@ int main(int ac, char** av) {
            lb->start_broadcasting();
            service::get_local_storage_service().set_load_broadcaster(lb);
            engine().at_exit([lb = std::move(lb)] () mutable { return lb->stop_broadcasting(); });
+            supervisor::notify("starting cf cache hit rate calculator");
            cf_cache_hitrate_calculator.start(std::ref(db), std::ref(cf_cache_hitrate_calculator)).get();
            engine().at_exit([&cf_cache_hitrate_calculator] { return cf_cache_hitrate_calculator.stop(); });
            cf_cache_hitrate_calculator.local().run_on(engine().cpu_id());
-            supervisor::notify("starting native transport");
-            gms::get_local_gossiper().wait_for_gossip_to_settle();
+            gms::get_local_gossiper().wait_for_gossip_to_settle().get();
            api::set_server_gossip_settle(ctx).get();
-            supervisor::notify("starting cf cache hit rate calculator");
+            supervisor::notify("starting native transport");
            service::get_local_storage_service().start_native_transport().get();
            if (start_thrift) {
                service::get_local_storage_service().start_rpc_server().get();
--- a/memtable-sstable.hh
+++ b/memtable-sstable.hh
@@ -29,11 +29,13 @@
 #include "sstables/sstables.hh"
 #include <seastar/core/future.hh>
 #include <seastar/core/file.hh>
+#include <seastar/core/thread.hh>

 future<>
 write_memtable_to_sstable(memtable& mt,
        sstables::shared_sstable sst,
        bool backup = false,
        const io_priority_class& pc = default_priority_class(),
-        bool leave_unsealed = false);
+        bool leave_unsealed = false,
+        seastar::thread_scheduling_group* tsg = nullptr);

--- a/message/messaging_service.cc
+++ b/message/messaging_service.cc
@@ -514,7 +514,6 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
    }();

    auto remote_addr = ipv4_addr(get_preferred_ip(id.addr).raw_addr(), must_encrypt ? _ssl_port : _port);
-    auto local_addr = ipv4_addr{_listen_address.raw_addr(), 0};

    rpc::client_options opts;
    // send keepalive messages each minute if connection is idle, drop connection after 10 failures
@@ -526,9 +525,9 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge

    auto client = must_encrypt ?
                    ::make_shared<rpc_protocol_client_wrapper>(*_rpc, std::move(opts),
-                                    remote_addr, local_addr, _credentials) :
+                                    remote_addr, ipv4_addr(), _credentials) :
                    ::make_shared<rpc_protocol_client_wrapper>(*_rpc, std::move(opts),
-                                    remote_addr, local_addr);
+                                    remote_addr);

    it = _clients[idx].emplace(id, shard_info(std::move(client))).first;
    uint32_t src_cpu_id = engine().cpu_id();
@@ -640,59 +639,6 @@ auto send_message_timeout(messaging_service* ms, messaging_verb verb, msg_addr i
    });
 }

-template <typename MsgIn, typename... MsgOut>
-auto send_message_timeout_and_retry(messaging_service* ms, messaging_verb verb, msg_addr id,
-        std::chrono::seconds timeout, int nr_retry, std::chrono::seconds wait, MsgOut... msg) {
-    using MsgInTuple = typename futurize_t<MsgIn>::value_type;
-    return do_with(int(nr_retry), std::move(msg)..., [ms, verb, id, timeout, wait, nr_retry] (auto& retry, const auto&... messages) {
-        return repeat_until_value([ms, verb, id, timeout, wait, nr_retry, &retry, &messages...] {
-            return send_message_timeout<MsgIn>(ms, verb, id, timeout, messages...).then_wrapped(
-                    [ms, verb, id, timeout, wait, nr_retry, &retry] (auto&& f) mutable {
-                auto vb = int(verb);
-                try {
-                    MsgInTuple ret = f.get();
-                    if (retry != nr_retry) {
-                        mlogger.info("Retry verb={} to {}, retry={}: OK", vb, id, retry);
-                    }
-                    return make_ready_future<stdx::optional<MsgInTuple>>(std::move(ret));
-                } catch (rpc::timeout_error) {
-                    mlogger.info("Retry verb={} to {}, retry={}: timeout in {} seconds", vb, id, retry, timeout.count());
-                    throw;
-                } catch (rpc::closed_error) {
-                    mlogger.info("Retry verb={} to {}, retry={}: {}", vb, id, retry, std::current_exception());
-                    // Stop retrying if retry reaches 0 or message service is shutdown
-                    // or the remote node is removed from gossip (on_remove())
-                    retry--;
-                    if (retry == 0) {
-                        mlogger.debug("Retry verb={} to {}, retry={}: stop retrying: retry == 0", vb, id, retry);
-                        throw;
-                    }
-                    if (ms->is_stopping()) {
-                        mlogger.debug("Retry verb={} to {}, retry={}: stop retrying: messaging_service is stopped",
-                                     vb, id, retry);
-                        throw;
-                    }
-                    if (!gms::get_local_gossiper().is_known_endpoint(id.addr)) {
-                        mlogger.debug("Retry verb={} to {}, retry={}: stop retrying: node is removed from the cluster",
-                                     vb, id, retry);
-                        throw;
-                    }
-                    return sleep_abortable(wait).then([] {
-                        return make_ready_future<stdx::optional<MsgInTuple>>(stdx::nullopt);
-                    }).handle_exception([vb, id, retry] (std::exception_ptr ep) {
-                        mlogger.debug("Retry verb={} to {}, retry={}: stop retrying: {}", vb, id, retry, ep);
-                        return make_exception_future<stdx::optional<MsgInTuple>>(ep);
-                    });
-                } catch (...) {
-                    throw;
-                }
-            });
-        }).then([ms = ms->shared_from_this()] (MsgInTuple result) {
-            return futurize<MsgIn>::from_tuple(std::move(result));
-        });
-    });
-}
-
 // Send one way message for verb
 template <typename... MsgOut>
 auto send_message_oneway(messaging_service* ms, messaging_verb verb, msg_addr id, MsgOut&&... msg) {
@@ -707,13 +653,6 @@ auto send_message_oneway_timeout(messaging_service* ms, Timeout timeout, messagi

 // Wrappers for verbs

-// Retransmission parameters for streaming verbs.
-// A stream plan gives up retrying in 10*30 + 10*60 seconds (15 minutes) at
-// most, 10*30 seconds (5 minutes) at least.
-static constexpr int streaming_nr_retry = 10;
-static constexpr std::chrono::seconds streaming_timeout{10*60};
-static constexpr std::chrono::seconds streaming_wait_before_retry{30};
-
 // PREPARE_MESSAGE
 void messaging_service::register_prepare_message(std::function<future<streaming::prepare_message> (const rpc::client_info& cinfo,
        streaming::prepare_message msg, UUID plan_id, sstring description)>&& func) {
@@ -721,8 +660,7 @@ void messaging_service::register_prepare_message(std::function<future<streaming:
 }
 future<streaming::prepare_message> messaging_service::send_prepare_message(msg_addr id, streaming::prepare_message msg, UUID plan_id,
        sstring description) {
-    return send_message_timeout_and_retry<streaming::prepare_message>(this, messaging_verb::PREPARE_MESSAGE, id,
-        streaming_timeout, streaming_nr_retry, streaming_wait_before_retry,
+    return send_message<streaming::prepare_message>(this, messaging_verb::PREPARE_MESSAGE, id,
        std::move(msg), plan_id, std::move(description));
 }

@@ -731,8 +669,7 @@ void messaging_service::register_prepare_done_message(std::function<future<> (co
    register_handler(this, messaging_verb::PREPARE_DONE_MESSAGE, std::move(func));
 }
 future<> messaging_service::send_prepare_done_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id) {
-    return send_message_timeout_and_retry<void>(this, messaging_verb::PREPARE_DONE_MESSAGE, id,
-        streaming_timeout, streaming_nr_retry, streaming_wait_before_retry,
+    return send_message<void>(this, messaging_verb::PREPARE_DONE_MESSAGE, id,
        plan_id, dst_cpu_id);
 }

@@ -741,8 +678,7 @@ void messaging_service::register_stream_mutation(std::function<future<> (const r
    register_handler(this, messaging_verb::STREAM_MUTATION, std::move(func));
 }
 future<> messaging_service::send_stream_mutation(msg_addr id, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, bool fragmented) {
-    return send_message_timeout_and_retry<void>(this, messaging_verb::STREAM_MUTATION, id,
-        streaming_timeout, streaming_nr_retry, streaming_wait_before_retry,
+    return send_message<void>(this, messaging_verb::STREAM_MUTATION, id,
        plan_id, std::move(fm), dst_cpu_id, fragmented);
 }

@@ -757,19 +693,17 @@ void messaging_service::register_stream_mutation_done(std::function<future<> (co
    });
 }
 future<> messaging_service::send_stream_mutation_done(msg_addr id, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id) {
-    return send_message_timeout_and_retry<void>(this, messaging_verb::STREAM_MUTATION_DONE, id,
-        streaming_timeout, streaming_nr_retry, streaming_wait_before_retry,
+    return send_message<void>(this, messaging_verb::STREAM_MUTATION_DONE, id,
        plan_id, std::move(ranges), cf_id, dst_cpu_id);
 }

 // COMPLETE_MESSAGE
-void messaging_service::register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func) {
+void messaging_service::register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id, rpc::optional<bool> failed)>&& func) {
    register_handler(this, messaging_verb::COMPLETE_MESSAGE, std::move(func));
 }
-future<> messaging_service::send_complete_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id) {
-    return send_message_timeout_and_retry<void>(this, messaging_verb::COMPLETE_MESSAGE, id,
-        streaming_timeout, streaming_nr_retry, streaming_wait_before_retry,
-        plan_id, dst_cpu_id);
+future<> messaging_service::send_complete_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id, bool failed) {
+    return send_message<void>(this, messaging_verb::COMPLETE_MESSAGE, id,
+        plan_id, dst_cpu_id, failed);
 }

 void messaging_service::register_gossip_echo(std::function<future<> ()>&& func) {
@@ -835,7 +769,7 @@ future<> messaging_service::send_definitions_update(msg_addr id, std::vector<fro
    return send_message_oneway(this, messaging_verb::DEFINITIONS_UPDATE, std::move(id), std::move(fm));
 }

-void messaging_service::register_migration_request(std::function<future<std::vector<frozen_mutation>> ()>&& func) {
+void messaging_service::register_migration_request(std::function<future<std::vector<frozen_mutation>> (const rpc::client_info&)>&& func) {
    register_handler(this, netw::messaging_verb::MIGRATION_REQUEST, std::move(func));
 }
 void messaging_service::unregister_migration_request() {
--- a/message/messaging_service.hh
+++ b/message/messaging_service.hh
@@ -249,8 +249,8 @@ public:
    void register_stream_mutation_done(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id)>&& func);
    future<> send_stream_mutation_done(msg_addr id, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id);

-    void register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func);
-    future<> send_complete_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id);
+    void register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id, rpc::optional<bool> failed)>&& func);
+    future<> send_complete_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id, bool failed = false);

    // Wrapper for REPAIR_CHECKSUM_RANGE verb
    void register_repair_checksum_range(std::function<future<partition_checksum> (sstring keyspace, sstring cf, dht::token_range range, rpc::optional<repair_checksum> hash_version)>&& func);
@@ -288,7 +288,7 @@ public:
    future<> send_definitions_update(msg_addr id, std::vector<frozen_mutation> fm);

    // Wrapper for MIGRATION_REQUEST
-    void register_migration_request(std::function<future<std::vector<frozen_mutation>> ()>&& func);
+    void register_migration_request(std::function<future<std::vector<frozen_mutation>> (const rpc::client_info&)>&& func);
    void unregister_migration_request();
    future<std::vector<frozen_mutation>> send_migration_request(msg_addr id);

--- a/mutation_partition.cc
+++ b/mutation_partition.cc
@@ -248,17 +248,14 @@ mutation_partition::mutation_partition(const mutation_partition& x, const schema
            for (const rows_entry& e : x.range(schema, r)) {
                _rows.insert(_rows.end(), *current_allocator().construct<rows_entry>(e), rows_entry::compare(schema));
            }
+            for (auto&& rt : x._row_tombstones.slice(schema, r)) {
+                _row_tombstones.apply(schema, rt);
+            }
        }
    } catch (...) {
        _rows.clear_and_dispose(current_deleter<rows_entry>());
        throw;
    }
-
-    for(auto&& r : ck_ranges) {
-        for (auto&& rt : x._row_tombstones.slice(schema, r)) {
-            _row_tombstones.apply(schema, rt);
-        }
-    }
 }

 mutation_partition::mutation_partition(mutation_partition&& x, const schema& schema,
@@ -932,15 +929,6 @@ rows_entry::equal(const schema& s, const rows_entry& other) const {
    return equal(s, other, s);
 }

-position_in_partition_view rows_entry::position() const {
-    if (_flags._last) {
-        return position_in_partition_view::after_all_clustered_rows();
-    } else {
-        return position_in_partition_view(
-            position_in_partition_view::clustering_row_tag_t(), _key);
-    }
-}
-
 bool
 rows_entry::equal(const schema& s, const rows_entry& other, const schema& other_schema) const {
    position_in_partition::equal_compare eq(s);
@@ -2119,7 +2107,7 @@ public:

 mutation_partition::mutation_partition(mutation_partition::incomplete_tag, const schema& s, tombstone t)
    : _tombstone(t)
-    , _static_row_continuous(false)
+    , _static_row_continuous(!s.has_static_columns())
    , _rows()
    , _row_tombstones(s)
 {
--- a/mutation_partition.hh
+++ b/mutation_partition.hh
@@ -712,7 +712,15 @@ public:
    const deletable_row& row() const {
        return _row;
    }
-    position_in_partition_view position() const;
+    position_in_partition_view position() const {
+        if (_flags._last) {
+            return position_in_partition_view::after_all_clustered_rows();
+        } else {
+            return position_in_partition_view(
+                    position_in_partition_view::clustering_row_tag_t(), _key);
+        }
+    }
+
    is_continuous continuous() const { return is_continuous(_flags._continuous); }
    void set_continuous(bool value) { _flags._continuous = value; }
    void set_continuous(is_continuous value) { set_continuous(bool(value)); }
--- a/mutation_partition_serializer.cc
+++ b/mutation_partition_serializer.cc
@@ -62,8 +62,14 @@ auto write_counter_cell(Writer&& writer, atomic_cell_view c)
            counter_cell_view ccv(c);
            auto shards = std::move(value).start_value_counter_cell_full()
                                          .start_shards();
-            for (auto csv : ccv.shards()) {
-                shards.add_shards(counter_shard(csv));
+            if (service::get_local_storage_service().cluster_supports_correct_counter_order()) {
+                for (auto csv : ccv.shards()) {
+                    shards.add_shards(counter_shard(csv));
+                }
+            } else {
+                for (auto& cs : ccv.shards_compatible_with_1_7_4()) {
+                    shards.add_shards(cs);
+                }
            }
            return std::move(shards).end_shards().end_counter_cell_full();
        }
--- a/mutation_partition_view.cc
+++ b/mutation_partition_view.cc
@@ -73,8 +73,9 @@ atomic_cell read_atomic_cell(atomic_cell_variant cv)
                    // TODO: a lot of copying for something called view
                    counter_cell_builder ccb; // we know the final number of shards
                    for (auto csv : ccv.shards()) {
-                        ccb.add_shard(counter_shard(csv));
+                        ccb.add_maybe_unsorted_shard(counter_shard(csv));
                    }
+                    ccb.sort_and_remove_duplicates();
                    return ccb.build(_created_at);
                }
                atomic_cell operator()(ser::counter_cell_update_view& ccv) const {
--- a/partition_slice_builder.cc
+++ b/partition_slice_builder.cc
@@ -105,7 +105,7 @@ partition_slice_builder::with_regular_column(bytes name) {
        throw std::runtime_error(sprint("No such column: %s", _schema.regular_column_name_type()->to_string(name)));
    }
    if (!def->is_regular()) {
-        throw std::runtime_error(sprint("Column is not regular: %s", _schema.regular_column_name_type()->to_string(name)));
+        throw std::runtime_error(sprint("Column is not regular: %s", _schema.column_name_type(*def)->to_string(name)));
    }
    _regular_columns->push_back(def->id);
    return *this;
--- a/partition_snapshot_reader.hh
+++ b/partition_snapshot_reader.hh
@@ -41,9 +41,17 @@ inline void maybe_merge_versions(lw_shared_ptr<partition_snapshot>& snp,
    with_allocator(lsa_region.allocator(), [&snp, &lsa_region, &read_section] {
        return with_linearized_managed_bytes([&snp, &lsa_region, &read_section] {
            try {
-                read_section(lsa_region, [&snp] {
-                    snp->merge_partition_versions();
-                });
+                // Allocating sections require the region to be reclaimable
+                // which means that they cannot be nested.
+                // It is, however, possible, that if the snapshot is taken
+                // inside an allocating section and then an exception is thrown
+                // this function will be called to clean up even though we
+                // still will be in the context of the allocating section.
+                if (lsa_region.reclaiming_enabled()) {
+                    read_section(lsa_region, [&snp] {
+                        snp->merge_partition_versions();
+                    });
+                }
            } catch (...) { }
            snp = {};
        });
--- a/partition_snapshot_row_cursor.hh
+++ b/partition_snapshot_row_cursor.hh
@@ -34,6 +34,8 @@
 // When the cursor is invalidated, it still maintains its previous position. It can be brought
 // back to validity by calling maybe_refresh(), or advance_to().
 //
+// Insertion of row entries after cursor's position invalidates the cursor.
+//
 class partition_snapshot_row_cursor final {
    struct position_in_version {
        mutation_partition::rows_type::iterator it;
@@ -55,6 +57,7 @@ class partition_snapshot_row_cursor final {
    logalloc::region& _region;
    partition_snapshot& _snp;
    std::vector<position_in_version> _heap;
+    std::vector<mutation_partition::rows_type::iterator> _iterators;
    std::vector<position_in_version> _current_row;
    position_in_partition _position;
    uint64_t _last_reclaim_count = 0;
@@ -78,13 +81,16 @@ public:
        , _snp(snp)
        , _position(position_in_partition::static_row_tag_t{})
    { }
-    bool has_up_to_date_row_from_latest_version() const {
-        return up_to_date() && _current_row[0].version_no == 0;
+    bool has_valid_row_from_latest_version() const {
+        return iterators_valid() && _current_row[0].version_no == 0;
    }
    mutation_partition::rows_type::iterator get_iterator_in_latest_version() const {
-        return _current_row[0].it;
+        return _iterators[0];
    }
-    bool up_to_date() const {
+
+    // Returns true iff the iterators obtained since the cursor was last made valid
+    // are still valid. Note that this doesn't mean that the cursor itself is valid.
+    bool iterators_valid() const {
        return _region.reclaim_counter() == _last_reclaim_count && _last_versions_count == _snp.version_count();
    }

@@ -97,9 +103,40 @@ public:
    //
    // but avoids work if not necessary.
    bool maybe_refresh() {
-        if (!up_to_date()) {
+        if (!iterators_valid()) {
            return advance_to(_position);
        }
+        // Refresh latest version's iterator in case there was an insertion
+        // before it and after cursor's position. There cannot be any
+        // insertions for non-latest versions, so we don't have to update them.
+        if (_current_row[0].version_no != 0) {
+            rows_entry::compare less(_schema);
+            position_in_partition::equal_compare eq(_schema);
+            position_in_version::less_compare heap_less(_schema);
+            auto& rows = _snp.version()->partition().clustered_rows();
+            auto it = _iterators[0] = rows.lower_bound(_position, less);
+            auto heap_i = boost::find_if(_heap, [](auto&& v) { return v.version_no == 0; });
+            if (it == rows.end()) {
+                if (heap_i != _heap.end()) {
+                    _heap.erase(heap_i);
+                    boost::range::make_heap(_heap, heap_less);
+                }
+            } else if (eq(_position, it->position())) {
+                _current_row.insert(_current_row.begin(), position_in_version{it, rows.end(), 0});
+                if (heap_i != _heap.end()) {
+                    _heap.erase(heap_i);
+                    boost::range::make_heap(_heap, heap_less);
+                }
+            } else {
+                if (heap_i != _heap.end()) {
+                    heap_i->it = it;
+                    boost::range::make_heap(_heap, heap_less);
+                } else {
+                    _heap.push_back({it, rows.end(), 0});
+                    boost::range::push_heap(_heap, heap_less);
+                }
+            }
+        }
        return true;
    }

@@ -119,11 +156,13 @@ public:
        position_in_version::less_compare heap_less(_schema);
        _heap.clear();
        _current_row.clear();
+        _iterators.clear();
        int version_no = 0;
        for (auto&& v : _snp.versions()) {
            auto& rows = v.partition().clustered_rows();
            auto pos = rows.lower_bound(lower_bound, less);
            auto end = rows.end();
+            _iterators.push_back(pos);
            if (pos != end) {
                _heap.push_back({pos, end, version_no});
            }
@@ -142,9 +181,10 @@ public:
    // Can be only called on a valid cursor pointing at a row.
    bool next() {
        position_in_version::less_compare heap_less(_schema);
-        assert(up_to_date());
+        assert(iterators_valid());
        for (auto&& curr : _current_row) {
            ++curr.it;
+            _iterators[curr.version_no] = curr.it;
            if (curr.it != curr.end) {
                _heap.push_back(curr);
                boost::range::push_heap(_heap, heap_less);
@@ -168,12 +208,14 @@ public:
    const clustering_key& key() const { return _current_row[0].it->key(); }

    // Can be called only when cursor is valid and pointing at a row.
-    clustering_row row() const {
-        clustering_row result(key());
-        for (auto&& v : _current_row) {
-            result.apply(_schema, *v.it);
+    mutation_fragment row() const {
+        auto it = _current_row.begin();
+        auto mf = mutation_fragment(clustering_row(*it->it));
+        auto& cr = mf.as_mutable_clustering_row();
+        for (++it; it != _current_row.end(); ++it) {
+            cr.apply(_schema, *it->it);
        }
-        return result;
+        return mf;
    }

    // Can be called when cursor is pointing at a row, even when invalid.
@@ -184,6 +226,32 @@ public:
    bool is_in_latest_version() const;
    bool previous_row_in_latest_version_has_key(const clustering_key_prefix& key) const;
    void set_continuous(bool val);
+
+    friend std::ostream& operator<<(std::ostream& out, const partition_snapshot_row_cursor& cur) {
+        out << "{cursor: position=" << cur._position << ", ";
+        if (!cur.iterators_valid()) {
+            return out << " iterators invalid}";
+        }
+        out << "current=[";
+        bool first = true;
+        for (auto&& v : cur._current_row) {
+            if (!first) {
+                out << ", ";
+            }
+            first = false;
+            out << v.version_no;
+        }
+        out << "], heap=[";
+        first = true;
+        for (auto&& v : cur._heap) {
+            if (!first) {
+                out << ", ";
+            }
+            first = false;
+            out << "{v=" << v.version_no << ", pos=" << v.it->position() << "}";
+        }
+        return out << "]}";
+    };
 };

 inline
@@ -198,8 +266,8 @@ bool partition_snapshot_row_cursor::previous_row_in_latest_version_has_key(const
    }
    auto prev_it = _current_row[0].it;
    --prev_it;
-    clustering_key_prefix::tri_compare tri_comp(_schema);
-    return tri_comp(prev_it->key(), key) == 0;
+    clustering_key_prefix::equality eq(_schema);
+    return eq(prev_it->key(), key);
 }

 inline
--- a/partition_version.cc
+++ b/partition_version.cc
@@ -478,9 +478,9 @@ void partition_entry::apply_to_incomplete(const schema& s, partition_version* ve
        }
        range_tombstone_list& tombstones = dst.partition().row_tombstones();
        if (can_move) {
-            tombstones.apply_reversibly(s, current->partition().row_tombstones()).cancel();
+            tombstones.apply_monotonically(s, std::move(current->partition().row_tombstones()));
        } else {
-            tombstones.apply(s, current->partition().row_tombstones());
+            tombstones.apply_monotonically(s, current->partition().row_tombstones());
        }
        current = current->next();
    }
@@ -545,13 +545,19 @@ lw_shared_ptr<partition_snapshot> partition_entry::read(schema_ptr entry_schema,
 std::vector<range_tombstone>
 partition_snapshot::range_tombstones(const schema& s, position_in_partition_view start, position_in_partition_view end)
 {
+    partition_version* v = &*version();
+    if (!v->next()) {
+        return boost::copy_range<std::vector<range_tombstone>>(
+            v->partition().row_tombstones().slice(s, start, end));
+    }
    range_tombstone_list list(s);
-    for (auto&& v : versions()) {
-        for (auto&& rt : v.partition().row_tombstones().slice(s, start, end)) {
+    while (v) {
+        for (auto&& rt : v->partition().row_tombstones().slice(s, start, end)) {
            list.apply(s, rt);
        }
+        v = v->next();
    }
-    return boost::copy_range<std::vector<range_tombstone>>(list);
+    return boost::copy_range<std::vector<range_tombstone>>(list.slice(s, start, end));
 }

 std::ostream& operator<<(std::ostream& out, partition_entry& e) {
--- a/range.hh
+++ b/range.hh
@@ -352,10 +352,10 @@ public:
            return *this;
        }
    }
-    template<typename Transformer, typename U = typename std::result_of<Transformer(T)>::type>
-    static stdx::optional<typename wrapping_range<U>::bound> transform_bound(optional<bound> b, Transformer&& transformer) {
+    template<typename Bound, typename Transformer, typename U = typename std::result_of<Transformer(T)>::type>
+    static stdx::optional<typename wrapping_range<U>::bound> transform_bound(Bound&& b, Transformer&& transformer) {
        if (b) {
-            return { { transformer(std::move(*b).value()), b->is_inclusive() } };
+            return { { transformer(std::forward<Bound>(b).value().value()), b->is_inclusive() } };
        };
        return {};
    }
--- a/range_tombstone_list.cc
+++ b/range_tombstone_list.cc
@@ -58,9 +58,10 @@ void range_tombstone_list::apply_reversibly(const schema& s,
        insert_from(s, std::move(it), std::move(start), start_kind, std::move(end), end_kind, std::move(tomb), rev);
        return;
    }
-    auto rt = current_allocator().construct<range_tombstone>(
-            std::move(start), start_kind, std::move(end), end_kind, std::move(tomb));
+    auto rt = alloc_strategy_unique_ptr<range_tombstone>(current_allocator().construct<range_tombstone>(
+            std::move(start), start_kind, std::move(end), end_kind, std::move(tomb)));
    rev.insert(_tombstones.end(), *rt);
+    rt.release();
 }

 /*
@@ -104,9 +105,11 @@ void range_tombstone_list::insert_from(const schema& s,
            if (it->tomb == tomb && end_bound.adjacent(s, it->start_bound())) {
                rev.update(it, {std::move(start), start_kind, it->end, it->end_kind, tomb});
            } else {
-                auto rt = current_allocator().construct<range_tombstone>(std::move(start), start_kind, std::move(end),
-                    end_kind, tomb);
+                auto rt = alloc_strategy_unique_ptr<range_tombstone>(
+                    current_allocator().construct<range_tombstone>(std::move(start), start_kind, std::move(end),
+                    end_kind, tomb));
                rev.insert(it, *rt);
+                rt.release();
            }
            return;
        }
@@ -121,6 +124,7 @@ void range_tombstone_list::insert_from(const schema& s,
            if (less(end_bound, it->end_bound())) {
                end = it->end;
                end_kind = it->end_kind;
+                end_bound = bound_view(end, end_kind);
            }
            it = rev.erase(it);
        } else if (c > 0) {
@@ -133,7 +137,8 @@ void range_tombstone_list::insert_from(const schema& s,
                    auto rt = alloc_strategy_unique_ptr<range_tombstone>(
                        current_allocator().construct<range_tombstone>(it->start_bound(), new_end, it->tomb));
                    rev.update(it, {start_bound, it->end_bound(), it->tomb});
-                    rev.insert(it, *rt.release());
+                    rev.insert(it, *rt);
+                    rt.release();
                }
            }

@@ -142,7 +147,8 @@ void range_tombstone_list::insert_from(const schema& s,
                auto rt = alloc_strategy_unique_ptr<range_tombstone>(
                    current_allocator().construct<range_tombstone>(std::move(start), start_kind, end, end_kind, std::move(tomb)));
                rev.update(it, {std::move(end), invert_kind(end_kind), it->end, it->end_kind, it->tomb});
-                rev.insert(it, *rt.release());
+                rev.insert(it, *rt);
+                rt.release();
                return;
            }

@@ -157,16 +163,18 @@ void range_tombstone_list::insert_from(const schema& s,
                    // Here start < it->start and it->start < end.
                    auto new_end_kind = invert_kind(it->start_kind);
                    if (!less(bound_view(it->start, new_end_kind), start_bound)) {
-                        auto rt = current_allocator().construct<range_tombstone>(
-                                std::move(start), start_kind, it->start, new_end_kind, tomb);
+                        auto rt = alloc_strategy_unique_ptr<range_tombstone>(current_allocator().construct<range_tombstone>(
+                                std::move(start), start_kind, it->start, new_end_kind, tomb));
                        it = rev.insert(it, *rt);
+                        rt.release();
                        ++it;
                    }
                } else {
                    // Here start < it->start and end <= it->start, so just insert the new tombstone.
-                    auto rt = current_allocator().construct<range_tombstone>(
-                            std::move(start), start_kind, std::move(end), end_kind, std::move(tomb));
+                    auto rt = alloc_strategy_unique_ptr<range_tombstone>(current_allocator().construct<range_tombstone>(
+                            std::move(start), start_kind, std::move(end), end_kind, std::move(tomb)));
                    rev.insert(it, *rt);
+                    rt.release();
                    return;
                }
            }
@@ -184,9 +192,10 @@ void range_tombstone_list::insert_from(const schema& s,
    }

    // If we got here, then just insert the remainder at the end.
-    auto rt = current_allocator().construct<range_tombstone>(
-            std::move(start), start_kind, std::move(end), end_kind, std::move(tomb));
+    auto rt = alloc_strategy_unique_ptr<range_tombstone>(current_allocator().construct<range_tombstone>(
+            std::move(start), start_kind, std::move(end), end_kind, std::move(tomb)));
    rev.insert(it, *rt);
+    rt.release();
 }

 range_tombstone_list::range_tombstones_type::iterator range_tombstone_list::find(const schema& s, const range_tombstone& rt) {
@@ -355,6 +364,7 @@ range_tombstone_list::reverter::insert(range_tombstones_type::iterator it, range

 range_tombstone_list::range_tombstones_type::iterator
 range_tombstone_list::reverter::erase(range_tombstones_type::iterator it) {
+    _ops.reserve(_ops.size() + 1);
    _ops.emplace_back(erase_undo_op(*it));
    return _dst._tombstones.erase(it);
 }
@@ -413,3 +423,27 @@ bool range_tombstone_list::equal(const schema& s, const range_tombstone_list& ot
        return rt1.equal(s, rt2);
    });
 }
+
+void range_tombstone_list::apply_monotonically(const schema& s, range_tombstone_list&& list) {
+    auto del = current_deleter<range_tombstone>();
+    auto it = list.begin();
+    while (it != list.end()) {
+        // FIXME: Optimize by stealing the entry
+        apply_monotonically(s, *it);
+        it = list._tombstones.erase_and_dispose(it, del);
+    }
+}
+
+void range_tombstone_list::apply_monotonically(const schema& s, const range_tombstone_list& list) {
+    for (auto&& rt : list) {
+        apply_monotonically(s, rt);
+    }
+}
+
+void range_tombstone_list::apply_monotonically(const schema& s, const range_tombstone& rt) {
+    // FIXME: Optimize given this has relaxed exception guarantees.
+    // Note that apply() doesn't have monotonic guarantee because it doesn't restore erased entries.
+    reverter rev(s, *this);
+    apply_reversibly(s, rt.start, rt.start_kind, rt.end, rt.end_kind, rt.tomb, rev);
+    rev.cancel();
+}
--- a/range_tombstone_list.hh
+++ b/range_tombstone_list.hh
@@ -138,6 +138,19 @@ public:
        nop_reverter rev(s, *this);
        apply_reversibly(s, std::move(start), start_kind, std::move(end), end_kind, std::move(tomb), rev);
    }
+    // Monotonic exception guarantees. In case of failure the object will contain at least as much information as before the call.
+    void apply_monotonically(const schema& s, const range_tombstone& rt);
+    // Merges another list with this object.
+    // Monotonic exception guarantees. In case of failure the object will contain at least as much information as before the call.
+    void apply_monotonically(const schema& s, const range_tombstone_list& list);
+    /// Merges another list with this object.
+    /// The other list must be governed by the same allocator as this object.
+    ///
+    /// Monotonic exception guarantees. In case of failure the object will contain at least as much information as before the call.
+    /// The other list will be left in a state such that it would still commute with this object to the same state as it
+    /// would if the call didn't fail.
+    void apply_monotonically(const schema& s, range_tombstone_list&& list);
+public:
    tombstone search_tombstone_covering(const schema& s, const clustering_key_prefix& key) const;
    // Returns range of tombstones which overlap with given range
    boost::iterator_range<const_iterator> slice(const schema& s, const query::clustering_range&) const;
--- a/read_context.hh
+++ b/read_context.hh
@@ -71,7 +71,11 @@ public:
                _range = std::move(*new_range);
                _last_key = {};
            }
+            if (_reader) {
+                ++_cache._tracker._stats.underlying_recreations;
+            }
            auto& snap = _cache.snapshot_for_phase(phase);
+            _reader = {}; // See issue #2644
            _reader = _cache.create_underlying_reader(_read_context, snap, _range);
            _reader_creation_phase = phase;
        }
@@ -90,8 +94,14 @@ public:
        _range = std::move(range);
        _last_key = { };
        _new_last_key = { };
-        if (_reader && _reader_creation_phase == phase) {
-            return _reader->fast_forward_to(_range);
+        if (_reader) {
+            if (_reader_creation_phase == phase) {
+                ++_cache._tracker._stats.underlying_partition_skips;
+                return _reader->fast_forward_to(_range);
+            } else {
+                ++_cache._tracker._stats.underlying_recreations;
+                _reader = {}; // See issue #2644
+            }
        }
        _reader = _cache.create_underlying_reader(_read_context, snapshot, _range);
        _reader_creation_phase = phase;
@@ -121,6 +131,7 @@ class read_context final : public enable_lw_shared_from_this<read_context> {
    mutation_reader::forwarding _fwd_mr;
    bool _range_query;
    autoupdating_underlying_reader _underlying;
+    uint64_t _underlying_created = 0;

    // When reader enters a partition, it must be set up for reading that
    // partition from the underlying mutation source (_sm) in one of two ways:
@@ -155,7 +166,18 @@ public:
        , _fwd_mr(fwd_mr)
        , _range_query(!range.is_singular() || !range.start()->value().has_key())
        , _underlying(_cache, *this)
-    { }
+    {
+        ++_cache._tracker._stats.reads;
+    }
+    ~read_context() {
+        ++_cache._tracker._stats.reads_done;
+        if (_underlying_created) {
+            _cache._stats.reads_with_misses.mark();
+            ++_cache._tracker._stats.reads_with_misses;
+        } else {
+            _cache._stats.reads_with_no_misses.mark();
+        }
+    }
    read_context(const read_context&) = delete;
    row_cache& cache() { return _cache; }
    const schema_ptr& schema() const { return _schema; }
@@ -169,6 +191,7 @@ public:
    autoupdating_underlying_reader& underlying() { return _underlying; }
    row_cache::phase_type phase() const { return _phase; }
    const dht::decorated_key& key() const { return _sm->decorated_key(); }
+    void on_underlying_created() { ++_underlying_created; }
 private:
    future<> create_sm();
    future<> ensure_sm_created() {
@@ -198,9 +221,17 @@ public:
    // Fast forwards the underlying streamed_mutation to given range.
    future<> fast_forward_to(position_range range) {
        return ensure_sm_created().then([this, range = std::move(range)] () mutable {
+            ++_cache._tracker._stats.underlying_row_skips;
            return _sm->fast_forward_to(std::move(range));
        });
    }
+    // Returns the underlying streamed_mutation.
+    // The caller has to ensure that the streamed mutation was already created
+    // (e.g. the most recent call to enter_partition(const dht::decorated_key&, ...)
+    // was followed by a call to fast_forward_to()).
+    streamed_mutation& get_streamed_mutation() noexcept {
+        return *_sm;
+    }
    // Gets the next fragment from the underlying streamed_mutation
    future<mutation_fragment_opt> get_next_fragment() {
        return ensure_sm_created().then([this] {
--- a/repair/repair.cc
+++ b/repair/repair.cc
@@ -41,11 +41,6 @@

 static logging::logger rlogger("repair");

-struct failed_range {
-    sstring cf;
-    ::dht::token_range range;
-};
-
 class repair_info {
 public:
    seastar::sharded<database>& db;
@@ -56,7 +51,7 @@ public:
    shard_id shard;
    std::vector<sstring> data_centers;
    std::vector<sstring> hosts;
-    std::vector<failed_range> failed_ranges;
+    size_t nr_failed_ranges = 0;
    // Map of peer -> <cf, ranges>
    std::unordered_map<gms::inet_address, std::unordered_map<sstring, dht::token_range_vector>> ranges_need_repair_in;
    std::unordered_map<gms::inet_address, std::unordered_map<sstring, dht::token_range_vector>> ranges_need_repair_out;
@@ -132,14 +127,11 @@ public:
        });
    }
    void check_failed_ranges() {
-        if (failed_ranges.empty()) {
-            rlogger.info("repair {} on shard {} completed successfully", id, shard);
+        if (nr_failed_ranges) {
+            rlogger.info("repair {} on shard {} failed - {} ranges failed", id, shard, nr_failed_ranges);
+            throw std::runtime_error(sprint("repair %d on shard %d failed to do checksum for %d sub ranges", id, shard, nr_failed_ranges));
        } else {
-            rlogger.info("repair {} on shard {} failed - {} ranges failed", id, shard, failed_ranges.size());
-            for (auto& frange: failed_ranges) {
-                rlogger.info("repair cf {} range {} failed", frange.cf, frange.range);
-            }
-            throw std::runtime_error(sprint("repair %d on shard %d failed to do checksum for %d sub ranges", id, shard, failed_ranges.size()));
+            rlogger.info("repair {} on shard {} completed successfully", id, shard);
        }
    }
    future<> request_transfer_ranges(const sstring& cf,
@@ -504,6 +496,19 @@ static future<partition_checksum> checksum_range_shard(database &db,
    });
 }

+// It is counter-productive to allow a large number of range checksum
+// operations to proceed in parallel (on the same shard), because the read
+// operation can already parallelize itself as much as needed, and doing
+// multiple reads in parallel just adds a lot of memory overheads.
+// So checksum_parallelism_semaphore is used to limit this parallelism,
+// and should be set to 1, or another small number.
+//
+// Note that checksum_parallelism_semaphore applies not just in the
+// repair master, but also in the slave: The repair slave may receive many
+// checksum requests in parallel, but will only work on one or a few
+// (checksum_parallelism_semaphore) at once.
+static thread_local semaphore checksum_parallelism_semaphore(2);
+
 // Calculate the checksum of the data held on all shards of a column family,
 // in the given token range.
 // In practice, we only need to consider one or two shards which intersect the
@@ -526,7 +531,9 @@ future<partition_checksum> checksum_range(seastar::sharded<database> &db,
            auto& prs = shard_range.second;
            return db.invoke_on(shard, [keyspace, cf, prs = std::move(prs), hash_version] (database& db) mutable {
                return do_with(std::move(keyspace), std::move(cf), std::move(prs), [&db, hash_version] (auto& keyspace, auto& cf, auto& prs) {
-                    return checksum_range_shard(db, keyspace, cf, prs, hash_version);
+                    return seastar::with_semaphore(checksum_parallelism_semaphore, 1, [&db, hash_version, &keyspace, &cf, &prs] {
+                        return checksum_range_shard(db, keyspace, cf, prs, hash_version);
+                    });
                });
            }).then([&result] (partition_checksum sum) {
                result.add(sum);
@@ -537,14 +544,15 @@ future<partition_checksum> checksum_range(seastar::sharded<database> &db,
    });
 }

-// We don't need to wait for one checksum to finish before we start the
-// next, but doing too many of these operations in parallel also doesn't
-// make sense, so we limit the number of concurrent ongoing checksum
-// requests with a semaphore.
-//
-// FIXME: We shouldn't use a magic number here, but rather bind it to
-// some resource. Otherwise we'll be doing too little in some machines,
-// and too much in others.
+// parallelism_semaphore limits the number of parallel ongoing checksum
+// comparisons. This could mean, for example, that this number of checksum
+// requests have been sent to other nodes and we are waiting for them to
+// return so we can compare those to our own checksums. This limit can be
+// set fairly high because the outstanding comparisons take only a few
+// resources. In particular, we do NOT do this number of file reads in
+// parallel because file reads have large memory overheads (read buffers,
+// partitions, etc.) - the number of concurrent reads is further limited
+// by an additional semaphore checksum_parallelism_semaphore (see above).
 //
 // FIXME: This would be better of in a repair service, or even a per-shard
 // repair instance holding all repair state. However, since we are anyway
@@ -576,7 +584,6 @@ static future<uint64_t> estimate_partitions(seastar::sharded<database>& db, cons
 static future<> repair_cf_range(repair_info& ri,
        sstring cf, ::dht::token_range range,
        const std::vector<gms::inet_address>& neighbors) {
-    ri.ranges_index++;
    if (neighbors.empty()) {
        // Nothing to do in this case...
        return make_ready_future<>();
@@ -584,8 +591,6 @@ static future<> repair_cf_range(repair_info& ri,

    return estimate_partitions(ri.db, ri.keyspace, cf, range).then([&ri, cf, range, &neighbors] (uint64_t estimated_partitions) {
    range_splitter ranges(range, estimated_partitions, ri.target_partitions);
-    rlogger.info("Repair {} out of {} ranges, id={}, shard={}, keyspace={}, cf={}, range={}, target_partitions={}, estimated_partitions={}",
-            ri.ranges_index, ri.ranges.size(), ri.id, ri.shard, ri.keyspace, cf, range, ri.target_partitions, estimated_partitions);
    return do_with(seastar::gate(), true, std::move(cf), std::move(ranges),
        [&ri, &neighbors] (auto& completion, auto& success, const auto& cf, auto& ranges) {
        return do_until([&ranges] () { return !ranges.has_next(); },
@@ -626,7 +631,7 @@ static future<> repair_cf_range(repair_info& ri,
                                 utils::fb_utilities::get_broadcast_address()),
                                checksums[i].get_exception());
                            success = false;
-                            ri.failed_ranges.push_back(failed_range{cf, range});
+                            ri.nr_failed_ranges++;
                            // Do not break out of the loop here, so we can log
                            // (and discard) all the exceptions.
                        } else if (i > 0) {
@@ -751,7 +756,7 @@ static future<> repair_cf_range(repair_info& ri,
                    // any case, we need to remember that the repair failed to
                    // tell the caller.
                    success = false;
-                    ri.failed_ranges.push_back(failed_range{cf, range});
+                    ri.nr_failed_ranges++;
                    rlogger.warn("Failed sync of range {}: {}", range, eptr);
                }).finally([&completion] {
                    parallelism_semaphore.signal(1);
@@ -997,8 +1002,22 @@ static future<> repair_ranges(repair_info ri) {
        // repair all the ranges in sequence
        return do_for_each(ri.ranges, [&ri] (auto&& range) {
    #endif
-            check_in_shutdown();
-            return repair_range(ri, range);
+            ri.ranges_index++;
+            rlogger.info("Repair {} out of {} ranges, id={}, shard={}, keyspace={}, table={}, range={}",
+                ri.ranges_index, ri.ranges.size(), ri.id, ri.shard, ri.keyspace, ri.cfs, range);
+            return do_with(dht::selective_token_range_sharder(range, ri.shard), [&ri] (auto& sharder) {
+                return repeat([&ri, &sharder] () {
+                    check_in_shutdown();
+                    auto range_shard = sharder.next();
+                    if (range_shard) {
+                        return repair_range(ri, *range_shard).then([] {
+                            return make_ready_future<stop_iteration>(stop_iteration::no);
+                        });
+                    } else {
+                        return make_ready_future<stop_iteration>(stop_iteration::yes);
+                    }
+                });
+            });
        }).then([&ri] {
            // Do streaming for the remaining ranges we do not stream in
            // repair_cf_range
@@ -1013,27 +1032,6 @@ static future<> repair_ranges(repair_info ri) {
    });
 }

-static void split_and_add(std::vector<::dht::token_range>& ranges,
-        const dht::token_range& range) {
-    // The use of minimum_token() here twice is not a typo - because wrap-
-    // around token ranges are supported by midpoint(), the beyond-maximum
-    // token can also be represented by minimum_token().
-    auto midpoint = dht::global_partitioner().midpoint(
-            range.start() ? range.start()->value() : dht::minimum_token(),
-            range.end() ? range.end()->value() : dht::minimum_token());
-    // This shouldn't happen, but if the range included just one token, we
-    // can't split further (split() may actually fail with assertion failure)
-    if ((range.start() && midpoint == range.start()->value()) ||
-        (range.end() && midpoint == range.end()->value())) {
-        ranges.push_back(range);
-        return;
-    }
-    auto halves = range.split(midpoint, dht::token_comparator());
-    ranges.push_back(halves.first);
-    ranges.push_back(halves.second);
-}
-
-
 // repair_start() can run on any cpu; It runs on cpu0 the function
 // do_repair_start(). The benefit of always running that function on the same
 // CPU is that it allows us to keep some state (like a list of ongoing
@@ -1053,6 +1051,10 @@ static int do_repair_start(seastar::sharded<database>& db, sstring keyspace,
    rlogger.info("starting user-requested repair for keyspace {}, repair id {}, options {}", keyspace, id, options_map);
    repair_tracker.start(id);

+    if (!gms::get_local_gossiper().is_normal(utils::fb_utilities::get_broadcast_address())) {
+        throw std::runtime_error("Node is not in NORMAL status yet!");
+    }
+
    // If the "ranges" option is not explicitly specified, we repair all the
    // local ranges (the token ranges for which this node holds a replica of).
    // Each of these ranges may have a different set of replicas, so the
@@ -1125,35 +1127,12 @@ static int do_repair_start(seastar::sharded<database>& db, sstring keyspace,
        cfs = list_column_families(db.local(), keyspace);
    }

-    // Split the ranges so that we have more number of ranges than smp::count
-    // Note, the split is not a guaratnee when the range can not be split anmore.
-    dht::token_range_vector tosplit;
-    while (ranges.size() < smp::count) {
-        size_t sz = ranges.size();
-        tosplit.clear();
-        ranges.swap(tosplit);
-        for (const auto& range : tosplit) {
-            split_and_add(ranges, range);
-        }
-        if (sz == ranges.size()) {
-            // We can not split the ranges anymore
-            break;
-        }
-    }
-
-    std::map<shard_id, dht::token_range_vector> shard_ranges_map;
-    unsigned idx = 0;
-    for (auto& range : ranges) {
-        shard_ranges_map[idx++ % smp::count].push_back(std::move(range));
-    }

    std::vector<future<>> repair_results;
-    repair_results.reserve(shard_ranges_map.size());
+    repair_results.reserve(smp::count);

-    for (auto& x : shard_ranges_map) {
-        shard_id shard = x.first;
-        auto& ranges = x.second;
-        auto f = db.invoke_on(shard, [keyspace, cfs, id, ranges = std::move(ranges),
+    for (auto shard : boost::irange(unsigned(0), smp::count)) {
+        auto f = db.invoke_on(shard, [keyspace, cfs, id, ranges,
                data_centers = options.data_centers, hosts = options.hosts] (database& localdb) mutable {
            return repair_ranges(repair_info(service::get_local_storage_service().db(),
                    std::move(keyspace), std::move(ranges), std::move(cfs),
--- a/row_cache.cc
+++ b/row_cache.cc
@@ -46,6 +46,7 @@ thread_local seastar::thread_scheduling_group row_cache::_update_thread_scheduli

 mutation_reader
 row_cache::create_underlying_reader(read_context& ctx, mutation_source& src, const dht::partition_range& pr) {
+    ctx.on_underlying_created();
    return src(_schema, pr, ctx.slice(), ctx.pc(), ctx.trace_state(), streamed_mutation::forwarding::yes);
 }

@@ -74,7 +75,7 @@ cache_tracker::cache_tracker() {
            }
            evict_last(_lru);
            --_stats.partitions;
-            ++_stats.evictions;
+            ++_stats.partition_evictions;
            ++_stats.modification_count;
            return memory::reclaiming_result::reclaimed_something;
           } catch (std::bad_alloc&) {
@@ -98,15 +99,24 @@ cache_tracker::setup_metrics() {
    _metrics.add_group("cache", {
        sm::make_gauge("bytes_used", sm::description("current bytes used by the cache out of the total size of memory"), [this] { return _region.occupancy().used_space(); }),
        sm::make_gauge("bytes_total", sm::description("total size of memory for the cache"), [this] { return _region.occupancy().total_space(); }),
-        sm::make_derive("total_operations_hits", sm::description("total number of operation hits"), _stats.hits),
-        sm::make_derive("total_operations_misses", sm::description("total number of operation misses"), _stats.misses),
-        sm::make_derive("total_operations_insertions", sm::description("total number of operation insert"), _stats.insertions),
-        sm::make_derive("total_operations_concurrent_misses_same_key", sm::description("total number of operation with misses same key"), _stats.concurrent_misses_same_key),
-        sm::make_derive("total_operations_merges", sm::description("total number of operation merged"), _stats.merges),
-        sm::make_derive("total_operations_evictions", sm::description("total number of operation eviction"), _stats.evictions),
-        sm::make_derive("total_operations_removals", sm::description("total number of operation removals"), _stats.removals),
-        sm::make_derive("total_operations_mispopulations", sm::description("number of entries not inserted by reads"), _stats.mispopulations),
-        sm::make_gauge("objects_partitions", sm::description("total number of partition objects"), _stats.partitions)
+        sm::make_derive("partition_hits", sm::description("number of partitions needed by reads and found in cache"), _stats.partition_hits),
+        sm::make_derive("partition_misses", sm::description("number of partitions needed by reads and missing in cache"), _stats.partition_misses),
+        sm::make_derive("partition_insertions", sm::description("total number of partitions added to cache"), _stats.partition_insertions),
+        sm::make_derive("row_hits", sm::description("total number of rows needed by reads and found in cache"), _stats.row_hits),
+        sm::make_derive("row_misses", sm::description("total number of rows needed by reads and missing in cache"), _stats.row_misses),
+        sm::make_derive("row_insertions", sm::description("total number of rows added to cache"), _stats.row_insertions),
+        sm::make_derive("concurrent_misses_same_key", sm::description("total number of operation with misses same key"), _stats.concurrent_misses_same_key),
+        sm::make_derive("partition_merges", sm::description("total number of partitions merged"), _stats.partition_merges),
+        sm::make_derive("partition_evictions", sm::description("total number of evicted partitions"), _stats.partition_evictions),
+        sm::make_derive("partition_removals", sm::description("total number of invalidated partitions"), _stats.partition_removals),
+        sm::make_derive("mispopulations", sm::description("number of entries not inserted by reads"), _stats.mispopulations),
+        sm::make_gauge("partitions", sm::description("total number of cached partitions"), _stats.partitions),
+        sm::make_derive("reads", sm::description("number of started reads"), _stats.reads),
+        sm::make_derive("reads_with_misses", sm::description("number of reads which had to read from sstables"), _stats.reads_with_misses),
+        sm::make_gauge("active_reads", sm::description("number of currently active reads"), [this] { return _stats.active_reads(); }),
+        sm::make_derive("sstable_reader_recreations", sm::description("number of times sstable reader was recreated due to memtable flush"), _stats.underlying_recreations),
+        sm::make_derive("sstable_partition_skips", sm::description("number of times sstable reader was fast forwarded across partitions"), _stats.underlying_partition_skips),
+        sm::make_derive("sstable_row_skips", sm::description("number of times sstable reader was fast forwarded within a partition"), _stats.underlying_row_skips),
    });
 }

@@ -127,7 +137,7 @@ void cache_tracker::clear() {
        };
        clear(_lru);
    });
-    _stats.removals += _stats.partitions;
+    _stats.partition_removals += _stats.partitions;
    _stats.partitions = 0;
    ++_stats.modification_count;
 }
@@ -141,7 +151,7 @@ void cache_tracker::touch(cache_entry& e) {
 }

 void cache_tracker::insert(cache_entry& entry) {
-    ++_stats.insertions;
+    ++_stats.partition_insertions;
    ++_stats.partitions;
    ++_stats.modification_count;
    _lru.push_front(entry);
@@ -149,20 +159,28 @@ void cache_tracker::insert(cache_entry& entry) {

 void cache_tracker::on_erase() {
    --_stats.partitions;
-    ++_stats.removals;
+    ++_stats.partition_removals;
    ++_stats.modification_count;
 }

 void cache_tracker::on_merge() {
-    ++_stats.merges;
+    ++_stats.partition_merges;
 }

-void cache_tracker::on_hit() {
-    ++_stats.hits;
+void cache_tracker::on_partition_hit() {
+    ++_stats.partition_hits;
 }

-void cache_tracker::on_miss() {
-    ++_stats.misses;
+void cache_tracker::on_partition_miss() {
+    ++_stats.partition_misses;
+}
+
+void cache_tracker::on_row_hit() {
+    ++_stats.row_hits;
+}
+
+void cache_tracker::on_row_miss() {
+    ++_stats.row_misses;
 }

 void cache_tracker::on_mispopulate() {
@@ -348,14 +366,30 @@ void cache_tracker::clear_continuity(cache_entry& ce) {
    ce.set_continuous(false);
 }

-void row_cache::on_hit() {
-    _stats.hits.mark();
-    _tracker.on_hit();
+void row_cache::on_partition_hit() {
+    _tracker.on_partition_hit();
 }

-void row_cache::on_miss() {
+void row_cache::on_partition_miss() {
+    _tracker.on_partition_miss();
+}
+
+void row_cache::on_row_hit() {
+    _stats.hits.mark();
+    _tracker.on_row_hit();
+}
+
+void row_cache::on_mispopulate() {
+    _tracker.on_mispopulate();
+}
+
+void row_cache::on_row_miss() {
    _stats.misses.mark();
-    _tracker.on_miss();
+    _tracker.on_row_miss();
+}
+
+void row_cache::on_row_insert() {
+    ++_tracker._stats.row_insertions;
 }

 class range_populating_reader {
@@ -369,6 +403,7 @@ private:
    }
    void handle_end_of_stream() {
        if (!can_set_continuity()) {
+            _cache.on_mispopulate();
            return;
        }
        if (!_reader.range().end() || !_reader.range().end()->is_inclusive()) {
@@ -379,11 +414,15 @@ private:
                if (it == _cache._partitions.begin()) {
                    if (!_last_key->_key) {
                        it->set_continuous(true);
+                    } else {
+                        _cache.on_mispopulate();
                    }
                } else {
                    auto prev = std::prev(it);
                    if (prev->key().equal(*_cache._schema, *_last_key->_key)) {
                        it->set_continuous(true);
+                    } else {
+                        _cache.on_mispopulate();
                    }
                }
            }
@@ -403,17 +442,17 @@ public:
                    handle_end_of_stream();
                    return std::move(smopt);
                }
-                _cache.on_miss();
+                _cache.on_partition_miss();
                if (_reader.creation_phase() == _cache.phase_of(smopt->decorated_key())) {
                    return _cache._read_section(_cache._tracker.region(), [&] {
                        cache_entry& e = _cache.find_or_create(smopt->decorated_key(), smopt->partition_tombstone(), _reader.creation_phase(),
                            can_set_continuity() ? &*_last_key : nullptr);
-                        _last_key = smopt->decorated_key();
+                        _last_key = row_cache::previous_entry_pointer(smopt->decorated_key());
                        return e.read(_cache, _read_context, std::move(*smopt), _reader.creation_phase());
                    });
                } else {
                    _cache._tracker.on_mispopulate();
-                    _last_key = smopt->decorated_key();
+                    _last_key = row_cache::previous_entry_pointer(smopt->decorated_key());
                    return read_directly_from_underlying(std::move(*smopt), _read_context);
                }
            }
@@ -424,7 +463,7 @@ public:
        if (!pr.start()) {
            _last_key = row_cache::previous_entry_pointer();
        } else if (!pr.start()->is_inclusive() && pr.start()->value().has_key()) {
-            _last_key = pr.start()->value().as_decorated_key();
+            _last_key = row_cache::previous_entry_pointer(pr.start()->value().as_decorated_key());
        } else {
            // Inclusive start bound, cannot set continuity flag.
            _last_key = {};
@@ -448,7 +487,7 @@ private:
    streamed_mutation read_from_entry(cache_entry& ce) {
        _cache.upgrade_entry(ce);
        _cache._tracker.touch(ce);
-        _cache.on_hit();
+        _cache.on_partition_hit();
        return ce.read(_cache, *_read_context);
    }

@@ -469,7 +508,7 @@ private:
                    }
                    cache_entry& e = _primary.entry();
                    auto sm = read_from_entry(e);
-                    _lower_bound = {e.key(), false};
+                    _lower_bound = dht::partition_range::bound{e.key(), false};
                    // Delay the call to next() so that we don't see stale continuity on next invocation.
                    _advance_primary = true;
                    return streamed_mutation_opt(std::move(sm));
@@ -478,7 +517,7 @@ private:
                        cache_entry& e = _primary.entry();
                        _secondary_range = dht::partition_range(_lower_bound ? std::move(_lower_bound) : _pr->start(),
                            dht::partition_range::bound{e.key(), false});
-                        _lower_bound = {e.key(), true};
+                        _lower_bound = dht::partition_range::bound{e.key(), true};
                        _secondary_in_progress = true;
                        return stdx::nullopt;
                    } else {
@@ -487,7 +526,7 @@ private:
                        if (!range) {
                            return stdx::nullopt;
                        }
-                        _lower_bound = {dht::ring_position::max()};
+                        _lower_bound = dht::partition_range::bound{dht::ring_position::max()};
                        _secondary_range = std::move(*range);
                        _secondary_in_progress = true;
                        return stdx::nullopt;
@@ -570,10 +609,10 @@ row_cache::make_reader(schema_ptr s,
                cache_entry& e = *i;
                _tracker.touch(e);
                upgrade_entry(e);
-                on_hit();
+                on_partition_hit();
                return make_reader_returning(e.read(*this, *ctx));
            } else {
-                on_miss();
+                on_partition_miss();
                return make_mutation_reader<single_partition_populating_reader>(*this, std::move(ctx));
            }
          });
@@ -629,6 +668,8 @@ cache_entry& row_cache::do_find_or_create_entry(const dht::decorated_key& key,
                    || (previous->_key && i != _partitions.begin()
                        && std::prev(i)->key().equal(*_schema, *previous->_key))) {
                    i->set_continuous(true);
+                } else {
+                    on_mispopulate();
                }

                return *i;
@@ -642,6 +683,7 @@ cache_entry& row_cache::find_or_create(const dht::decorated_key& key, tombstone
        _tracker.insert(*entry);
        return _partitions.insert(i, *entry);
    }, [&] (auto i) { // visit
+        _tracker.on_miss_already_populated();
        cache_entry& e = *i;
        e.partition().open_version(*e.schema(), phase).partition().apply(t);
        _tracker.touch(e);
@@ -760,7 +802,7 @@ future<> row_cache::do_update(memtable& m, Updater updater) {
                            if (m.partitions.empty()) {
                                _prev_snapshot_pos = {};
                            } else {
-                                _prev_snapshot_pos = m.partitions.begin()->key();
+                                _prev_snapshot_pos = dht::ring_position(m.partitions.begin()->key());
                            }
                        });
                        STAP_PROBE1(scylla, row_cache_update_one_batch_end, quota_before - quota);
@@ -790,13 +832,12 @@ future<> row_cache::update(memtable& m, partition_presence_checker is_present) {
            entry.partition().apply_to_incomplete(*_schema, std::move(mem_e.partition()), *mem_e.schema());
            _tracker.touch(entry);
            _tracker.on_merge();
-        } else if (is_present(mem_e.key()) == partition_presence_checker_result::definitely_doesnt_exist) {
+        } else if (cache_i->continuous() || is_present(mem_e.key()) == partition_presence_checker_result::definitely_doesnt_exist) {
            cache_entry* entry = current_allocator().construct<cache_entry>(
                mem_e.schema(), std::move(mem_e.key()), std::move(mem_e.partition()));
+            entry->set_continuous(cache_i->continuous());
            _tracker.insert(*entry);
            _partitions.insert(cache_i, *entry);
-        } else {
-            _tracker.clear_continuity(*cache_i);
        }
    });
 }
@@ -815,6 +856,10 @@ future<> row_cache::update_invalidating(memtable& m) {
    });
 }

+void row_cache::refresh_snapshot() {
+    _underlying = _snapshot_source();
+}
+
 void row_cache::touch(const dht::decorated_key& dk) {
 _read_section(_tracker.region(), [&] {
  with_linearized_managed_bytes([&] {
--- a/row_cache.hh
+++ b/row_cache.hh
@@ -185,23 +185,35 @@ public:
    using lru_type = bi::list<cache_entry,
        bi::member_hook<cache_entry, cache_entry::lru_link_type, &cache_entry::_lru_link>,
        bi::constant_time_size<false>>; // we need this to have bi::auto_unlink on hooks.
-private:
-    // We will try to evict large partition after that many normal evictions
-    const uint32_t _normal_large_eviction_ratio = 1000;
-    // Number of normal evictions to perform before we try to evict large partition
-    uint32_t _normal_eviction_count = _normal_large_eviction_ratio;
 public:
+    friend class row_cache;
+    friend class cache::read_context;
+    friend class cache::autoupdating_underlying_reader;
+    friend class cache::cache_streamed_mutation;
    struct stats {
-        uint64_t hits;
-        uint64_t misses;
-        uint64_t insertions;
+        uint64_t partition_hits;
+        uint64_t partition_misses;
+        uint64_t row_hits;
+        uint64_t row_misses;
+        uint64_t partition_insertions;
+        uint64_t row_insertions;
        uint64_t concurrent_misses_same_key;
-        uint64_t merges;
-        uint64_t evictions;
-        uint64_t removals;
+        uint64_t partition_merges;
+        uint64_t partition_evictions;
+        uint64_t partition_removals;
        uint64_t partitions;
        uint64_t modification_count;
        uint64_t mispopulations;
+        uint64_t underlying_recreations;
+        uint64_t underlying_partition_skips;
+        uint64_t underlying_row_skips;
+        uint64_t reads;
+        uint64_t reads_with_misses;
+        uint64_t reads_done;
+
+        uint64_t active_reads() const {
+            return reads - reads_done; // started minus finished; the reverse order underflows uint64_t while any read is in flight
+        }
    };
 private:
    stats _stats{};
@@ -219,8 +231,10 @@ public:
    void clear_continuity(cache_entry& ce);
    void on_erase();
    void on_merge();
-    void on_hit();
-    void on_miss();
+    void on_partition_hit();
+    void on_partition_miss();
+    void on_row_hit();
+    void on_row_miss();
    void on_miss_already_populated();
    void on_mispopulate();
    allocation_strategy& allocator();
@@ -263,6 +277,8 @@ public:
    struct stats {
        utils::timed_rate_moving_average hits;
        utils::timed_rate_moving_average misses;
+        utils::timed_rate_moving_average reads_with_misses;
+        utils::timed_rate_moving_average reads_with_no_misses;
    };
 private:
    cache_tracker& _tracker;
@@ -313,8 +329,12 @@ private:
    logalloc::allocating_section _read_section;
    mutation_reader create_underlying_reader(cache::read_context&, mutation_source&, const dht::partition_range&);
    mutation_reader make_scanning_reader(const dht::partition_range&, lw_shared_ptr<cache::read_context>);
-    void on_hit();
-    void on_miss();
+    void on_partition_hit();
+    void on_partition_miss();
+    void on_row_hit();
+    void on_row_miss();
+    void on_row_insert();
+    void on_mispopulate();
    void upgrade_entry(cache_entry&);
    void invalidate_locked(const dht::decorated_key&);
    void invalidate_unwrapped(const dht::partition_range&);
@@ -422,6 +442,10 @@ public:
    // as few elements as possible.
    future<> update_invalidating(memtable&);

+    // Refreshes snapshot. Must only be used if logical state in the underlying data
+    // source hasn't changed.
+    void refresh_snapshot();
+
    // Moves given partition to the front of LRU if present in cache.
    void touch(const dht::decorated_key&);

@@ -449,7 +473,7 @@ public:
    // If it did, use invalidate() instead.
    void evict(const dht::partition_range& = query::full_partition_range);

-    auto num_entries() const {
+    size_t partitions() const {
        return _partitions.size();
    }
    const cache_tracker& get_cache_tracker() const {
--- a/Show More
+++ b/Show More
				`@@ -1 +0,0 @@`
				`options raid0 devices_discard_performance=Y`