Compare commits

...

166 Commits

Author SHA1 Message Date
Shlomi Livne
72494bbe05 release: prepare for 2.1.5
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2018-06-19 09:05:55 +03:00
Avi Kivity
5784823888 Update scylla-ami submodule
* dist/ami/files/scylla-ami c5d9e96...0df779d (1):
  > scylla_install_ami: Update CentOS to latest version

Fixes #3523.
2018-06-17 12:12:21 +03:00
Takuya ASADA
a7633be1a9 Revert "dist/ami: update CentOS base image to latest version"
This reverts commit 69d226625a.
Since ami-4bf3d731 is a Marketplace AMI, it is not possible to publish a public AMI based on it.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180523112414.27307-1-syuu@scylladb.com>
(cherry picked from commit 55d6be9254)
2018-06-17 11:33:55 +03:00
Takuya ASADA
e78ded74ce dist/debian: add --jobs <njobs> option just like build_rpm.sh
On some build environments we may want to limit the number of parallel jobs:
ninja-build runs ncpus jobs by default, which may be too many, since g++
consumes a lot of memory.
So support --jobs <njobs> just like the rpm build script does.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180425205439.30053-1-syuu@scylladb.com>
(cherry picked from commit 782ebcece4)
2018-06-14 15:05:09 +03:00
Avi Kivity
6615c2a6a9 database: stop using incremental selectors
There is a bug in incremental_selector for partitioned_sstable_set, so
until it is found, stop using it.

This degrades scan performance of Leveled Compaction Strategy tables.

Fixes #3513. (as a workaround)
Introduced: 2.1
Message-Id: <20180613131547.19084-1-avi@scylladb.com>

(cherry picked from commit aeffbb6732)
2018-06-14 10:52:39 +03:00
Vlad Zolotarov
11500ccd3a locator::ec2_multi_region_snitch: don't call for ec2_snitch::gossiper_starting()
ec2_snitch::gossiper_starting() calls the base class (default) method,
which sets _gossip_started to TRUE and thereby prevents the following
reconnectable_snitch_helper registration.

Fixes #3454

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1528208520-28046-1-git-send-email-vladz@scylladb.com>
(cherry picked from commit 2dde372ae6)
2018-06-14 10:52:39 +03:00
Shlomi Livne
955f3eeb56 release: prepare for 2.1.4
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2018-06-06 11:27:01 +03:00
Avi Kivity
08bfd96774 Update seastar submodule
* seastar 675acd5...2a2c1d2 (1):
  > tls: Ensure handshake always drains output before return/throw

Fixes #3461.
2018-05-31 12:06:13 +03:00
Mika Eloranta
f6c4d558eb build: fix rpm build script --jobs N handling
Fixes argument misquoting in the $SRPM_OPTS expansion for the mock commands
and makes the --jobs argument work as intended.

Signed-off-by: Mika Eloranta <mel@aiven.io>
Message-Id: <20180113212904.85907-1-mel@aiven.io>
(cherry picked from commit 7266446227)
2018-05-27 10:25:26 +03:00
Avi Kivity
0040ff6de2 Update seastar submodule
* seastar 0e6dcd5...675acd5 (1):
  > net/tls: Wait for output to be sent when shutting down

Fixes #3459.
2018-05-24 12:03:10 +03:00
Glauber Costa
c238bc7a81 commitlog: don't move pointer to segment
We are currently moving the pointer we acquired to the segment inside
the lambda in which we'll handle the cycle.

The problem is, we also use that same pointer inside the exception
handler. If an exception happens we'll access it and we'll crash.

Probably #3440.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20180518125820.10726-1-glauber@scylladb.com>
(cherry picked from commit 596a525950)
2018-05-19 19:13:58 +03:00
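The bug class described above can be sketched in a few lines. This is an illustrative reconstruction, not the actual commitlog code; `segment` and `handle_cycle` are hypothetical names. The fix amounts to capturing a copy of the shared pointer rather than moving it into the lambda, so the exception path can still use it.

```cpp
#include <cassert>
#include <memory>

struct segment { int id; };

int handle_cycle(std::shared_ptr<segment> seg) {
    // Capture a copy, not [s = std::move(seg)]: the code after the lambda
    // (e.g. an exception handler) may still need to dereference `seg`.
    auto cycle = [seg] { return seg->id; };
    int result = cycle();
    assert(seg != nullptr);  // still valid for the error path
    return result;
}
```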
Avi Kivity
3b984a4293 dist: redhat: get rid of raid0.devices_discard_performance
This parameter is not available on recent Red Hat kernels or on
non-Red Hat kernels (it was removed in 3.10.0-772.el7,
RHBZ 1455932). The presence of the parameter on kernels that don't
support it causes the module load to fail, with the result that the
storage is not available.

Fix by removing the parameter. For someone running an older Red Hat
kernel the effect will be that discard is disabled, but they can fix
that by updating the kernel. For someone running a newer kernel, the
effect will be that they can access their data.

Fixes #3437.
Message-Id: <20180516134913.6540-1-avi@scylladb.com>

(cherry picked from commit 3b8118d4e5)
2018-05-19 19:13:58 +03:00
Takuya ASADA
156761d77e dist/ami: update CentOS base image to latest version
Since we require an updated version of systemd, we need to update the CentOS
base image.

Fixes #3184

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1518118694-23770-1-git-send-email-syuu@scylladb.com>

Conflicts:
	dist/ami/build_ami.sh

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180508083521.18661-1-syuu@scylladb.com>
2018-05-19 19:13:58 +03:00
Avi Kivity
8e33e80ad3 release: prepare for 2.1.3 2018-04-25 09:01:30 +03:00
Duarte Nunes
c35dd86c87 db/schema_tables: Only drop UDTs after merging tables
Dropping a user type requires that all tables using that type also be
dropped. However, a type may appear to be dropped at the same time as
a table, for instance due to the order in which a node receives schema
notifications, or when dropping a keyspace.

When dropping a table, if we build a schema in a shard through a
global_schema_pointer, then we'll check for the existence of any user
type the schema employs. We thus need to ensure types are only dropped
after tables, similarly to how it's done for keyspaces.

Fixes #3068

Tests: unit-tests (release)

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20180129114137.85149-1-duarte@scylladb.com>
(cherry picked from commit 1e3fae5bef)
2018-04-25 01:15:25 +03:00
Pekka Enberg
87cb8a1fa4 release: prepare for 2.1.2 2018-04-17 09:45:00 +03:00
Takuya ASADA
26f3340c32 dist/debian: use ~root as HOME to place .pbuilderrc
When 'always_set_home' is specified in /etc/sudoers, pbuilder won't read
.pbuilderrc from the current user's home directory, and we don't have a way to
change that behavior via a sudo command parameter.

So let's use ~root/.pbuilderrc and switch to HOME=/root when sudo is executed;
this works in environments both with and without always_set_home specified.

Fixes #3366

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1523926024-3937-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit ace44784e8)
2018-04-17 09:39:15 +03:00
Avi Kivity
aaba093371 Update seastar submodule
* seastar af1b789...0e6dcd5 (1):
  > tls: Ensure we always pass through semaphores on shutdown

Fixes #3358.
2018-04-14 20:52:02 +03:00
Gleb Natapov
a64c6e6be9 cql_server: fix a race between closing of a connection and notifier registration
There is a race between cql connection closure and notifier
registration. If a connection is closed before notification registration
is complete stale pointer to the connection will remain in notification
list since attempt to unregister the connection will happen to early.
The fix is to move notifier unregisteration after connection's gate
is closed which will ensure that there is no outstanding registration
request. But this means that now a connection with closed gate can be in
notifier list, so with_gate() may throw and abort a notifier loop. Fix
that by replacing with_gate() by call to is_closed();

Fixes: #3355
Tests: unit(release)

Message-Id: <20180412134744.GB22593@scylladb.com>
(cherry picked from commit 1a9aaece3e)
2018-04-12 16:57:18 +03:00
Duarte Nunes
c83d2d0d77 db/view: Reject view entries with non-composite, empty partition key
Empty partition keys are not supported on normal tables - they cannot
be inserted or queried (surprisingly, the rules for composite
partition keys are different: all components are then allowed to be
empty). However, the (non-composite) partition key of a view could end
up being empty if that column is: a base table regular column, a
base table clustering key column, or a base table partition key column
that is part of a composite key.

Fixes #3262
Refs CASSANDRA-14345

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20180403122244.10626-1-duarte@scylladb.com>
(cherry picked from commit ec8960df45)
2018-04-03 19:08:38 +03:00
Asias He
0aa49d0311 gossip: Relax generation max difference check
start node 1 2 3
shutdown node2
shutdown node1 and node3
start node1 and node3
nodetool removenode node2
clean up all scylla data on node2
bootstrap node2 as a new node

I saw node2 could not bootstrap; it was stuck forever waiting for schema information to complete:

On node1, node3

    [shard 0] gossip - received an invalid gossip generation for peer 127.0.0.2; local generation = 2, received generation = 1521779704

On node2

    [shard 0] storage_service - JOINING: waiting for schema information to complete

This is because, in the nodetool removenode operation, the generation of node2 was increased from 0 to 2.

   gossiper::advertise_removing () calls eps.get_heart_beat_state().force_newer_generation_unsafe();
   gossiper::advertise_token_removed() calls eps.get_heart_beat_state().force_newer_generation_unsafe();

Each force_newer_generation_unsafe increases the generation by 1.

Here is an example,

Before nodetool removenode:
```
curl -X GET --header "Accept: application/json" "http://127.0.0.1:10000/failure_detector/endpoints/" | python -mjson.tool
   {
   "addrs": "127.0.0.2",
   "generation": 0,
   "is_alive": false,
   "update_time": 1521778757334,
   "version": 0
   },
```

After nodetool removenode:
```
curl -X GET --header "Accept: application/json" "http://127.0.0.1:10000/failure_detector/endpoints/" | python -mjson.tool
 {
     "addrs": "127.0.0.2",
     "application_state": [
         {
             "application_state": 0,
             "value": "removed,146b52d5-dc94-4e35-b7d4-4f64be0d2672,1522038476246",
             "version": 214
         },
         {
             "application_state": 6,
             "value": "REMOVER,14ecc9b0-4b88-4ff3-9c96-38505fb4968a",
             "version": 153
            }
     ],
     "generation": 2,
     "is_alive": false,
     "update_time": 1521779276246,
     "version": 0
 },
```

In gossiper::apply_state_locally, we have this check:

```
if (local_generation != 0 && remote_generation > local_generation + MAX_GENERATION_DIFFERENCE) {
    // assume some peer has corrupted memory and is broadcasting an unbelievable generation about another peer (or itself)
    logger.warn("received an invalid gossip generation for peer {}; local generation = {}, received generation = {}", ep, local_generation, remote_generation);
}
```
to skip the gossip update.

To fix, we relax generation max difference check to allow the generation
of a removed node.

After this patch, the removed node bootstraps successfully.

Tests: dtest:update_cluster_layout_tests.py
Fixes #3331

Message-Id: <678fb60f6b370d3ca050c768f705a8f2fd4b1287.1522289822.git.asias@scylladb.com>
(cherry picked from commit f539e993d3)
2018-04-03 19:08:38 +03:00
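The check quoted above and the described relaxation can be sketched as follows. This is a minimal, hedged reconstruction: the value of MAX_GENERATION_DIFFERENCE and the exact relaxation criterion are assumptions, and the real logic lives in gossiper::apply_state_locally(). The idea is that a tiny locally-forced generation (such as 2, left behind by two force_newer_generation_unsafe() calls) should not cause a real epoch-based generation from the re-bootstrapped node to be rejected.

```cpp
#include <cassert>
#include <cstdint>

// Assumed bound: roughly one year of epoch seconds.
constexpr int64_t MAX_GENERATION_DIFFERENCE = 86400LL * 365;

// Before the fix: a small forced local generation rejects a real
// epoch-based remote generation.
bool accept_strict(int64_t local, int64_t remote) {
    return !(local != 0 && remote > local + MAX_GENERATION_DIFFERENCE);
}

// After the fix (sketch): also accept when the local generation is far
// too small to be an epoch timestamp, i.e. it was forced locally by
// removenode rather than advertised by the peer itself.
bool accept_relaxed(int64_t local, int64_t remote) {
    if (local < MAX_GENERATION_DIFFERENCE) {
        return true;  // locally forced value such as 2; trust the remote
    }
    return accept_strict(local, remote);
}
```

With the generations from the example above, accept_strict(2, 1521779704) rejects the update while accept_relaxed(2, 1521779704) lets the removed-then-rebootstrapped node in.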
Shlomi Livne
cce455b1f5 release: prepare for 2.1.1
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2018-03-25 09:32:02 +03:00
Avi Kivity
6772f3806b tests: mutation_source_test: fix scattering of partition tombstone
The partition tombstone is not part of a mutation_fragment in the old
streamed_mutation, so it was not scattered correctly by fragment_scatterer.
This causes test failures if the mutations to be scattered have a partition
tombstone.

Fix by calling consume(tombstone) directly. This isn't nice, but the code
is dead anyway.
2018-03-24 15:15:02 +03:00
Avi Kivity
6c9d699835 Merge "Fix abort during counter table read-on-delete" from Tomasz
"
This fixes an abort in an sstable reader when querying a partition with no
clustering ranges (which happens on a counter table mutation with no live rows) that
also doesn't have any static columns. In such a case, the
sstable_mutation_reader will set up the data_consume_context such that it only
covers the static row of the partition, knowing that there is no need to read
any clustered rows. See partition.cc::advance_to_upper_bound(). Later when
the reader is done with the range for the static row, it will try to skip to
the first clustering range (missing in this case). If clustering_ranges_walker
tells us to skip to after_all_clustering_rows(), we will hit an assert inside
continuous_data_consumer::fast_forward_to() due to an attempt to skip past the
original data file range. If clustering_ranges_walker returns
before_all_clustering_rows() instead, all is fine because we're still at the
same data file position.

Fixes #3304.
"

* 'tgrabiec/fix-counter-read-no-static-columns' of github.com:scylladb/seastar-dev:
  tests: mutation_source_test: Test reads with no clustering ranges and no static columns
  tests: simple_schema: Allow creating schema with no static column
  clustering_ranges_walker: Stop after static row in case no clustering ranges

(cherry picked from commit 054854839a)
2018-03-23 10:47:23 +03:00
Vlad Zolotarov
a75e1632c8 test.py: limit the tests to run on 2 shards with 4GB of memory
Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
(cherry picked from commit 57a6ed5aaa)
2018-03-22 12:45:25 +02:00
Jesse Haber-Kucharsky
c5718bf620 auth: Fix improper sharing of sharded service
This change is backported from 092f2e659c.

Previously, the sharded permissions cache was only accessible to the
implementation of `auth::service` in `auth/service.cc`. The intention
was that invoking `auth::service::get_permissions` on shard `k` would
query the cache on shard `k`, which would in turn depend on
`auth::service` on shard k to check for superuser status.

The problem is in `auth::service::start`.

`seastar::sharded<auth::permissions_cache>::start` is invoked with
`*this` of shard 0, causing all instances of the cache to reference the
same object.

I wasn't able to locally reproduce errors or crashes due to this bug
when I compiled a release build of Scylla. However, running a debug
build meant that the glorious `seastar::debug_shared_ptr_counter_type`
quickly saved the day with its checks that `seastar::shared_ptr` isn't
being misused.

To eliminate this problem, we move ownership of a single instance of
`auth::permissions_cache` to a single instance of `auth::service`. When
`auth::service` is sharded, so is the permissions cache.

I verified interactively that no assertions failed in debug mode with
this change.

Fixes #3296.

Tests: unit (debug, release)
Signed-off-by: Jesse Haber-Kucharsky <jhaberku@scylladb.com>
Message-Id: <280a889f551180db1c00d8a80eddf85b2ff0ac60.1521696176.git.jhaberku@scylladb.com>
2018-03-22 10:04:50 +02:00
Duarte Nunes
2315fcd6cf gms/gossiper: Synchronize endpoint state destruction
In gossiper::handle_major_state_change() we set the endpoint_state for
a particular endpoint and replicate the changes to other cores.

This is totally unsynchronized with the execution of
gossiper::evict_from_membership(), which can happen concurrently, and
can remove the very same endpoint from the map (in all cores).

Replicating the changes to other cores in handle_major_state_change()
can interleave with replicating the changes to other cores in
evict_from_membership(), and result in an undefined final state.

Another issue happened in debug mode dtests, where a fiber executes
handle_major_state_change(), calls into the subscribers, of which
storage_service is one, and ultimately lands on
storage_service::update_peer_info(), which iterates over the
endpoint's application state with deferring points in between (to
update a system table). gossiper::evict_from_membership() was executed
concurrently by another fiber, which freed the state the first one is
iterating over.

Fixes #3299.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20180318123211.3366-1-duarte@scylladb.com>
(cherry picked from commit 810db425a5)
2018-03-18 14:55:32 +02:00
Asias He
8c5464d2fd range_streamer: Stream 10% of ranges instead of 10 ranges per time
If there are a lot of ranges, e.g., num_tokens=2048, 10 ranges per
stream plan will cause tons of stream plans to be created to stream the data,
each carrying very little data. This gives each stream plan low transfer
bandwidth, so the total time to complete the streaming increases.

It makes more sense to send a percentage of the total ranges per stream
plan than a fixed number of ranges.

Here is an example of streaming a keyspace with 513 ranges in
total, 10 ranges vs. 10% of ranges:

Before:
[shard 0] range_streamer - Bootstrap with 127.0.0.1 for
keyspace=system_traces, 510 out of 513 ranges: ranges = 51
[shard 0] range_streamer - Bootstrap with ks for keyspace=127.0.0.1
succeeded, took 107 seconds

After:
[shard 0] range_streamer - Bootstrap with 127.0.0.1 for
keyspace=system_traces, 510 out of 513 ranges: ranges = 10
[shard 0] range_streamer - Bootstrap with ks for keyspace=127.0.0.1
succeeded, took 22 seconds

Message-Id: <a890b84fbac0f3c3cc4021e30dbf4cdf135b93ea.1520992228.git.asias@scylladb.com>
(cherry picked from commit 9b5585ebd5)
2018-03-14 10:13:01 +02:00
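The sizing change above boils down to one small computation. The function name and the lower bound of 1 below are illustrative assumptions, not the actual range_streamer code:

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>

// Stream 10% of the total ranges per stream plan instead of a fixed 10,
// so large keyspaces produce a bounded number of plans.
size_t ranges_per_stream_plan(size_t total_ranges) {
    return std::max<size_t>(1, total_ranges / 10);
}
```

For the 513-range keyspace in the example, this yields 51 ranges per plan (roughly 10 plans) instead of 10 ranges per plan (roughly 51 plans).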
Asias He
346d2788e3 Revert "streaming: Do not abort session too early in idle detection"
This reverts commit f792c78c96.

With the "Use range_streamer everywhere" (7217b7ab36) series,
all the users of streaming now do streaming with relatively small ranges
and can retry streaming at a higher level.

Reduce the time-to-recover from 5 hours to 10 minutes per stream session.

Even if the 10-minute idle detection might cause more false positives,
it is fine, since we can retry the "small" stream session anyway. In the
long term, we should replace the whole idle detection logic with the rule that
whenever the stream initiator goes away, the stream slave goes away.

Message-Id: <75f308baf25a520d42d884c7ef36f1aecb8a64b0.1520992219.git.asias@scylladb.com>
(cherry picked from commit ad7b132188)
2018-03-14 10:12:59 +02:00
Avi Kivity
4f68fede6d Merge "Make reader concurrency dual-restricted by count and memory" from Botond
"
Refs #2692
Fixes #3246

The current restricting algorithm [1] restricts the active-reader queue
based on the memory consumption of the existing active readers. When
this memory consumption is above the limit new readers are not admitted.
The inactive reader queue on the other hand has a fixed length.
This caused performance regressions on two workloads:
* read-only: since the inactive-reader queue length is severely limited
  (compared to the previous situation) reads will time out at loads
  comfortably handled before.
* mixed: since the memory consumption is checked only at admission time
  (already created active readers are not limited) memory consumption
  grew significantly, causing problems when compactions kicked in.

The solution is to reintroduce the old limit of 100 active concurrent
user-reads while still keeping the memory-based limit as well. For
workloads that don't consume a lot of memory or on large boxes with lots
of memory the count-based limit will be reached which is reverting to the
old well-known behaviour. For memory-hungry workloads or on small boxes
with little memory the memory based-limit will kick in sooner avoiding
memory overconsumption.

[1] introduced by bdbbfe9390
"

* 'restricted-reader-dual-limit/v3-backport-2.1' of https://github.com/denesb/scylla:
  Modify unit tests so that they test the dual-limits
  Use the reader_concurrency_semaphore to limit reader concurrency
  Add reader_concurrency_semaphore
  Add reader_resource_tracker param to mutation_source
  mv reader_resource_tracker.hh -> reader_concurrency_semaphore.hh
2018-03-08 19:10:06 +02:00
Botond Dénes
681f9e4f50 Modify unit tests so that they test the dual-limits 2018-03-08 18:54:16 +02:00
Botond Dénes
c503bc7693 Use the reader_concurrency_semaphore to limit reader concurrency 2018-03-08 18:54:15 +02:00
Botond Dénes
de7024251b Add reader_concurrency_semaphore
This semaphore implements the new dual, count and memory based active
reader limiting. As purely memory-based limiting proved to cause
problems on big boxes admitting a large number of readers (more than any
disk could handle) the previous count-based limit is reintroduced in
addition to the existing memory-based limit.
When creating new readers, first the count-based limit is checked. If
that clears, the memory limit is checked before admitting the reader.
reader_concurrency_semaphore wraps the two semaphores that implement
these limits and enforces the correct order of limit checking.
This class also completely replaces the restricted_reader_config struct;
it encapsulates all data and related functionality of the latter, making
client code simpler.
2018-03-08 18:54:15 +02:00
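The dual-limit admission order described above can be sketched with a minimal, single-threaded stand-in. This is not the real reader_concurrency_semaphore (which is asynchronous and queues waiters rather than failing); the class and method names are illustrative. The point it shows is the ordering: the count-based limit is checked before the memory-based limit, and resources are taken only when both clear.

```cpp
#include <cassert>
#include <cstddef>

class dual_semaphore {
    int _count;       // remaining reader slots
    size_t _memory;   // remaining memory budget in bytes
public:
    dual_semaphore(int count, size_t memory) : _count(count), _memory(memory) {}

    bool try_admit(size_t mem) {
        if (_count == 0) {
            return false;   // count-based limit reached
        }
        if (mem > _memory) {
            return false;   // memory-based limit reached
        }
        --_count;
        _memory -= mem;
        return true;
    }

    void release(int units, size_t mem) {
        _count += units;
        _memory += mem;
    }
};
```

On large boxes the count limit trips first (the old behaviour); on small boxes or with memory-hungry reads the memory limit trips first, preventing overconsumption.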
Botond Dénes
9a0eb2319c Add reader_resource_tracker param to mutation_source
Soon, reader_resource_tracker will only be constructible after the
reader has been admitted. This means that the resource tracker cannot be
preconstructed and just captured by the lambda stored in the mutation
source and instead has to be passed in along the other parameters.
2018-03-08 18:54:12 +02:00
Botond Dénes
9ef462449b mv reader_resource_tracker.hh -> reader_concurrency_semaphore.hh
In preparation to reader_concurrency_semaphore being added to the file.
The reader_resource_tracker is really only a helper class for
reader_concurrency_semaphore so the latter is better suited to provide
the name of the file.
2018-03-08 15:34:48 +02:00
Amnon Heiman
6271f30716 dist/docker: Add support for housekeeping
This patch takes a modified version of the Ubuntu 14.04 housekeeping
service script and uses it in Docker to validate the current version.

To disable the version validation, pass the --disable-version-check flag
when running the container.

Message-Id: <20180220161231.1630-1-amnon@scylladb.com>
(cherry picked from commit edcfab3262)
2018-03-07 16:17:13 +02:00
Takuya ASADA
8b64e80c88 dist/debian: install scylla-housekeeping upstart script correctly on Ubuntu 14.04
Since we split the scylla-housekeeping service into two different services for systemd, we no longer share the same service name between systemd and upstart.
So handle it independently for each distribution; try to install
/etc/init/scylla-housekeeping.conf on Ubuntu 14.04.

Fixes #3239

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1519852659-10688-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 101e909483)
2018-03-07 16:16:36 +02:00
Amnon Heiman
c5bffcaa68 scylla-housekeeping: need to support both debian/ubuntu variations
Debian and Ubuntu list files come in two variations.
The housekeeping should support both.

This patch changes the regexp that matches the OS in the repository file.
After the introduction of the second list variation, the OS name can be in the middle of the path, not only at the end.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <20180227092543.19538-1-amnon@scylladb.com>
(cherry picked from commit 57d46c6959)
2018-03-07 16:15:54 +02:00
Tomasz Grabiec
8aa0b60e91 tests: cache: Fix invalidate() not being waited for
Probably responsible for occasional failures of a subsequent assertion.
Didn't manage to reproduce.

Message-Id: <1520330967-584-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit d9f0c1f097)
2018-03-06 12:17:16 +02:00
Asias He
dccf762654 storage_service: Add missing return in pieces empty check
If pieces is empty, it is bogus to access pieces[0]:

   sstring move_name = pieces[0];

Fix by adding the missing return.

Spotted by Vlad Zolotarov <vladz@scylladb.com>

Fixes #3258
Message-Id: <bcb446f34f953bc51c3704d06630b53fda82e8d2.1520297558.git.asias@scylladb.com>

(cherry picked from commit 8900e830a3)
2018-03-06 09:58:21 +02:00
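The guard described above is a one-line early return. The sketch below is illustrative (the function name and return convention are assumptions, not the storage_service code):

```cpp
#include <cassert>
#include <string>
#include <vector>

// Return the first piece, or an empty string when there is none,
// instead of falling through to an out-of-bounds pieces[0] access.
std::string first_piece_or_empty(const std::vector<std::string>& pieces) {
    if (pieces.empty()) {
        return {};  // the missing return: bail out before indexing
    }
    return pieces[0];
}
```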
Tomasz Grabiec
e5344079d9 intrusive_set_external_comparator: Fix _header having undefined color on move
swap_tree() doesn't change the color of the header, and because the header
was not initialized, its color is undefined (it can be either red or black). One
problem this causes is that algo::is_header() expects the header to
always be red. It is used by unlink(), which would infinite-loop for
trees that have a black header.

The fix is to initialize the header.

Fixes #3242.

Message-Id: <1519815091-13111-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 30635510a2)
2018-02-28 13:57:33 +02:00
Paweł Dziepak
7bc8515c48 tests/cql3: increase TTL to avoid spurious failures
The test inserts some values with a TTL of 1 second and then
reads them back, expecting them not to be expired yet. That may not
always be the case if the machine is slow and we are running in
debug mode. Increasing the TTLs by 100x should help avoid these
false positives.

Message-Id: <20180219133816.17452-1-pdziepak@scylladb.com>
(cherry picked from commit d97eebe82d)
2018-02-22 14:14:41 +00:00
Duarte Nunes
1228a41eaa cql3/query_processor: Remove prepared statements upon dropping a view
Fixes #3198

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20180209143652.31852-1-duarte@scylladb.com>
(cherry picked from commit d757c87107)
2018-02-22 14:11:08 +00:00
Tomasz Grabiec
58b90ceee0 tests: row_cache: Improve test for snapshot consistency on eviction
Reproduces https://github.com/scylladb/scylla/issues/3215.
Message-Id: <1518710592-21925-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 9c3e56fb16)
2018-02-16 11:42:33 +01:00
Tomasz Grabiec
ef46067606 mvcc: Do not move unevictable snapshots to cache
Commit 6ccd317 introduced a bug in partition_entry::evict() where a
partition entry may be partially evicted if there are non-evictable
snapshots in it. Partially evicting some of the versions may violate
the consistency of a snapshot which includes evicted versions. For one,
continuity flags are interpreted relative to the merged view, not
within a version, so evicting from some of the versions may mark
ranges as continuous when before they were discontinuous. Also, range
tombstones of the snapshot are taken from all versions, so we can't
partially evict some of them without marking all affected ranges as
discontinuous.

The fix is to revert back to full eviction, and to avoid moving
non-evictable snapshots to cache. When moving a whole partition entry to
cache, we first create a neutral empty partition entry and then merge
the memtable entry into it just as we would if the entry already
existed.

Fixes #3215.

Tests: unit (release)
Message-Id: <1518710592-21925-2-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit b0b57b8143)
2018-02-16 11:26:13 +01:00
Shlomi Livne
ffdd0f6392 release: prepare for 2.1.0
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2018-02-14 15:17:43 +02:00
Paweł Dziepak
3ab1c8abff cql3/select_statement: do not capture stack variables by reference
Default capture by reference considered harmful in async code.

(cherry picked from commit b635fec9bf)
2018-02-08 17:54:00 +02:00
Amnon Heiman
d306c40507 database: correct the label creation for database reads
The labels in the database active_reads metrics were not defined correctly.

Labels should be created so that it is possible to select based on their
value.

The current implementation defines a label "class" with three instances:
user, streaming, system.
Fixes: #2770

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <20180123125206.23660-1-amnon@scylladb.com>
(cherry picked from commit a0a1961b6d)
2018-02-08 15:20:55 +02:00
Paweł Dziepak
b98d5b30de Merge "Do not evict from memtable snapshots" from Tomasz
"When moving whole partition entries from memtable to cache, we move
snapshots as well. It is incorrect to evict from such snapshots
though, because associated readers would miss data.

Solution is to record evictability of partition version references (snapshots)
and avoiding eviction from non-evictable snapshots.

Could affect scanning reads, if the reader uses partition entry from
memtable, and the partition is too large to fit in reader's buffer,
and that entry gets moved to cache (was absent in cache), and then
gets evicted (memory pressure). The reader will not see the remainder
of that entry. Found during code review.

Introduced in ca8e3c4, so affects 2.1+

Fixes #3186.

Tests: unit (release)"

* 'tgrabiec/do-not-evict-memtable-snapshots' of github.com:tgrabiec/scylla:
  tests: mvcc: Add test for eviction with non-evictable snapshots
  mutation_partition: Define + operator on tombstones
  tests: mvcc: Check that partition is fully discontinuous after eviction
  tests: row_cache: Add test for memtable readers surviving flush and eviction
  memtable: Make printable
  mvcc: Take partition_entry by const ref in operator<<()
  mvcc: Do not evict from non-evictable snapshots
  mvcc: Drop unnecessary assignment to partition_snapshot::_version
  tests: Use partition_entry::make_evictable() where appropriate
  mvcc: Encapsulate construction of evictable entries

(cherry picked from commit 6ccd317c38)
2018-02-06 19:29:56 +01:00
Tomasz Grabiec
85f5e57502 tests: Introduce mutation_partition_assertions
mutation_assertions are now delegating to mutation_partition_assertions.

(cherry picked from commit c7539f2ed0)
2018-02-06 19:29:56 +01:00
Tomasz Grabiec
19158f3401 mutation_partition: Make check_continuity() const-qualified
(cherry picked from commit bde050835f)
2018-02-06 19:29:56 +01:00
Tomasz Grabiec
a7e40d6acb mutation_partition: Make check_continuity() public
(cherry picked from commit f9257886cb)
2018-02-06 19:29:56 +01:00
Tomasz Grabiec
eedcfedd5a mutation_partition: Extract sliced() from mutation into mutation_partition
So that we can call it on mutation_partition.

(cherry picked from commit b3709047b0)
2018-02-06 19:29:56 +01:00
Tomasz Grabiec
b655fe262b mvcc: Add const-qualified partition_version_ref::operator*()
(cherry picked from commit a6e083ef6f)
2018-02-06 19:29:56 +01:00
Shlomi Livne
cbb3b959e3 release: prepare for 2.1.rc3
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2018-02-06 12:12:31 +02:00
Raphael S. Carvalho
3dd282f7f0 sstables/compress: Fix race condition in segmented offset reading of shared sstable
The race condition was introduced by commit 028c7a0888, which introduces chunk offset
compression: a reading state is kept in the compress structure, which is
supposed to be immutable and can be shared among shards owning the same sstable.

So it may happen that shard A updates state while shard B relies on information
previously set, which leads to incorrect decompression, which in turn leads to
reads misbehaving.

We could serialize access to at(), but that would only lead to contention issues for
shared sstables; this can be avoided by moving the state out of the compress structure,
which is expected to be immutable after the sstable is loaded and fed to the shards that
own it. A sequential accessor (wrapping the state and a reference to segmented_offset) is
added to prevent the at() and push_back() interfaces from being polluted.

Tests: release mode.

Fixes #3148.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20180205192432.23405-1-raphaelsc@scylladb.com>
(cherry picked from commit 09f4ee808f)
2018-02-06 12:10:29 +02:00
Tomasz Grabiec
574548e50f Merge 'Fixes for exception safety in memtable range reads' from Paweł
These patches deal with the remaining exception safety issues in the
memtable partition range readers. That includes moving the assignment
to iterator_reader::_last outside of the allocating section to avoid
problems caused by an exception-unsafe assignment operator. Memory
accounting code is also moved out of the retryable context to improve
the code's robustness and avoid potential problems in the future.
Fixes #3172.

 * https://github.com/pdziepak/scylla.git memtable-range-read-exception-safety-2.1/v1:
  memtable: do not update iterator_reader::_last in alloc section
  memtable: do not change accounting state in alloc section
  tests/memtable: add more reader exception safety tests
2018-02-05 20:51:26 +01:00
Paweł Dziepak
688d58f54a tests/memtable: add more reader exception safety tests 2018-02-05 15:11:55 +00:00
Paweł Dziepak
ea9b0bb4b0 memtable: do not change accounting state in alloc section
Allocating sections can be retried, so code that has side effects (like
updating flushed-bytes accounting) has no place there.
2018-02-05 15:11:54 +00:00
Paweł Dziepak
6a9b026601 memtable: do not update iterator_reader::_last in alloc section
iterator_reader::_last is part of the state that survives allocating-section
retries; therefore, it should not be modified in the retryable context.
2018-02-05 15:11:53 +00:00
Amnon Heiman
adc1523aaa scylla_setup support private repo on debian during setup
Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <20170917145248.19677-1-amnon@scylladb.com>
(cherry picked from commit bc356a3c15)
2018-02-01 15:58:00 +02:00
Tomasz Grabiec
5444eead08 Merge "Make memtable reads exception safe" from Paweł
These patches change the memtable reader implementation (in particular
partition_snapshot_reader) so that the existing exception safety
problems are fixed, but also in a way that, hopefully, makes it
easier to reason about the error handling and avoids future bugs in that
area.

The main difficulty related to exception safety is that when an
exception is thrown out of an allocating section, the code is run again
with an increased memory reserve. If the retryable code has side effects,
it is very easy to get incorrect behaviour.

In addition to that, entering an allocating section is not exactly cheap
which encourages doing so rarely and having large sections.

The approach taken by this series is to, first, make entering allocating
sections cheaper and, second, reduce the amount of logic that runs inside
them to a minimum.

This means that instead of entering a section once per call to
flat_mutation_reader::fill_buffer(), the allocating section is entered
once for each emitted row. The only state modified from within the
section is the cached iterators to the current row, which are dropped on
retry. Hopefully, this makes the reader code easier to reason about.

The optimisations to the allocating sections and the managed_bytes
linearisation context have successfully eliminated any penalty caused by
the much finer-grained allocating sections.

Fixes #3123.
Fixes #3133.

Tests: unit-tests (release)

BEFORE
test                                      iterations      median         mad         min         max
memtable.one_partition_one_row               1155362   869.139ns     0.282ns   868.465ns   873.253ns
memtable.one_partition_many_rows              127252     7.871us    15.252ns     7.851us     7.886us
memtable.many_partitions_one_row               58715    17.109us     2.765ns    17.013us    17.112us
memtable.many_partitions_many_rows              4839   206.717us   212.385ns   206.505us   207.448us

AFTER
test                                      iterations      median         mad         min         max
memtable.one_partition_one_row               1194453   839.223ns     0.503ns   834.952ns   842.841ns
memtable.one_partition_many_rows              133785     7.477us     4.492ns     7.473us     7.507us
memtable.many_partitions_one_row               60267    16.680us    18.027ns    16.592us    16.700us
memtable.many_partitions_many_rows              4975   201.048us   144.929ns   200.822us   201.699us

        ./before_sq  ./after_sq  diff
 read     337373.86   353694.24  4.8%
 write    388759.99   394135.78  1.4%

* https://github.com/pdziepak/scylla.git memtable-exception-safety-2.1/v1:
  flat_mutation_reader: add allocation point in push_mutation_fragment
  linearization_context: remove non-trivial operations from fast path
  lsa: split alloc section into reserving and reclamation-disabled parts
  lsa: optimise disabling reclamation and invalidation counter
  mutation_fragment: allow creating clustering row in place
  paratition_snapshot_reader: minimise amount of retryable code
  memtable: drop memtable_entry::read()
  tests/memtable: add test for reader exception safety
2018-02-01 10:54:35 +01:00
Paweł Dziepak
1e74362ec9 tests/memtable: add test for reader exception safety 2018-02-01 10:54:34 +01:00
Paweł Dziepak
72e52dafba memtable: drop memtable_entry::read() 2018-02-01 10:54:34 +01:00
Paweł Dziepak
29746e1e7b paratition_snapshot_reader: minimise amount of retryable code
Retryable code that has side effects is a recipe for bugs. This patch
reworks the snapshot reader so that the amount of logic run with
reclamation disabled is minimal and has very limited side effects.
2018-02-01 10:54:34 +01:00
Paweł Dziepak
13cd56774f mutation_fragment: allow creating clustering row in place
Moving a clustering_row is expensive due to the amount of data stored
internally. Adding a mutation_fragment constructor that builds a
clustering_row in place saves some of that moving.
2018-02-01 10:54:34 +01:00
Paweł Dziepak
812018479b lsa: optimise disabling reclamation and invalidation counter
Most of the lsa gory details are hidden in utils/logalloc.cc. That
includes the actual implementation of a lsa region: region_impl.

However, there is code in the hot path that often accesses the
_reclaiming_enabled member as well as its base class
allocation_strategy.

In order to optimise those accesses another class is introduced:
basic_region_impl that inherits from allocation_strategy and is a base
of region_impl. It is defined in utils/logalloc.hh so that it is
publicly visible and its member functions are inlineable from anywhere
in the code. This class is supposed to be as small as possible, but
contain all members and functions that are accessed from the fast path
and should be inlined.
2018-02-01 10:54:34 +01:00
Paweł Dziepak
0ee2462811 lsa: split alloc section into reserving and reclamation-disabled parts
An allocating section reserves a certain amount of memory, then disables
reclamation and attempts to perform the given operation. If that fails due
to std::bad_alloc, the reserve is increased and the operation is retried.

Reserving memory is expensive, while just disabling reclamation isn't.
Moreover, the code that runs inside the section needs to be safely
retryable. This means we want the amount of logic running with
reclamation disabled to be as small as possible, even if it means entering
and leaving the section multiple times.

In order to reduce the performance penalty of such a solution, the
memory-reserving and reclamation-disabling parts of the allocating
sections are separated.
2018-02-01 10:54:34 +01:00
Paweł Dziepak
c8bc3a7053 linearization_context: remove non-trivial operations from fast path
Since linearization_context is thread_local, every time it is accessed
the compiler needs to emit code that checks whether it was already
constructed and constructs it if it wasn't. Moreover, upon leaving the
context from the outermost scope, the map needs to be cleared.

All these operations impose some performance overhead and aren't really
necessary if no buffers were linearised (the expected case). This patch
rearranges the code so that linearization_context is trivially
constructible and the map is cleared only if it was modified.
2018-02-01 10:54:34 +01:00
Paweł Dziepak
9f78799e80 flat_mutation_reader: add allocation point in push_mutation_fragment
Exception safety tests inject a failure at every allocation and verify
whether the error is handled properly.

push_mutation_fragment() adds a mutation fragment to a circular_buffer.
In theory any call to that function can result in a memory allocation,
but in practice that depends on implementation details. In order to
improve the effectiveness of the exception safety tests, this patch adds
an explicit allocation point in push_mutation_fragment().
2018-02-01 10:54:33 +01:00
Calle Wilund
5bba3856ca auth: Fix transitional auth for non-valid credentials
Fixes #3096

The credentials processing for transitional auth was broken
in ba6a41d ("auth: Switch to sharded service"), which effectively removed
the "virtualization" of underlying auth in the SASL challenge.

As a quick workaround, add the permissive exception handling to the
sasl object as well.

Message-Id: <20180103102724.1083-1-calle@scylladb.com>
(cherry picked from commit 35b9ec868a)
2018-02-01 11:36:46 +02:00
Avi Kivity
63e92418dd Update seastar submodule
* seastar 8d254a1...af1b789 (3):
  > tls_test: Fix echo test not setting server trust store
  > tls: Do not restrict re-handshake to client
  > tls: Actually verify client certificate if requested

Fixes #3072
2018-01-28 13:59:04 +02:00
Paweł Dziepak
9eaa6f233e Update scylla-ami submodule
* scylla-ami 3366c93...c5d9e96 (1):
  > Update Amazon kernel packages release stream to 2017.09
2018-01-24 13:27:52 +00:00
Raphael S. Carvalho
6600317b2c sstables: fix wildly inaccurate sstable key estimation after dynamic index sampling
The reason the sstable key estimation is inaccurate is that it doesn't
account for the fact that index sampling is now dynamic.

The estimation is done as follows:
    uint64_t get_estimated_key_count() const {
        return ((uint64_t)_components->summary.header.size_at_full_sampling + 1) *
                _components->summary.header.min_index_interval;
    }

The biggest problem is that _components->summary.header.min_index_interval isn't
actually the minimum interval, but rather the default interval value set in the
schema. So the estimation gets worse the larger the average partition is,
because the larger the average partition, the lower the index sampling interval.
One of the consequences is that the estimation has a big influence on bloom
filter size, so for large partitions we were generating bigger filters than we
had to.

From now on, size at full sampling is calculated as if sampling were static
(which was the case until commit 8726ee937d, which introduced size-based
sampling), using the minimum index interval as a strict sampling interval.

Tests: units (release)

Fixes #3113.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20180122233612.11147-1-raphaelsc@scylladb.com>
(cherry picked from commit 2c181b69c9)
2018-01-24 11:42:28 +02:00
Vladimir Krivopalov
807acb2dd9 main: Fix warnings when running "scylla --version"
Print Scylla version, if requested, before running Seastar application.

Fixes #3124

Signed-off-by: Vladimir Krivopalov <vladimir@scylladb.com>
Message-Id: <bbd0f303f612327446ce1f10ebd17ebed8d76048.1516144651.git.vladimir@scylladb.com>
(cherry picked from commit 73b6e9fbb1)
2018-01-17 16:59:28 +02:00
Takuya ASADA
5e44bf97f0 dist/debian: follow gcc-7.2 package naming changes on 3rdparty repo for Debian 9
Switch to renamed gcc-7.2 package on Debian 9, too.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1516191853-2562-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit f3c8574135)
2018-01-17 14:38:55 +02:00
Takuya ASADA
4003be40b3 dist/debian: fix package name typo on Debian 8
Correct package name is scylla-gcc72-g++-7, not scylla-g++-7.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1516189354-5880-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 15e266eea4)
2018-01-17 13:45:39 +02:00
Takuya ASADA
cf059b6ee2 dist/debian: follow renaming of gcc-7.2 packages on Ubuntu 14.04/16.04
We have now applied our scylla-$(pkg)$(ver)-style package naming to gcc-7.2,
so switch to it.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1516103292-26942-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 912a14eb9b)
2018-01-17 13:38:56 +02:00
Shlomi Livne
d96c31ee4d release: prepare for 2.1.rc2
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2018-01-16 16:22:56 +02:00
Avi Kivity
680ce234b0 Merge "Fix memory leak on zone reclaim" from Tomek
"_free_segments_in_zones is not adjusted by
segment_pool::reclaim_segments() for empty zones on reclaim under some
conditions. For instance, when some zone becomes empty due to a regular
free() and reclaiming is then called from the std allocator and is
satisfied from a zone after the empty one, the free memory in that zone
appears to be leaked due to a corrupted free-segment count, which may
cause a later reclaim to fail. This could result in bad_allocs.

The fix is to always collect such zones.

Fixes #3129
Refs #3119
Refs #3120"

* 'tgrabiec/fix-free_segments_in_zones-leak' of github.com:scylladb/seastar-dev:
  tests: lsa: Test _free_segments_in_zones is kept correct on reclaim
  lsa: Expose max_zone_segments for tests
  lsa: Expose tracker::non_lsa_used_space()
  lsa: Fix memory leak on zone reclaim

(cherry picked from commit 4ad212dc01)
2018-01-16 15:54:40 +02:00
Asias He
ad656b2c55 storage_service: Do not wait for restore_replica_count in handle_state_removing
The call chain is:

storage_service::on_change() -> storage_service::handle_state_removing()
-> storage_service::restore_replica_count() -> streamer->stream_async()

Listeners run as part of gossip message processing, which is serialized.
This means we won't be processing any gossip messages until streaming
completes.

In fact, there is no need to wait for restore_replica_count to complete,
which can take a long time: when it completes, this node sends a
notification to tell the removal_coordinator that the restore process is
finished on this node, and this node is then removed from
_replicating_nodes on the removal_coordinator.

Tested with update_cluster_layout_tests.py

Fixes #2886

Message-Id: <8b4fe637dfea6c56167ddde3ca86fefb8438ce96.1516088237.git.asias@scylladb.com>
(cherry picked from commit 5107b6ad16)
2018-01-16 11:37:55 +02:00
Tomasz Grabiec
43101b6bff database: Invalidate only affected ranges from flush_streaming_mutations()
Invalidating whole range causes larger latency spikes.

Regression from 2.0 introduced in d22fdf4261.

Refs #3119

Tests: units (release)

Message-Id: <1516046938-26855-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit b5d5bf5bc4)
2018-01-16 11:18:36 +02:00
Asias He
492a5c8886 storage_service: Set NORMAL status after token_metadata is replicated
Commit 2d5fb9d109 (gms/gossiper: Replicate changes incrementally to
other shards) changes the way we replicate _token_metadata and
endpoint_state_map. Before, they were replicated at the same time; now
they are not. As a result, a shard in NORMAL status can still have an
empty _token_metadata.

We saw errors:

   [shard 12] token_metadata - sorted_tokens is empty in first_token_index!

during CorruptThenRepairNemesis.

Fix by setting the gossip status to NORMAL only after _token_metadata has
been replicated, so that once a node is in NORMAL, we can do repair.
Commit 69c81bcc87 (repair: Do not allow repair until node is in NORMAL
status) prevents the early repair operation by checking whether a node is
in NORMAL status.

Fixes #3121

Message-Id: <af6a223733d2e11351f1fa35f59eacfa7d65dd30.1516065564.git.asias@scylladb.com>
(cherry picked from commit 3c8ed255ac)
2018-01-16 09:41:34 +02:00
Raphael S. Carvalho
152747b8fd mutation_reader: Fix use-after-move
Problem introduced in 375ed938b4

Also remove the redefinition of schema in the dummy incremental selector,
which is supposed to use the one in the base class instead.

Following tests are fixed:
    ./build/release/tests/mutation_reader_test
    ./build/release/tests/sstable_test -- -c1
    ./build/release/tests/row_cache_test
    ./build/release/tests/cache_flat_mutation_reader_test
    ./build/release/tests/row_cache_stress_test

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20180111153831.17462-1-raphaelsc@scylladb.com>
2018-01-11 17:43:41 +02:00
Takuya ASADA
00c08519a7 dist/debian: make pbuilder works on Debian 9
On Debian 9, 'pbuilder create' fails because of a missing GPG key for the
3rdparty repo, so we need --allow-untrusted on 'pbuilder create' and
'pbuilder update'.

Also, apt-key adv --fetch-keys does not work correctly on it, but we can use
"curl <URL> | apt-key add -" as a workaround.

Fixes #3088

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1513797714-18067-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit b68ee98310)
2018-01-11 15:03:49 +02:00
Takuya ASADA
5d47a39b7b dist/debian: follow renaming of gcc-7.2 packages on Debian 8
We have now applied our scylla-$(pkg)$(ver)-style package naming to gcc-7.2,
so switch to it.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1515522920-8266-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 420b61b466)
2018-01-11 15:03:47 +02:00
Takuya ASADA
4f8e8bdc04 dist/debian: rename boost1.63 to scylla-boost163 on Debian 8
We provided a "boost1.63" package for Debian 8 since we couldn't build the
"scylla-boost163" package which is available on Ubuntu 14/16, but I fixed the
problem and now we have it for Debian 8 too, so switch to it.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1514220163-25985-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 51013f561d)
2018-01-11 15:03:44 +02:00
Paweł Dziepak
ef1dab4565 combined_reader: optimise for disjoint partition streams
The legacy mutation_reader/streamed_mutation design made it very easy
to skip the partition merging logic when there was only one underlying
reader emitting a given partition.

That optimisation was lost in the conversion to flat mutation readers,
which impacted performance. This patch mostly recovers it by
bypassing most of the mutation_reader_merger logic when there is only a
single active reader for a given partition.

The performance regression was introduced in
8731c1bc66 "Flatten the implementation of
combined_mutation_reader".

perf_simple_query -c4 read results (medians of 60):

original regression
             before 8731c1     after 8731c1   diff
 read            326241.02        300244.09  -8.0%

this patch
                    before            after  diff
 read            313882.59        325148.05  3.6%
Message-Id: <20180103121019.764-1-pdziepak@scylladb.com>

(cherry picked from commit b4a4c04bab)
2018-01-11 10:33:31 +01:00
Tomasz Grabiec
3f602814ba mutation_reader: Move definition of combining mutation reader to source file
So that the whole world doesn't recompile when it changes.

(cherry picked from commit 60ed5d29c0)
2018-01-11 10:33:08 +01:00
Tomasz Grabiec
83d4e85e00 mutation_reader: Use make_combined_reader() to create combined reader
So that we can hide the definition of combined_mutation_reader. It's
also less verbose.

(cherry picked from commit 52285a9e73)
2018-01-11 10:33:06 +01:00
Asias He
857ffeefce streaming: Do not send failed message for uninitialized session
The uninitialized session has no peer associated with it yet. There is
no point sending the failed message when aborting the session. Sending
the failed message in this case sends it to a peer with an uninitialized
dst_cpu_id, which causes the receiver to pass a bogus shard id to
smp::submit_to, which causes a segfault.

In addition, to be safe, initialize dst_cpu_id to zero, so that an
uninitialized session will send messages to shard zero instead of a
random bogus shard id.

Fixes the segfault issue found by
repair_additional_test.py:RepairAdditionalTest.repair_abort_test

Fixes #3115
Message-Id: <9f0f7b44c7d6d8f5c60d6293ab2435dadc3496a9.1515380325.git.asias@scylladb.com>

(cherry picked from commit 774307b3a7)
2018-01-09 16:32:12 +02:00
Piotr Jastrzebski
a845e23702 Fix fast_forward_to(partition_range&) in forwardable flat reader.
Making sure fast_forward_to(const partition_range&) sets _current
correctly.

Fixes #3089

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Message-Id: <6c29cf273f191da0e21035bcbe1592042ecffc70.1515490058.git.piotr@scylladb.com>
(cherry picked from commit 945f45f490)
2018-01-09 14:57:53 +02:00
George Tavares
f9b14df3a3 db/view: Consume updated rows regardless of static row
With Materialized Views, if the base table has static columns and an
update to the base table mutates both static and non-static rows, the
streamed_mutation is stopped before processing the non-static row.
This patch avoids stopping the streamed_mutation and adds a test case.

Message-Id: <20171220173434.25091-1-tavares.george@gmail.com>
(cherry picked from commit ceecd542cd)
2018-01-08 15:39:57 +01:00
Raphael S. Carvalho
ae47dfde7d sstables: cure our blindness on sstable read failure
After 611774b, we're blind again as to which sstable caused a compaction
to fail, leaving us with a cryptic message such as:
compaction_manager - compaction failed: std::runtime_error (compressed
chunk failed checksum)

After this change, a read failure in either compaction or a regular read
will report the guilty sstable, e.g.:
compaction_manager - compaction failed: std::runtime_error (SSTable reader
found an exception when reading sstable ./data/.../keyspace1-standard1
ka-1-Data.db : std::runtime_error(compressed chunk failed checksum))

Fixes #3006.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20180102230752.14701-1-raphaelsc@scylladb.com>
(cherry picked from commit 4610e994e1)
2018-01-08 13:43:32 +02:00
Vladimir Krivopalov
cc15a13365 Use CharReaderBuilder/CharReader and StreamWriterBuilder from JsonCpp.
In version 1.8.3 of JsonCpp shipped with Fedora 27, old FastWriter and
Reader classes from JsonCpp have been deprecated in favour of
newer/better ones: CharReaderBuilder/CharReader and
StreamWriterBuilder/StreamWriter.
This fix uses the new classes where available or resorts to old ones for
older versions of the library.

Fixes #2989

Signed-off-by: Vladimir Krivopalov <vladimir@scylladb.com>
(cherry picked from commit 76775ddf26)
2018-01-07 14:48:54 +02:00
Avi Kivity
6e14dcb84c Merge "Fix potential infinite recursion in leveled compaction" from Raphael
'"The issue is triggered by compaction of sstables of level higher than 0.

The problem happens when the interval map of the partitioned sstable set
stores intervals such as the following:
[-9223362900961284625 : -3695961740249769322 ]
(-3695961740249769322 : -3695961103022958562 ]

When the selector is called for the first interval above, the exclusive
lower bound of the second interval is returned as the next token, but
the inclusiveness info is not returned.
So reader_selector reported that there *were* new readers when
the current token was -3695961740249769322, because that token was
stored in the selector's position field as inclusive while it is
actually exclusive.

This false positive led to infinite recursion in the combined
reader, because the sstable set's incremental selector itself knew that
there were actually *no* new readers, and therefore *no* progress
could be made.

Fixes #2908.'

* 'high_level_compaction_infinite_recursion_fix_v4' of github.com:raphaelsc/scylla:
  tests: test for infinite recursion bug when doing high-level compaction
  Fix potential infinite recursion when combining mutations for leveled compaction
  dht: make it easier to create ring_position_view from token
  dht: introduce is_min/max for ring_position

(cherry picked from commit 375ed938b4)
2018-01-07 14:47:18 +02:00
Pekka Enberg
9ed64cc11c dist/docker: Switch to Scylla 2.1 repository 2018-01-05 10:43:29 +02:00
Shlomi Livne
d4c46afc50 release: prepare for 2.1.rc1
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2018-01-03 10:48:35 +02:00
Paweł Dziepak
f371d17884 db/schema_tables: do not use moved from shared pointer
Shared pointer view is captured by two continuations, one of which is
moving it away. Using do_with() solves the problem.

Fixes #3092.
Message-Id: <20171221111614.16208-1-pdziepak@scylladb.com>

(cherry picked from commit 4dfddc97c7)
2017-12-21 15:13:53 +01:00
Tomasz Grabiec
0a82a885a4 Merge "Remove memtable::make_reader" from Piotr
Migrate all the places that used memtable::make_reader to use
memtable::make_flat_reader and remove memtable::make_reader.

* seastar-dev.git haaawk/remove_memtable_make_reader_v2_rebased:
  Remove memtable::make_reader
  Stop using memtable::make_reader in row_cache_stress_test
  Stop using memtable::make_reader in row_cache_test
  Stop using memtable::make_reader in mutation_test
  Stop using memtable::make_reader in streamed_mutation_test
  Stop using memtable::make_reader in memtable_snapshot_source.hh
  Stop using memtable::make_reader in memtable::apply
  Add consume_partitions(flat_mutation_reader& reader, Consumer consumer)
  Add default parameter values in make_combined_reader
  Migrate test_virtual_dirty_accounting_on_flush to flat reader
  Migrate test_adding_a_column_during_reading_doesnt_affect_read_result
  Simplify flat_reader_assertions& produces(const mutation& m)
  Migrate test_partition_version_consistency_after_lsa_compaction_happens
  flat_mutation_reader: Allow setting buffer capacity
  Add next_mutation() to flat_mutation_reader_assertions
  cf::for_all_partitions::iteration_state: don't store schema_ptr
  read_mutation_from_flat_mutation_reader: don't take schema_ptr
  Migrate test_fast_forward_to_after_memtable_is_flushed to flat reader

(cherry picked from commit b0a56a91c2)
2017-12-21 14:10:31 +01:00
Tomasz Grabiec
17febfdb0e database: Move operator<<() overloads to appropriate source files
(cherry picked from commit fd7ab5fe99)
2017-12-21 14:10:24 +01:00
Vlad Zolotarov
830bf99528 tests: sstable_datafile_test: fix the compilation error on Power
'char' and int8_t ('unsigned char') are different types. 'bytes' base type
is int8_t - use the correct type for casting.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
(cherry picked from commit 22ca5d2596)
2017-12-21 14:09:47 +01:00
Tomasz Grabiec
90000d9861 Merge "Fixes for multi_range_reader" from Paweł
The following patches contain fixes for skipping to the next partition
in multi_range_reader and completely disable support for fast
forwarding inside a single partition, which is not needed and would only
add unnecessary complexity.

* https://github.com/pdziepak/scylla.git fix-multi_range_reader/v1:
  flat_multi_range_mutation_reader: disallow
    streamed_mutation::forwarding
  flat_multi_range_mutation_reader: clear buffer on next_partition()
  tests/flat_multi_range_mutation_reader: test skipping to next
    partition

(cherry picked from commit 71cc63dfa6)
2017-12-21 14:07:15 +01:00
Asias He
46dae42dcd streaming: One cf per time on sender
In the case where there are a large number of column families, the sender
will send all the column families in parallel. We allow 20% of shard
memory for streaming on the receiver, so each column family gets 1/N of
that for its memtable, where N is the number of in-flight column
families. A large N causes a lot of small sstables to be generated.

It is possible to have multiple senders to a single receiver, e.g. when
a new node joins the cluster; the maximum number of in-flight column
families is then the number of peer nodes. The column families are sent
in order of cf_id. It is not guaranteed that all peers have the same
speed, so they are not necessarily sending the same cf_id at the same
time; still, there is a chance that some of the peers are sending the
same cf_id.

Fixes #3065

Message-Id: <46961463c2a5e4f1faff232294dc485ac4f1a04e.1513159678.git.asias@scylladb.com>
(cherry picked from commit a9dab60b6c)
2017-12-20 17:07:39 +01:00
Tomasz Grabiec
d6395634ad range_tombstone_list: Fix insert_from()
end_bound was not updated in one of the cases in which end and
end_kind were changed; as a result, later merging decisions using
end_bound were incorrect: end_bound was using the new key but the old
end_kind.

Fixes #3083.
Message-Id: <1513772083-5257-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit dfe48bbbc7)
2017-12-20 15:31:51 +01:00
Avi Kivity
d886b3def4 Merge "Fix read amplification in sstable reads" from Paweł
"4b9a34a85425d1279b471b2ff0b0f2462328929c "Merge sstable_data_source
into sstable_mutation_reader" has introduced unintentional changes, some
of them causing excessive read amplification during empty range reads.
The following patches restore the previous behaviour."

* tag 'fix-read-amplification/v1' of https://github.com/pdziepak/scylla:
  sstables: set _read_enabled to false if possible
  sstables: set _single_partition_read for single parititon reads

(cherry picked from commit 772d1f47d7)
2017-12-19 18:18:06 +02:00
Tomasz Grabiec
bcb06bb043 flat_mutation_reader: Fix make_nonforwardable()
It emitted end-of-stream prematurely if the buffer was full.
Message-Id: <1513697716-32634-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 6a6bf58b98)
2017-12-19 16:01:21 +00:00
Tomasz Grabiec
4606300b25 row_cache: Fix single_partition_populating_reader not waiting on create_underlying() to resolve
Results in undefined behavior.
Message-Id: <1513691679-27081-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 7b36c8423c)
2017-12-19 16:12:37 +02:00
Piotr Jastrzebski
282d93de99 Use row_cache::make_flat_reader in column_family::make_reader
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Message-Id: <ba1659ceed8676f45942ce6e7506158026947345.1513687259.git.piotr@scylladb.com>
(cherry picked from commit 570fc5afed)
2017-12-19 14:42:52 +02:00
Avi Kivity
52d3403cb0 Update scylla-ami submodule
* dist/ami/files/scylla-ami be90a3f...3366c93 (1):
  > scylla_install_ami: skip ec2_check while building AMI

Still tracking master.
2017-12-19 10:12:05 +02:00
Tomasz Grabiec
97f6073699 Merge "Migrate cache to use flat_mutation_reader" from Piotr
(cherry picked from commit 37b19ae6ba)
2017-12-18 20:51:09 +01:00
Glauber Costa
5454e6e168 conf: document listen_on_broadcast_address
That's a supported feature that is listed in our help message, but it
is not present in the yaml file.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20171215011240.16027-1-glauber@scylladb.com>
(cherry picked from commit b8f49fcc14)
2017-12-18 17:00:46 +02:00
Vlad Zolotarov
498fb11c70 messaging_service: fix a mutli-NIC support
Don't enforce the outgoing connections from the 'listen_address'
interface only.

If 'local_address' is given to connect() it will enforce it to use a
particular interface to connect from, even if the destination address
should be accessed from a different interface. If we don't specify the
'local_address' the source interface will be chosen according to the
routing configuration.

Fixes #3066

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1513372688-21595-1-git-send-email-vladz@scylladb.com>
(cherry picked from commit be6f8be9cb)
2017-12-17 10:51:37 +02:00
Avi Kivity
a6b4881994 Merge "SSTable summary regeneration fixes" from Raphael
"Fixes #3057."

* 'summary_recreation_fixes_v2' of github.com:raphaelsc/scylla:
  tests: sstable summary recreation sanity test
  sstables: make loading of sstable without summary to work again
  sstables: fix summary generation with dynamic index sampling

(cherry picked from commit 11de20fc33)
2017-12-17 09:39:16 +02:00
Takuya ASADA
9848df6667 dist/common/systemd: specify correct repo file path for housekeeping service on Ubuntu/Debian
Currently scylla-housekeeping-daily.service/-restart.service hardcode
"--repo-files '/etc/yum.repos.d/scylla*.repo'" to specify the CentOS .repo
file, but we use the same .service files for Ubuntu/Debian.
That doesn't work correctly there; we need to specify a .list file for
Debian variants.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1513385159-15736-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit c2e87f4677)
2017-12-16 22:03:42 +02:00
Piotr Jastrzebski
2090a5f8f6 Fix build by removing semicolon after concept
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Message-Id: <4504cf47be0a451c58052476bc8cc4f9cba59472.1513248094.git.piotr@scylladb.com>
(cherry picked from commit ac1d2f98e4)
2017-12-14 12:48:29 +02:00
Amos Kong
7634ed39eb Reset default cluster_name back to 'Test Cluster' for compatibility
Some users used the original default cluster_name 'Test Cluster'; they
will fail to start a node because of the cluster_name change if they use
the new scylla.yaml.

'ScyllaDB Cluster' isn't more beautiful than 'Test Cluster', so reset
back to the original to avoid problems for users.

Fixes #3060

Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <8c9dab8a64d0f4ab3a5d6910b87af696c60e5076.1513072453.git.amos@scylladb.com>
(cherry picked from commit b07de93636)
2017-12-13 16:58:10 +02:00
Avi Kivity
fb9b15904a Merge "Convert sstable readers to flat streams" from Paweł
"While aa8c2cbc16 'Merge "Migrate sstables
to flat_mutation_reader" from Piotr' converted the low-level sstable
reader to the new flat_mutation_reader interface, there were still
multiple readers related to sstables that required converting,
including:
 - restricted reader
 - filtering reader
 - single partition sstable reader
This series completes their conversion to the flat stream interface."

* tag 'flat_mutation_reader-sstable-readers/v2' of https://github.com/pdziepak/scylla:
  db: convert single_key_sstable_reader to flat streams
  db: fully convert incremental_reader_selector to flat readers
  db: make make_range_sstable_reader() return flat reader
  db: make column_family::make_reader() return flat reader
  db: make column_family::make_sstable_reader() return a flat reader
  filtering_reader: switch to flat mutation fragment streams
  filtering_reader: pass a const dht::decorated_key& to the callback
  mutation_reader: drop make_restricted_reader()
  db: use make_restricted_flat_reader
  mutation_reader: convert restricted reader to flat streams

(cherry picked from commit 6cb3b29168)
2017-12-13 15:38:22 +02:00
Glauber Costa
4e11f05aa7 database: delete created SSTables if streaming writes fail
We have had an issue recently where failed SSTable writes left the
generated SSTables dangling in a potentially invalid state. If the write
had, for instance, started and generated tmp TOCs but not finished,
those files would be left for dead.

We had fixed this in commit b7e1575ad4,
but streaming memtables still have the same issue.

Note that we can't fix this in the common function
write_memtable_to_sstable because different flushers have different
retry policies.

Fixes #3062

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20171213011741.8156-1-glauber@scylladb.com>
(cherry picked from commit 1aabbc75ab)
2017-12-13 10:09:43 +02:00
Jesse Haber-Kucharsky
516a1ae834 cql3: Add missing return
Since `return` is missing, the "else" branch is also taken, and this
results in a user being created from scratch.

Fixes #3058.
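
The fall-through can be sketched like this (hypothetical names, not the actual cql3 code): without the marked `return`, control continues past the "alter" branch and the "create from scratch" branch runs for an existing user too.

```cpp
#include <string>

// Hypothetical stand-in for the auth code path: `with_fix` toggles the
// `return` this commit adds. Without it, control falls through and the
// "create from scratch" branch also runs for an existing user.
std::string handle_user(bool user_exists, bool with_fix) {
    std::string action;
    if (user_exists) {
        action = "alter";
        if (with_fix) {
            return action;  // the missing `return` restored by this commit
        }
    }
    // Reached unintentionally for existing users when the `return` is absent.
    action = "create";
    return action;
}
```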

Signed-off-by: Jesse Haber-Kucharsky <jhaberku@scylladb.com>
Message-Id: <bf3ca5907b046586d9bfe00f3b61b3ac695ba9c5.1512951084.git.jhaberku@scylladb.com>
(cherry picked from commit 7e3a344460)
2017-12-11 09:55:27 +02:00
Paweł Dziepak
be5127388d Merge "Fix range tombstone emitting which led to skipping over data" from Tomasz
"Fixes cache reader to not skip over data in some cases involving overlapping
range tombstones in different partition versions and discontinuous cache.

Introduced in 2.0

Fixes #3053."

* tag 'tgrabiec/fix-range-tombstone-slicing-v2' of github.com:scylladb/seastar-dev:
  tests: row_cache: Add reproducer for issue #3053
  tests: mvcc: Add test for partition_snapshot::range_tombstones()
  mvcc: Optimize partition_snapshot::range_tombstones() for single version case
  mvcc: Fix partition_snapshot::range_tombstones()
  tests: random_mutation_generator: Do not emit dummy entries at clustering row positions

(cherry picked from commit 051cbbc9af)
2017-12-08 13:03:32 +01:00
Tomasz Grabiec
6d0679ca72 mvcc: Extract partition_entry::add_version()
(cherry picked from commit 52cabe343c)
2017-12-08 12:33:49 +01:00
Avi Kivity
eb67b427b2 Merge "SSTable resharding fixes" from Raphael
"Didn't affect any release. Regression introduced in 301358e.

Fixes #3041"

* 'resharding_fix_v4' of github.com:raphaelsc/scylla:
  tests: add sstable resharding test to test.py
  tests: fix sstable resharding test
  sstables: Fix resharding by not filtering out mutation that belongs to other shard
  db: introduce make_range_sstable_reader
  rename make_range_sstable_reader to make_local_shard_sstable_reader
  db: extract sstable reader creation from incremental_reader_selector
  db: reuse make_range_sstable_reader in make_sstable_reader

(cherry picked from commit d934ca55a7)
2017-12-07 16:43:28 +02:00
Amos Kong
2931324b34 dist/debian: add scylla-tools-core to depends list
Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <db39cbda0e08e501633556ab238d816e357ad327.1512646123.git.amos@scylladb.com>
(cherry picked from commit 8fd5d27508)
2017-12-07 13:40:46 +02:00
Amos Kong
614519c4be dist/redhat: add scylla-tools-core to requires list
Fixes #3051

Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <f7013a4fbc241bb4429d855671fee4b845b255cd.1512646123.git.amos@scylladb.com>
(cherry picked from commit eb3b138ee2)
2017-12-07 13:40:46 +02:00
Botond Dénes
203b924c76 mutation_reader_merger: don't query the kind of moved-from fragment
Call mutation_fragment_kind() on the fragment *before* it is moved, as
there are no guarantees about the state of a moved-from object (apart
from it being in a valid one).
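
A minimal sketch of the hazard and the fix (illustrative types, not the actual mutation_reader_merger code):

```cpp
#include <string>
#include <utility>
#include <vector>

// Hypothetical fragment type; in the real reader the kind comes from
// mutation_fragment_kind(). After std::move only "some valid state" is
// guaranteed, so the kind must be captured before the move.
struct fragment {
    std::string kind;
};

std::string push_and_report(std::vector<fragment>& out, fragment f) {
    std::string k = f.kind;      // query the kind *before* moving
    out.push_back(std::move(f)); // fragment is moved from here on
    return k;                    // safe: the moved-from object is not read
}
```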

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <c47b1e22877bb9499f1fbb9d513093c29ef1901b.1512635422.git.bdenes@scylladb.com>
(cherry picked from commit 1ff65f41fd)
2017-12-07 11:41:04 +01:00
Botond Dénes
f4f957fa53 Add streamed mutation fast-forwarding unit test for the flat combined-reader
Test for the bug fixed by 9661769.

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <fc917bae8e9c99f026bf7b366e6e9d39faf466af.1512630741.git.bdenes@scylladb.com>
(cherry picked from commit 9fce51f8a0)
2017-12-07 11:40:53 +01:00
Botond Dénes
39e614a444 combined_mutation_reader: fix fast-forwarding related row-skipping bug
When fast forwarding is enabled and all readers positioned inside the
current partition return EOS, return EOS from the combined-reader too,
instead of skipping to the next partition when idle readers (positioned
at some later partition) are available. Skipping ahead would otherwise
drop rows in some cases.

The fix is to distinguish EOS'd readers that are only halted (waiting
for a fast-forward) from those really out of data. To achieve this we
track the last fragment-kind the reader emitted. If that was a
partition-end then the reader is out of data, otherwise it might emit
more fragments after a fast-forward. Without this additional information
it is impossible to determine why a reader reached EOS and the code
later may make the wrong decision about whether the combined-reader as
a whole is at EOS or not.
Also when fast-forwarding between partition-ranges or calling
next_partition() we set the last fragment-kind of forwarded readers
because they should emit a partition-start, otherwise they are out of
data.
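
The halted-vs-exhausted distinction described above can be sketched like this (illustrative types, not the actual combined_mutation_reader internals):

```cpp
// A sub-reader at EOS is truly out of data only if the last fragment it
// emitted closed a partition; otherwise it is merely halted, waiting
// for a fast-forward.
enum class fragment_kind { partition_start, clustering_row, partition_end };

struct reader_state {
    fragment_kind last_emitted;
    bool at_eos;
};

bool out_of_data(const reader_state& r) {
    return r.at_eos && r.last_emitted == fragment_kind::partition_end;
}
```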

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <6f0b21b1ec62e1197de6b46510d5508cdb4a6977.1512569218.git.bdenes@scylladb.com>
(cherry picked from commit 9661769313)
2017-12-06 16:42:06 +02:00
Paweł Dziepak
d8521d0fa2 Merge "Flatten combined_mutation_reader" from Botond
"Convert combined_mutation_reader into a flat_mutation_reader impl. For
now - in the name of incremental progress - all consumers are updated to
use the combined reader through the
mutation_reader_from_flat_mutation_reader adaptor. The combined reader also
uses all it's sub mutation_readers through the
flat_mutation_reader_from_mutation_reader adaptor."

* 'bdenes/flatten-combined-reader-v8' of https://github.com/denesb/scylla:
  Add unit tests for the combined reader - selector interactions
  Add flat_mutation_reader overload of make_combined_reader
  Flatten the implementation of combined_mutation_reader
  Add mutation_fragment_merger
  mutation_fragment::apply(): handle partition start and end too
  Add non-const overload of partition_start::partition_tombstone()
  Make combined_mutation_reader a flat_mutation_reader
  Move the mutation merging logic to combined_mutation_reader
  Remove the unnecessary indirection of mutation_reader_merger::next()
  Move the implementation of combined_mutation_reader into mutation_reader_merger
  Remove unused mutation_and_reader::less_compare and operator<

(cherry picked from commit 046991b0b7)
2017-12-06 16:41:42 +02:00
Takuya ASADA
f60696b55f dist/debian: need apt-get update after installing GPG key for 3rdparty repo
We need to run apt-get update after installing the GPG key, otherwise we still
get an unauthenticated-package error during the Debian package build.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1512556948-29398-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit aeb6ebce5a)
2017-12-06 12:43:42 +02:00
Avi Kivity
1b15a0926a Merge "Make sstable tests use flat_mutation_reader" from Paweł
"This series makes sstable tests use flat stream interface. The main
motivation is to allow eventual removal of mutation_reader and
streamed_mutation and ensuring that the conversion between the
interfaces doesn't hide any bugs that would be otherwise found."

* tag 'flat_mutation_reader-sstable-tests/v1' of https://github.com/pdziepak/scylla:
  sstables: drop read_range_rows()
  tests/mutation_reader: stop using read_range_rows()
  incremental_reader_selector: do not use read_range_rows()
  tests/sstable: stop using read_range_rows()
  sstables: drop read_row()
  tests/sstables: use read_row_flat() instead of read_row()
  database: use read_row_flat() instead of read_row()
  tests/sstable_mutation_test: get flat_mutation_readers from mutation sources
  tests/sstables: make sstable_reader return flat_mutation_reader
  sstable: drop read_row() overload accepting sstable::key
  tests/sstable: stop using read_row() with sstable::key
  tests/flat_mutation_reader_assertions: add has_monotonic_positions()
  tests/flat_mutation_reader_assertions: add produces(Range)
  tests/flat_mutation_reader_assertions: add produces(mutation)
  tests/flat_mutation_reader_assertions: add produces(dht::decorated_key)
  tests/flat_mutation_reader_assertions: add produces(mutation_fragment::kind)
  tests/flat_mutation_reader_assertions: fix fast forwarding

(cherry picked from commit 601a03dda7)
2017-12-06 10:12:36 +02:00
Takuya ASADA
32efd3902c dist/debian: install CA certificates before install repo GPG key
Since the pbuilder chroot environment does not install CA certificates by
default, accessing https://download.opensuse.org causes a certificate
verification error.
So we need to install them before installing the 3rdparty repo GPG key.

Also, checking for the existence of gpgkeys_curl is not needed, since it is
never installed: we are running the script in a clean chroot environment.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1512517001-27524-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 8f02967a3b)
2017-12-06 10:12:17 +02:00
Avi Kivity
6b2f7f8c39 Merge "enable secure-apt for Ubuntu/Debian pbuilder" from Takuya
* 'debian-secure-apt-3rdparty-v3' of https://github.com/syuu1228/scylla:
  dist/debian: support Ubuntu 18.04LTS
  dist/debian: disable ALLOWUNTRUSTED
  dist/debian: enable secure-apt for Debian
  dist/debian: enable secure-apt for Ubuntu

(cherry picked from commit a25b5e30f8)
2017-12-04 14:47:23 +02:00
Takuya ASADA
370a6482e3 dist/debian: disable entire pybuild actions
Even after 25bc18b was committed, we still see a build error similar to #3036
on some environments, but now in dh_auto_test rather than dh_auto_install
(see #3039).

So we need to disable all pybuild actions, not just dh_auto_install.

Fixes #3039

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1512185097-23828-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 8c403ea4e0)
2017-12-02 19:37:01 +02:00
Takuya ASADA
981644167b dist/debian: skip running dh_auto_install on pybuild
We are getting a package build error in dh_auto_install, which is invoked by
pybuild.
But since we handle all installation in debian/scylla-server.install, we can
simply skip running dh_auto_install.

Fixes #3036

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1512065117-15708-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 25bc18b8ff)
2017-12-01 16:06:44 +02:00
Avi Kivity
6f669da227 Update seastar submodule
* seastar 78cd87f...8d254a1 (2):
  > fstream: do not ignore dma_write return value
  > Update dpdk submodule

Fixes dpdk build and missing file write error check.
2017-11-30 10:43:22 +02:00
Avi Kivity
bdf1173075 Point seastar submodule at scylla-seastar.git
This allows fixes to seastar to be cherry-picked into
scylla-seastar.git branch-2.1.
2017-11-30 10:40:51 +02:00
Duarte Nunes
106c69ad45 compound_compact: Change universal reference to const reference
The universal reference was introduced so we could bind an rvalue to
the argument, but it would have sufficed to make the argument a const
reference. This is also more consistent with the function's other
overload.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20171129132758.19654-1-duarte@scylladb.com>
(cherry picked from commit cda3ddd146)
2017-11-29 14:42:08 +01:00
Tomasz Grabiec
740fcc73b8 Merge "compact_storage serialization fixes" from Duarte
Fix two issues with serializing non-compound range tombstones as
compound: convert a non-compound clustering element to compound and
actually advertise the issue to other nodes.

* git@github.com:duarten/scylla.git  rt-compact-fixes/v1:
  compound_compact: Allow rvalues in size()
  sstables/sstables: Convert non-compound clustering element to compound
  tests/sstable_mutation_test: Verify we can write/read non-correct RTs
  service/storage_service: Export non-compound RT feature

(cherry picked from commit e9cce59b85)
2017-11-29 14:18:21 +01:00
Raphael S. Carvalho
cefbb0b999 sstables: fix data_consume_context's move operator and ctor
After 7f8b62bc0b, its move operator and ctor broke. That potentially
leads to errors, because the data_consume_context dtor moves the sstable
ref into a continuation while waiting for in-flight reads from the input
stream. Otherwise, the sstable can be destroyed in the meantime and the
file descriptor would be invalid, leading to EBADF.

Fixes #3020.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20171129014917.11841-1-raphaelsc@scylladb.com>
(cherry picked from commit f699cf17ae)
2017-11-29 09:54:27 +01:00
Tomasz Grabiec
02f43f5e4c Merge "Convert memtable flush reader to flat streams" from Paweł
This series converts memtable flush reader to the new flat mutation
readers. Just like the scanning reader, flush reader concatenates
multiple partition snapshot readers in order to provide a stream
of all partitions in the memtable.

* https://github.com/pdziepak/scylla.git flat_mutation_reader-memtable-flush/v1
   tests/flat_mutation_reader_assertion: add produces_partition()
   memtable: make make_flush_reader() return flat_mutation_reader
   flat_mutation_reader: add optimised flat_mutation_reader_opt
   memtable: switch flush reader implementation to flat streams
   tests/memtable: add test for flush reader

(cherry picked from commit 04106b4c96)
2017-11-27 20:29:25 +01:00
Duarte Nunes
8850ef7c59 tests/sstable_mutation_test: Change make_reader to make_flat_reader
A merge conflict between 596ebaed1f and
bd1efbc25c caused the test to fail to
build.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
(cherry picked from commit 4a6ffa3f5c)
2017-11-27 09:59:56 +01:00
Duarte Nunes
8567723a7b tests: Initialize storage service for some tests
These tests now require having the storage service initialized, which
is needed to decide whether correct non-compound range tombstones
should be emitted or not.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20171126152921.5199-1-duarte@scylladb.com>
(cherry picked from commit 922f095f22)
2017-11-26 17:41:20 +02:00
Duarte Nunes
b0b7c73acd cql3/delete_statement: Allow non-range deletions on non-compound schemas
This patch fixes a regression introduced in
1c872e2ddc.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20171126102333.3736-1-duarte@scylladb.com>
(cherry picked from commit 15fbb8e1ca)
2017-11-26 12:29:27 +02:00
Takuya ASADA
eb82d66849 dist/debian: link libgcc dynamically
As we discussed on the thread (https://github.com/scylladb/scylla/issues/2941),
since we override symbols in libgcc, we need to link libgcc dynamically for
Ubuntu/Debian too (CentOS already does).

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1511542866-21486-2-git-send-email-syuu@scylladb.com>
(cherry picked from commit 7380a6088b)
2017-11-25 20:10:15 +02:00
Takuya ASADA
eb12fb3733 dist/debian: switch to our PPA versions of gcc-72
Now we have gcc-7.2 in our PPA for Ubuntu 16.04/14.04, let's switch to it.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1511542866-21486-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit df6546d151)
2017-11-25 20:10:14 +02:00
Tomasz Grabiec
60d011c9c0 Merge "Convert sstable writers to flat mutation readers" from Paweł
The following patches convert sstable writers to use flat mutation
readers instead of the legacy mutation_reader interface.
Writers were already using flat consumer interface and used
consume_flattened_in_thread(), so most of the work was limited to
providing an appropriate equivalent for flat mutation readers.

* https://github.com/pdziepak/scylla.git flat_mutation_reader-sstable-write/v1:
  flat_mutation_reader: move consumer_adapter out of consume()
  flat_mutation_reader: introduce consume_in_thread()
  tests/flat_mutation_reader: test consume_in_thread()
  sstables: switch write_components() to flat_mutation_reader
  streamed_mutation: drop streamed_mutation_returning()
  sstables: convert compaction to flat_mutation_reader
  mutation_reader: drop consume_flattened_in_thread()

(cherry picked from commit 596ebaed1f)
2017-11-24 18:49:32 +01:00
Tomasz Grabiec
7c3390bde8 Merge "Fixes to sstable files for non-compound schemas" from Duarte
This series mainly fixes issues with the serialization of promoted
index entries for non-compound schemas and with the serialization of
range tombstones, also for non-compound schemas.

We lift the correct cell name writing code into its own function,
and direct all users to it. We also ensure backward compatibility with
incorrectly generated promoted indexes and range tombstones.

Fixes #2995
Fixes #2986
Fixes #2979
Fixes #2992
Fixes #2993

* git@github.com:duarten/scylla.git  promoted-index-serialization/v3:
  sstables/sstables: Unify column name writers
  sstables/sstables: Don't write index entry for a missing row maker
  sstables/sstables: Reuse write_range_tombstone() for row tombstones
  sstables/sstables: Lift index writing for row tombstones
  sstables/sstables: Leverage index code upon range tombstone consume
  sstables/sstables: Move out tombstone check in write_range_tombstone()
  sstables/sstables: A schema with static columns is always compound
  sstables/sstables: Lift column name writing logic
  sstables/sstables: Use schema-aware write_column_name() for
    collections
  sstables/sstables: Use schema-aware write_column_name() for row marker
  sstables/sstables: Use schema-aware write_column_name() for static row
  sstables/sstables: Writing promoted index entry leverages
    column_name_writer
  sstables/sstables: Add supported feature list to sstables
  sstables/sstables: Don't use incorrectly serialized promoted index
  cql3/single_column_primary_key_restrictions: Implement is_inclusive()
  cql3/delete_statement: Constrain range deletions for non-compound
    schemas
  tests/cql_query_test: Verify range deletion constraints
  sstables/sstables: Correctly deserialize range tombstones
  service/storage_service: Add feature for correct non-compound RTs
  tests/sstable_*: Start the storage service for some cases
  sstables/sstable_writer: Prepare to control range tombstone
    serialization
  sstables/sstables: Correctly serialize range tombstones
  tests/sstable_assertions: Fix monotonicity check for promoted indexes
  tests/sstable_assertions: Assert a promoted index is empty
  tests/sstable_mutation_test: Verify promoted index serializes
    correctly
  tests/sstable_mutation_test: Verify promoted index repeats tombstones
  tests/sstable_mutation_test: Ensure range tombstone serializes
    correctly
  tests/sstable_datafile_test: Add test for incorrect promoted index
  tests/sstable_datafile_test: Verify reading of incorrect range
    tombstones
  sstables/sstable: Rename schema-oblivious write_column_name() function
  sstables/sstables: No promoted index without clustering keys
  tests/sstable_mutation_test: Verify promoted index is not generated
  sstables/sstables: Optimize column name writing and indexing
  compound_compat: Don't assume compoundness

(cherry picked from commit bd1efbc25c)
2017-11-24 18:49:19 +01:00
Tomasz Grabiec
95b55a0e9d tests: sstable: Make tombstone_purge_test more reliable
A TTL of 1 second may cause the cell to expire right after we write it,
if the seconds component of the current time changes right after the
write. Use a larger TTL to avoid spurious failures due to this.
Message-Id: <1511463392-1451-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 35e404b1a2)
2017-11-24 18:49:16 +01:00
Amnon Heiman
7785d8f396 estimated_histogram: update the sum and count when merging
When merging histograms the count and the sum should be updated.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <20171122154822.23855-1-amnon@scylladb.com>
(cherry picked from commit 3f8d9a87ee)
2017-11-22 16:57:08 +01:00
Glauber Costa
b805e37d30 estimated_histogram: also fill up sum metric
Prometheus histograms have 3 embedded metrics: count, buckets, and sum.
Currently we fill up count and buckets but sum is left at 0. This is
particularly bad, since according to the prometheus documentation, the
best way to calculate histogram averages is to write:

  rate(metric_sum[5m]) / rate(metric_count[5m])

One way of keeping track of the sum is to add the value we sampled
every time we sample. However, the estimated histogram interface
has a method that adds a metric while adjusting the count for missing
metrics (add_nano()).

That makes accumulating a sum inaccurate, as we will have no values for
the points that were added. To overcome that, when we call add_nano(),
we pretend we are introducing new_count - _count metrics, all with the
same value.
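
A hedged sketch of that accounting (names are illustrative, not the actual estimated_histogram interface): when one sample stands in for new_count - _count missed ones, credit the sum as if they all had the sampled value.

```cpp
#include <cstdint>

// Illustrative only: track count and sum the way the commit describes,
// pretending the missed samples all had the same value as the one seen.
struct histogram_sketch {
    uint64_t count = 0;
    int64_t sum = 0;

    void add_nano(int64_t value, uint64_t new_count) {
        uint64_t missed = new_count - count;          // samples we did not see
        sum += value * static_cast<int64_t>(missed);  // as if each had `value`
        count = new_count;
    }
};
```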

Long term, doing away with sampling may help us provide more accurate
results.

After this patch, we are able to correctly calculate latency averages
through the data exported in prometheus.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20171122144558.7575-1-glauber@scylladb.com>
(cherry picked from commit 6c4e8049a0)
2017-11-22 16:57:07 +01:00
Tomasz Grabiec
a790b8cd20 Merge "Remove sstable::read_rows" from Piotr
* seastar-dev.git haaawk/flat_reader_remove_read_rows:
  sstable_mutation_test: use read_rows_flat instead of read_rows
  perf_sstable: use read_rows_flat instead of read_rows
  Remove sstable::read_rows

(cherry picked from commit e9ffe36d65)
2017-11-22 16:11:31 +01:00
Tomasz Grabiec
a10ea80a63 Merge "Migrate sstables to flat_mutation_reader" from Piotr
Introduce sstable::read_row_flat and sstable::read_range_rows_flat methods
and use them in sstable::as_mutation_source.

* https://github.com/scylladb/seastar-dev/tree/haaawk/flat_reader_sstables_v3:
  Introduce conversion from flat_mutation_reader to streamed_mutation
  Add sstables::read_rows_flat and sstables::read_range_rows_flat
  Turn sstable_mutation_reader into a flat_mutation_reader
  sstable: add getter for filter_tracker
  Move mp_row_consumer methods implementations to the bottom
  Remove unused sstable_mutation_reader constructor
  Replace "sm" with "partition" in get_next_sm and on_sm_finished
  Move advance_to_upper_bound above sstable_mutation_reader
  Store sstable_mutation_reader pointer in mp_row_consumer
  Stop using streamed_mutation in consumer and reader
  Stop using streamed_mutation in sstable_data_source
  Delete sstable_streamed_mutation
  Introduce sstable::read_row_flat
  Migrate sstable::as_mutation_source to flat_mutation_reader
  Remove single_partition_reader_adaptor
  Merge data_consume_context::impl into data_consume_context
  Create data_consume_context_opt.
  Merge on_partition_finished into mark_partition_finished
  Check _partition_finished instead of _current_partition_key
  Merge sstable_data_source into sstable_mutation_reader
  Remove sstable_data_source
  Remove get_next_partition and partition_header

(cherry picked from commit aa8c2cbc16)
2017-11-22 15:49:22 +01:00
Takuya ASADA
91a5c9d20c dist/redhat: avoid hardcoding GPG key file path on scylla-epel-7-x86_64.cfg
Since we want to support cross building, we shouldn't hardcode the GPG key
file path, even though these files are provided by recent versions of mock.

This fixes a build error on some older build environments such as CentOS 7.2.

Fixes #3002

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1511277722-22917-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit c1b97d11ea)
2017-11-21 17:26:53 +02:00
Takuya ASADA
f846b897bf configure.py: suppress 'nonnull-compare' warning on antlr3
We get the following warning from the antlr3 header when we compile Scylla with gcc-7.2:
/opt/scylladb/include/antlr3bitset.inl: In member function 'antlr3::BitsetList<AllocatorType>::BitsetType* antlr3::BitsetList<AllocatorType>::bitsetLoad() [with ImplTraits = antlr3::TraitsBase<antlr3::CustomTraitsBase>]':
/opt/scylladb/include/antlr3bitset.inl:54:2: error: nonnull argument 'this' compared to NULL [-Werror=nonnull-compare]

To make it compilable we need to specify '-Wno-nonnull-compare' on cflags.

Message-Id: <1510952411-20722-2-git-send-email-syuu@scylladb.com>
(cherry picked from commit f26cde582f)
2017-11-21 17:26:53 +02:00
Takuya ASADA
8d7c34bf68 dist/debian: switch Debian 3rdparty packages to external build service
Switch Debian 3rdparty packages to our OBS repo
(https://build.opensuse.org/project/subprojects/home:scylladb).

We don't use 3rdparty packages on dist/debian/dep, so dropped them.
Also we switch Debian to gcc-7.2/boost-1.63 on same time.

Due to packaging issues following packages doesn't renamed our 3rdparty
package naming rule for now:
 - gcc-7: renamed as 'xxx-scylla72', instead of scylla-xxx-72.
 - boost1.63: doesn't renamed, also doesn't changed prefix to /opt/scylladb

Message-Id: <1510952411-20722-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit ab9d7cdc65)
2017-11-21 17:26:53 +02:00
Duarte Nunes
7449586a26 thrift/server: Handle exception within gate
The exception handling code inspects server state, which could be
destroyed before the handle_exception() task runs since it runs after
exiting the gate. Move the exception handling inside the gate and
avoid scheduling another accept if the server has been stopped.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20171116122921.21273-1-duarte@scylladb.com>
(cherry picked from commit 34a0b85982)
2017-11-21 15:52:38 +02:00
Daniel Fiala
b601b9f078 utils/big_decimal: Fix compilation issue with conversion of cpp_int to uint64_t.
Signed-off-by: Daniel Fiala <daniel@scylladb.com>
Message-Id: <20171121134854.16278-1-daniel@scylladb.com>
(cherry picked from commit 21ea05ada1)
2017-11-21 15:52:01 +02:00
Tomasz Grabiec
1ec81cda37 Merge "Convert queries to flat mutation readers" from Paweł
These patches convert queries (data, mutation and counter) to flat
mutation readers. All of them already use consume_flattened() to
consume a flat stream of data, so the only major missing thing
was adding support for reversed partitions to
flat_mutation_reader::consume().

* pdziepak flat_mutation_reader-queries/v3-rebased:
  flat_mutation_reader: keep reference to decorated key valid
  flat_mutation_reader: support consuming reversed partitions
  tests/flat_mutation_reader: add test for
    flat_mutation_reader::consume()
  mutation_partition: convert queries to flat_mutation_readers
  tests/row_cache_stress_test: do not use consume_flattened()
  mutation_reader: drop consume_flattened()
  streamed_mutation: drop reverse_streamed_mutation()

(cherry picked from commit 6969a235f3)
2017-11-21 12:58:41 +01:00
Paweł Dziepak
e87a2bc9c0 streamed_mutation: make emit_range_tombstone() exception safe
For a time, a range tombstone that was already removed from the tree
is owned by a raw pointer. This doesn't end well if the creation of
a mutation fragment or a call to push_mutation_fragment() throws.
Message-Id: <20171121105749.16559-1-pdziepak@scylladb.com>

(cherry picked from commit 1b936876b7)
2017-11-21 12:35:47 +01:00
Tomasz Grabiec
b84d13d325 Merge "Fix reversed queries with range tombstones" from Paweł
This series reworks handling of range tombstones in reversed queries
so that they are applied to correct rows. Additionally, the concept
of flipped range tombstones is removed, since it only made it harder
to reason about the code.

Fixes #2982.

* https://github.com/pdziepak/scylla fix-reverse-query-range-tombstone/v2:
  streamed_mutation: fix reversing range tombstones
  range_tombstone: drop flip()
  tests/cql_query_test: test range tombstones and reverse queries
  tests/range_tombstone_list: add test for range_tombstone_accumulator

(cherry picked from commit cec5b0a5b8)
2017-11-21 12:35:37 +01:00
Botond Dénes
b5abf6541d Add fast-forwarding with no data test to mutation_source_test
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <9cb630bf9441e178b2040709f92767d4a740a875.1511180262.git.bdenes@scylladb.com>
(cherry picked from commit f059e71056)
2017-11-21 12:34:46 +01:00
Botond Dénes
8cf869cb37 flat_mutation_reader_assertions: add fast_forward_to(position_range)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <7b530909cf188887377aec3985f9f8c0e3b9b1e8.1511180262.git.bdenes@scylladb.com>
(cherry picked from commit a1a0d445d6)
2017-11-21 12:34:43 +01:00
Botond Dénes
df509761b0 flat_mutation_reader_from_mutation_reader(): make ff more resilient
Currently flat_mutation_reader_from_mutation_reader()'s
converting_reader will throw std::runtime_error if fast_forward_to() is
called when its internal streamed_mutation_opt is disengaged. This can
create problems if this reader is a sub-reader of a combined reader, as the
latter has no way to determine the source of a sub-reader's EOS. A reader
can be at EOS either because it reached the end of the current
position_range or because it doesn't have any more data.
To avoid this, instead of throwing, we just silently ignore the fact that
the streamed_mutation_opt is disengaged and set _end_of_stream to true,
which is still correct.

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <83d309b225950bdbbd931f1c5e7fb91c9929ba1c.1511180262.git.bdenes@scylladb.com>
(cherry picked from commit 8065dca4a1)
2017-11-21 12:34:40 +01:00
Vlad Zolotarov
b90e11264e cql_transport::cql_server: fix the distributed prepared statements cache population
Don't std::move() the "query" string inside the parallel_for_each() lambda.
parallel_for_each() invokes the given callback object for each element of the
range, so the first invocation of a lambda that std::move()s "query" destroys
it for all subsequent calls.
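
The pitfall can be sketched generically (a plain loop standing in for seastar's parallel_for_each; names are illustrative): a callback that runs once per element must copy, not std::move, a capture it needs on every call.

```cpp
#include <string>
#include <vector>

// The callback runs once per element; copying `query` on each call keeps
// the capture intact. A `std::move(query)` inside the body would leave
// the capture moved-from after the first invocation, breaking later calls.
std::vector<std::string> run_per_element(const std::vector<int>& range,
                                         std::string query) {
    std::vector<std::string> seen;
    auto cb = [&seen, query](int) {
        seen.push_back(query);  // copy; do NOT std::move the capture
    };
    for (int element : range) {
        cb(element);
    }
    return seen;
}
```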

Fixes #2998

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1511225744-1159-1-git-send-email-vladz@scylladb.com>
(cherry picked from commit 941aa20252)
2017-11-21 10:53:50 +02:00
Shlomi Livne
84b2bff0a6 release: prepare for 2.1.rc0
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-11-19 18:53:20 +02:00
164 changed files with 6729 additions and 7022 deletions

2
.gitmodules vendored
View File

@@ -1,6 +1,6 @@
[submodule "seastar"]
path = seastar
-url = ../seastar
+url = ../scylla-seastar
ignore = dirty
[submodule "swagger-ui"]
path = swagger-ui

View File

@@ -1,6 +1,6 @@
#!/bin/sh
-VERSION=666.development
+VERSION=2.1.5
if test -f version
then

View File

@@ -89,10 +89,6 @@ class permissions_cache final {
public:
explicit permissions_cache(const permissions_cache_config&, service&, logging::logger&);
future<> start() {
return make_ready_future<>();
}
future <> stop() {
return _cache.stop();
}

View File

@@ -24,7 +24,6 @@
#include <map>
#include <seastar/core/future-util.hh>
#include <seastar/core/sharded.hh>
#include <seastar/core/shared_ptr.hh>
#include "auth/allow_all_authenticator.hh"
@@ -86,8 +85,6 @@ private:
void on_drop_view(const sstring& ks_name, const sstring& view_name) override {}
};
static sharded<permissions_cache> sharded_permissions_cache{};
static db::consistency_level consistency_for_user(const sstring& name) {
if (name == meta::DEFAULT_SUPERUSER_NAME) {
return db::consistency_level::QUORUM;
@@ -130,7 +127,8 @@ service::service(
::service::migration_manager& mm,
std::unique_ptr<authorizer> a,
std::unique_ptr<authenticator> b)
: _cache_config(std::move(c))
: _permissions_cache_config(std::move(c))
, _permissions_cache(nullptr)
, _qp(qp)
, _migration_manager(mm)
, _authorizer(std::move(a))
@@ -240,10 +238,12 @@ future<> service::start() {
return make_ready_future<>();
}).then([this] {
return when_all_succeed(_authorizer->start(), _authenticator->start());
}).then([this] {
_permissions_cache = std::make_unique<permissions_cache>(_permissions_cache_config, *this, log);
}).then([this] {
return once_among_shards([this] {
_migration_manager.register_listener(_migration_listener.get());
return sharded_permissions_cache.start(std::ref(_cache_config), std::ref(*this), std::ref(log));
return make_ready_future<>();
});
});
}
@@ -251,7 +251,9 @@ future<> service::start() {
future<> service::stop() {
return once_among_shards([this] {
_delayed.cancel_all();
return sharded_permissions_cache.stop();
return make_ready_future<>();
}).then([this] {
return _permissions_cache->stop();
}).then([this] {
return when_all_succeed(_authorizer->stop(), _authenticator->stop());
});
@@ -335,7 +337,7 @@ future<> service::delete_user(const sstring& name) {
}
future<permission_set> service::get_permissions(::shared_ptr<authenticated_user> u, data_resource r) const {
return sharded_permissions_cache.local().get(std::move(u), std::move(r));
return _permissions_cache->get(std::move(u), std::move(r));
}
//

View File

@@ -60,7 +60,8 @@ struct service_config final {
};
class service final {
permissions_cache_config _cache_config;
permissions_cache_config _permissions_cache_config;
std::unique_ptr<permissions_cache> _permissions_cache;
cql3::query_processor& _qp;

View File

@@ -141,7 +141,16 @@ public:
return _complete || _sasl->is_complete();
}
future<::shared_ptr<authenticated_user>> get_authenticated_user() const {
return _sasl->get_authenticated_user();
return futurize_apply([this] {
return _sasl->get_authenticated_user().handle_exception([](auto ep) {
try {
std::rethrow_exception(ep);
} catch (exceptions::authentication_exception&) {
// return anon user
return make_ready_future<::shared_ptr<authenticated_user>>(::make_shared<authenticated_user>());
}
});
});
}
private:
::shared_ptr<sasl_challenge> _sasl;

View File

@@ -31,46 +31,13 @@
#include "partition_snapshot_reader.hh"
#include "partition_snapshot_row_cursor.hh"
#include "read_context.hh"
#include "flat_mutation_reader.hh"
namespace cache {
extern logging::logger clogger;
class lsa_manager {
row_cache& _cache;
public:
lsa_manager(row_cache& cache) : _cache(cache) { }
template<typename Func>
decltype(auto) run_in_read_section(const Func& func) {
return _cache._read_section(_cache._tracker.region(), [&func] () {
return with_linearized_managed_bytes([&func] () {
return func();
});
});
}
template<typename Func>
decltype(auto) run_in_update_section(const Func& func) {
return _cache._update_section(_cache._tracker.region(), [&func] () {
return with_linearized_managed_bytes([&func] () {
return func();
});
});
}
template<typename Func>
void run_in_update_section_with_allocator(Func&& func) {
return _cache._update_section(_cache._tracker.region(), [this, &func] () {
return with_linearized_managed_bytes([this, &func] () {
return with_allocator(_cache._tracker.region().allocator(), [this, &func] () mutable {
return func();
});
});
});
}
logalloc::region& region() { return _cache._tracker.region(); }
logalloc::allocating_section& read_section() { return _cache._read_section; }
};
class cache_streamed_mutation final : public streamed_mutation::impl {
class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
enum class state {
before_static_row,
@@ -154,14 +121,19 @@ class cache_streamed_mutation final : public streamed_mutation::impl {
void maybe_add_to_cache(const range_tombstone& rt);
void maybe_add_to_cache(const static_row& sr);
void maybe_set_static_row_continuous();
void finish_reader() {
push_mutation_fragment(partition_end());
_end_of_stream = true;
_state = state::end_of_stream;
}
public:
cache_streamed_mutation(schema_ptr s,
dht::decorated_key dk,
query::clustering_key_filter_ranges&& crr,
lw_shared_ptr<read_context> ctx,
lw_shared_ptr<partition_snapshot> snp,
row_cache& cache)
: streamed_mutation::impl(std::move(s), std::move(dk), snp->partition_tombstone())
cache_flat_mutation_reader(schema_ptr s,
dht::decorated_key dk,
query::clustering_key_filter_ranges&& crr,
lw_shared_ptr<read_context> ctx,
lw_shared_ptr<partition_snapshot> snp,
row_cache& cache)
: flat_mutation_reader::impl(std::move(s))
, _snp(std::move(snp))
, _position_cmp(*_schema)
, _ck_ranges(std::move(crr))
@@ -175,17 +147,32 @@ public:
, _next_row(*_schema, *_snp)
{
clogger.trace("csm {}: table={}.{}", this, _schema->ks_name(), _schema->cf_name());
push_mutation_fragment(partition_start(std::move(dk), _snp->partition_tombstone()));
}
cache_streamed_mutation(const cache_streamed_mutation&) = delete;
cache_streamed_mutation(cache_streamed_mutation&&) = delete;
cache_flat_mutation_reader(const cache_flat_mutation_reader&) = delete;
cache_flat_mutation_reader(cache_flat_mutation_reader&&) = delete;
virtual future<> fill_buffer() override;
virtual ~cache_streamed_mutation() {
virtual ~cache_flat_mutation_reader() {
maybe_merge_versions(_snp, _lsa_manager.region(), _lsa_manager.read_section());
}
virtual void next_partition() override {
clear_buffer_to_next_partition();
if (is_buffer_empty()) {
_end_of_stream = true;
}
}
virtual future<> fast_forward_to(const dht::partition_range&) override {
clear_buffer();
_end_of_stream = true;
return make_ready_future<>();
}
virtual future<> fast_forward_to(position_range pr) override {
throw std::bad_function_call();
}
};
inline
future<> cache_streamed_mutation::process_static_row() {
future<> cache_flat_mutation_reader::process_static_row() {
if (_snp->version()->partition().static_row_continuous()) {
_read_context->cache().on_row_hit();
row sr = _lsa_manager.run_in_read_section([this] {
@@ -209,12 +196,11 @@ future<> cache_streamed_mutation::process_static_row() {
}
inline
future<> cache_streamed_mutation::fill_buffer() {
future<> cache_flat_mutation_reader::fill_buffer() {
if (_state == state::before_static_row) {
auto after_static_row = [this] {
if (_ck_ranges_curr == _ck_ranges_end) {
_end_of_stream = true;
_state = state::end_of_stream;
finish_reader();
return make_ready_future<>();
}
_state = state::reading_from_cache;
@@ -236,7 +222,7 @@ future<> cache_streamed_mutation::fill_buffer() {
}
inline
future<> cache_streamed_mutation::do_fill_buffer() {
future<> cache_flat_mutation_reader::do_fill_buffer() {
if (_state == state::move_to_underlying) {
_state = state::reading_from_underlying;
auto end = _next_row_in_range ? position_in_partition(_next_row.position())
@@ -277,8 +263,8 @@ future<> cache_streamed_mutation::do_fill_buffer() {
}
inline
future<> cache_streamed_mutation::read_from_underlying() {
return consume_mutation_fragments_until(_read_context->get_streamed_mutation(),
future<> cache_flat_mutation_reader::read_from_underlying() {
return consume_mutation_fragments_until(_read_context->underlying().underlying(),
[this] { return _state != state::reading_from_underlying || is_buffer_full(); },
[this] (mutation_fragment mf) {
_read_context->cache().on_row_miss();
@@ -361,7 +347,7 @@ future<> cache_streamed_mutation::read_from_underlying() {
}
inline
void cache_streamed_mutation::maybe_update_continuity() {
void cache_flat_mutation_reader::maybe_update_continuity() {
if (can_populate() && (!_ck_ranges_curr->start() || _last_row.refresh(*_snp))) {
if (_next_row.is_in_latest_version()) {
clogger.trace("csm {}: mark {} continuous", this, _next_row.get_iterator_in_latest_version()->position());
@@ -387,7 +373,7 @@ void cache_streamed_mutation::maybe_update_continuity() {
}
inline
void cache_streamed_mutation::maybe_add_to_cache(const mutation_fragment& mf) {
void cache_flat_mutation_reader::maybe_add_to_cache(const mutation_fragment& mf) {
if (mf.is_range_tombstone()) {
maybe_add_to_cache(mf.as_range_tombstone());
} else {
@@ -398,7 +384,7 @@ void cache_streamed_mutation::maybe_add_to_cache(const mutation_fragment& mf) {
}
inline
void cache_streamed_mutation::maybe_add_to_cache(const clustering_row& cr) {
void cache_flat_mutation_reader::maybe_add_to_cache(const clustering_row& cr) {
if (!can_populate()) {
_last_row = nullptr;
_read_context->cache().on_mispopulate();
@@ -435,18 +421,18 @@ void cache_streamed_mutation::maybe_add_to_cache(const clustering_row& cr) {
}
inline
bool cache_streamed_mutation::after_current_range(position_in_partition_view p) {
bool cache_flat_mutation_reader::after_current_range(position_in_partition_view p) {
return _position_cmp(p, _upper_bound) >= 0;
}
inline
void cache_streamed_mutation::start_reading_from_underlying() {
void cache_flat_mutation_reader::start_reading_from_underlying() {
clogger.trace("csm {}: start_reading_from_underlying(), range=[{}, {})", this, _lower_bound, _next_row_in_range ? _next_row.position() : _upper_bound);
_state = state::move_to_underlying;
}
inline
void cache_streamed_mutation::copy_from_cache_to_buffer() {
void cache_flat_mutation_reader::copy_from_cache_to_buffer() {
clogger.trace("csm {}: copy_from_cache, next={}, next_row_in_range={}", this, _next_row.position(), _next_row_in_range);
position_in_partition_view next_lower_bound = _next_row.dummy() ? _next_row.position() : position_in_partition_view::after_key(_next_row.key());
for (auto&& rts : _snp->range_tombstones(*_schema, _lower_bound, _next_row_in_range ? next_lower_bound : _upper_bound)) {
@@ -465,15 +451,14 @@ void cache_streamed_mutation::copy_from_cache_to_buffer() {
}
inline
void cache_streamed_mutation::move_to_end() {
void cache_flat_mutation_reader::move_to_end() {
drain_tombstones();
_end_of_stream = true;
_state = state::end_of_stream;
finish_reader();
clogger.trace("csm {}: eos", this);
}
inline
void cache_streamed_mutation::move_to_next_range() {
void cache_flat_mutation_reader::move_to_next_range() {
auto next_it = std::next(_ck_ranges_curr);
if (next_it == _ck_ranges_end) {
move_to_end();
@@ -484,7 +469,7 @@ void cache_streamed_mutation::move_to_next_range() {
}
inline
void cache_streamed_mutation::move_to_range(query::clustering_row_ranges::const_iterator next_it) {
void cache_flat_mutation_reader::move_to_range(query::clustering_row_ranges::const_iterator next_it) {
auto lb = position_in_partition::for_range_start(*next_it);
auto ub = position_in_partition_view::for_range_end(*next_it);
_last_row = nullptr;
@@ -520,7 +505,7 @@ void cache_streamed_mutation::move_to_range(query::clustering_row_ranges::const_
// _next_row must be inside the range.
inline
void cache_streamed_mutation::move_to_next_entry() {
void cache_flat_mutation_reader::move_to_next_entry() {
clogger.trace("csm {}: move_to_next_entry(), curr={}", this, _next_row.position());
if (no_clustering_row_between(*_schema, _next_row.position(), _upper_bound)) {
move_to_next_range();
@@ -538,7 +523,7 @@ void cache_streamed_mutation::move_to_next_entry() {
}
inline
void cache_streamed_mutation::drain_tombstones(position_in_partition_view pos) {
void cache_flat_mutation_reader::drain_tombstones(position_in_partition_view pos) {
while (true) {
reserve_one();
auto mfo = _tombstones.get_next(pos);
@@ -550,7 +535,7 @@ void cache_streamed_mutation::drain_tombstones(position_in_partition_view pos) {
}
inline
void cache_streamed_mutation::drain_tombstones() {
void cache_flat_mutation_reader::drain_tombstones() {
while (true) {
reserve_one();
auto mfo = _tombstones.get_next();
@@ -562,7 +547,7 @@ void cache_streamed_mutation::drain_tombstones() {
}
inline
void cache_streamed_mutation::add_to_buffer(mutation_fragment&& mf) {
void cache_flat_mutation_reader::add_to_buffer(mutation_fragment&& mf) {
clogger.trace("csm {}: add_to_buffer({})", this, mf);
if (mf.is_clustering_row()) {
add_clustering_row_to_buffer(std::move(mf));
@@ -573,7 +558,7 @@ void cache_streamed_mutation::add_to_buffer(mutation_fragment&& mf) {
}
inline
void cache_streamed_mutation::add_to_buffer(const partition_snapshot_row_cursor& row) {
void cache_flat_mutation_reader::add_to_buffer(const partition_snapshot_row_cursor& row) {
if (!row.dummy()) {
_read_context->cache().on_row_hit();
add_clustering_row_to_buffer(row.row());
@@ -584,7 +569,7 @@ void cache_streamed_mutation::add_to_buffer(const partition_snapshot_row_cursor&
// (1) no fragment with position >= _lower_bound was pushed yet
// (2) If _lower_bound > mf.position(), mf was emitted
inline
void cache_streamed_mutation::add_clustering_row_to_buffer(mutation_fragment&& mf) {
void cache_flat_mutation_reader::add_clustering_row_to_buffer(mutation_fragment&& mf) {
clogger.trace("csm {}: add_clustering_row_to_buffer({})", this, mf);
auto& row = mf.as_clustering_row();
auto key = row.key();
@@ -600,7 +585,7 @@ void cache_streamed_mutation::add_clustering_row_to_buffer(mutation_fragment&& m
}
inline
void cache_streamed_mutation::add_to_buffer(range_tombstone&& rt) {
void cache_flat_mutation_reader::add_to_buffer(range_tombstone&& rt) {
clogger.trace("csm {}: add_to_buffer({})", this, rt);
// This guarantees that rt starts after any emitted clustering_row
if (!rt.trim_front(*_schema, _lower_bound)) {
@@ -612,7 +597,7 @@ void cache_streamed_mutation::add_to_buffer(range_tombstone&& rt) {
}
inline
void cache_streamed_mutation::maybe_add_to_cache(const range_tombstone& rt) {
void cache_flat_mutation_reader::maybe_add_to_cache(const range_tombstone& rt) {
if (can_populate()) {
clogger.trace("csm {}: maybe_add_to_cache({})", this, rt);
_lsa_manager.run_in_update_section_with_allocator([&] {
@@ -624,7 +609,7 @@ void cache_streamed_mutation::maybe_add_to_cache(const range_tombstone& rt) {
}
inline
void cache_streamed_mutation::maybe_add_to_cache(const static_row& sr) {
void cache_flat_mutation_reader::maybe_add_to_cache(const static_row& sr) {
if (can_populate()) {
clogger.trace("csm {}: populate({})", this, sr);
_read_context->cache().on_row_insert();
@@ -637,7 +622,7 @@ void cache_streamed_mutation::maybe_add_to_cache(const static_row& sr) {
}
inline
void cache_streamed_mutation::maybe_set_static_row_continuous() {
void cache_flat_mutation_reader::maybe_set_static_row_continuous() {
if (can_populate()) {
clogger.trace("csm {}: set static row continuous", this);
_snp->version()->partition().set_static_row_continuous(true);
@@ -647,19 +632,19 @@ void cache_streamed_mutation::maybe_set_static_row_continuous() {
}
inline
bool cache_streamed_mutation::can_populate() const {
bool cache_flat_mutation_reader::can_populate() const {
return _snp->at_latest_version() && _read_context->cache().phase_of(_read_context->key()) == _read_context->phase();
}
} // namespace cache
inline streamed_mutation make_cache_streamed_mutation(schema_ptr s,
dht::decorated_key dk,
query::clustering_key_filter_ranges crr,
row_cache& cache,
lw_shared_ptr<cache::read_context> ctx,
lw_shared_ptr<partition_snapshot> snp)
inline flat_mutation_reader make_cache_flat_mutation_reader(schema_ptr s,
dht::decorated_key dk,
query::clustering_key_filter_ranges crr,
row_cache& cache,
lw_shared_ptr<cache::read_context> ctx,
lw_shared_ptr<partition_snapshot> snp)
{
return make_streamed_mutation<cache::cache_streamed_mutation>(
return make_flat_mutation_reader<cache::cache_flat_mutation_reader>(
std::move(s), std::move(dk), std::move(crr), std::move(ctx), std::move(snp), cache);
}

View File

@@ -42,17 +42,6 @@ std::ostream& operator<<(std::ostream& out, const bound_kind k);
bound_kind invert_kind(bound_kind k);
int32_t weight(bound_kind k);
static inline bound_kind flip_bound_kind(bound_kind bk)
{
switch (bk) {
case bound_kind::excl_end: return bound_kind::excl_start;
case bound_kind::incl_end: return bound_kind::incl_start;
case bound_kind::excl_start: return bound_kind::excl_end;
case bound_kind::incl_start: return bound_kind::incl_end;
}
abort();
}
class bound_view {
public:
const static thread_local clustering_key empty_prefix;

View File

@@ -70,7 +70,7 @@ public:
{
if (!with_static_row) {
if (_current == _end) {
_current_start = _current_end = position_in_partition_view::after_all_clustered_rows();
_current_start = position_in_partition_view::before_all_clustered_rows();
} else {
_current_start = position_in_partition_view::for_range_start(*_current);
_current_end = position_in_partition_view::for_range_end(*_current);

View File

@@ -241,7 +241,7 @@ public:
using component_view = std::pair<bytes_view, eoc>;
private:
template<typename Value, typename = std::enable_if_t<!std::is_same<const data_value, std::decay_t<Value>>::value>>
static size_t size(Value& val) {
static size_t size(const Value& val) {
return val.size();
}
static size_t size(const data_value& val) {
@@ -445,17 +445,16 @@ public:
return _is_compound;
}
// The following factory functions assume this composite is a compound value.
template <typename ClusteringElement>
static composite from_clustering_element(const schema& s, const ClusteringElement& ce) {
return serialize_value(ce.components(s));
return serialize_value(ce.components(s), s.is_compound());
}
static composite from_exploded(const std::vector<bytes_view>& v, eoc marker = eoc::none) {
static composite from_exploded(const std::vector<bytes_view>& v, bool is_compound, eoc marker = eoc::none) {
if (v.size() == 0) {
return composite(bytes(size_t(1), bytes::value_type(marker)));
return composite(bytes(size_t(1), bytes::value_type(marker)), is_compound);
}
return serialize_value(v, true, marker);
return serialize_value(v, is_compound, marker);
}
static composite static_prefix(const schema& s) {

View File

@@ -14,7 +14,7 @@
# one logical cluster from joining another.
# It is recommended to change the default value when creating a new cluster.
# You can NOT modify this value for an existing cluster
#cluster_name: 'ScyllaDB Cluster'
#cluster_name: 'Test Cluster'
# This defines the number of tokens randomly assigned to this node on the ring
# The more tokens, relative to other nodes, the larger the proportion of data
@@ -87,6 +87,13 @@ listen_address: localhost
# Leaving this blank will set it to the same value as listen_address
# broadcast_address: 1.2.3.4
# When using multiple physical network interfaces, set this to true to listen on broadcast_address
# in addition to the listen_address, allowing nodes to communicate in both interfaces.
# Ignore this property if the network configuration automatically routes between the public and private networks such as EC2.
#
# listen_on_broadcast_address: false
# port for the CQL native transport to listen for clients on
# For security reasons, you should not expose this port to the internet. Firewall it if needed.
native_transport_port: 9042

View File

@@ -189,7 +189,7 @@ scylla_tests = [
'tests/perf/perf_simple_query',
'tests/perf/perf_fast_forward',
'tests/perf/perf_cache_eviction',
'tests/cache_streamed_mutation_test',
'tests/cache_flat_mutation_reader_test',
'tests/row_cache_stress_test',
'tests/memory_footprint',
'tests/perf/perf_sstable',
@@ -703,6 +703,7 @@ warnings = [
'-Wno-misleading-indentation',
'-Wno-overflow',
'-Wno-noexcept-type',
'-Wno-nonnull-compare'
]
warnings = [w

View File

@@ -606,6 +606,7 @@ void query_processor::migration_subscriber::on_drop_aggregate(const sstring& ks_
}
void query_processor::migration_subscriber::on_drop_view(const sstring& ks_name, const sstring& view_name) {
remove_invalid_prepared_statements(ks_name, view_name);
}
void query_processor::migration_subscriber::remove_invalid_prepared_statements(

View File

@@ -101,6 +101,10 @@ public:
return boost::algorithm::all_of(_restrictions->restrictions(), [b] (auto&& r) { return r.second->has_bound(b); });
}
virtual bool is_inclusive(statements::bound b) const override {
return boost::algorithm::all_of(_restrictions->restrictions(), [b] (auto&& r) { return r.second->is_inclusive(b); });
}
virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override {
return _restrictions->uses_function(ks_name, function_name);
}

View File

@@ -78,7 +78,7 @@ cql3::statements::create_user_statement::execute(distributed<service::storage_pr
throw exceptions::invalid_request_exception(sprint("User %s already exists", _username));
}
if (exists && _if_not_exists) {
make_ready_future<::shared_ptr<cql_transport::messages::result_message>>();
return make_ready_future<::shared_ptr<cql_transport::messages::result_message>>();
}
return auth_service.underlying_authenticator().create(_username, _opts->options()).then([this, &auth_service] {
return auth_service.insert_user(_username, _superuser).then([] {

View File

@@ -106,6 +106,9 @@ delete_statement::prepare_internal(database& db, schema_ptr schema, shared_ptr<v
|| !stmt->restrictions()->get_clustering_columns_restrictions()->has_bound(bound::END)) {
throw exceptions::invalid_request_exception("A range deletion operation needs to specify both bounds");
}
if (!schema->is_compound() && stmt->restrictions()->get_clustering_columns_restrictions()->is_slice()) {
throw exceptions::invalid_request_exception("Range deletions on \"compact storage\" schemas are not supported");
}
return stmt;
}

View File

@@ -489,8 +489,7 @@ indexed_table_select_statement::do_execute(distributed<service::storage_proxy>&
++_stats.reads;
assert(_restrictions->uses_secondary_indexing());
return find_index_partition_ranges(proxy, state, options).then([&, this] (dht::partition_range_vector partition_ranges) {
return find_index_partition_ranges(proxy, state, options).then([limit, now, &state, &options, &proxy, this] (dht::partition_range_vector partition_ranges) {
auto command = ::make_lw_shared<query::read_command>(
_schema->id(),
_schema->version(),

View File

@@ -236,8 +236,8 @@ logalloc::occupancy_stats column_family::occupancy() const {
}
static
bool belongs_to_current_shard(const streamed_mutation& m) {
return dht::shard_of(m.decorated_key().token()) == engine().cpu_id();
bool belongs_to_current_shard(const dht::decorated_key& dk) {
return dht::shard_of(dk.token()) == engine().cpu_id();
}
// Stores ranges for all components of the same clustering key, index 0 referring to component
@@ -376,7 +376,6 @@ filter_sstable_for_reader(std::vector<sstables::shared_sstable>&& sstables, colu
// selects readers on-demand as the read progresses through the token
// range.
class incremental_reader_selector : public reader_selector {
schema_ptr _s;
const dht::partition_range* _pr;
lw_shared_ptr<sstables::sstable_set> _sstables;
const io_priority_class& _pc;
@@ -387,14 +386,11 @@ class incremental_reader_selector : public reader_selector {
mutation_reader::forwarding _fwd_mr;
sstables::sstable_set::incremental_selector _selector;
std::unordered_set<sstables::shared_sstable> _read_sstables;
sstable_reader_factory_type _fn;
mutation_reader create_reader(sstables::shared_sstable sst) {
flat_mutation_reader create_reader(sstables::shared_sstable sst) {
tracing::trace(_trace_state, "Reading partition range {} from sstable {}", *_pr, seastar::value_of([&sst] { return sst->get_filename(); }));
mutation_reader reader = sst->read_range_rows(_s, *_pr, _slice, _pc, _resource_tracker, _fwd, _fwd_mr);
if (sst->is_shared()) {
reader = make_filtering_reader(std::move(reader), belongs_to_current_shard);
}
return std::move(reader);
return _fn(sst, *_pr);
}
public:
@@ -406,8 +402,9 @@ public:
reader_resource_tracker resource_tracker,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr)
: _s(s)
mutation_reader::forwarding fwd_mr,
sstable_reader_factory_type fn)
: reader_selector(s, pr.start() ? pr.start()->value() : dht::ring_position::min())
, _pr(&pr)
, _sstables(std::move(sstables))
, _pc(pc)
@@ -416,8 +413,8 @@ public:
, _trace_state(std::move(trace_state))
, _fwd(fwd)
, _fwd_mr(fwd_mr)
, _selector(_sstables->make_incremental_selector()) {
_selector_position = _pr->start() ? _pr->start()->value().token() : dht::minimum_token();
, _selector(_sstables->make_incremental_selector())
, _fn(std::move(fn)) {
dblog.trace("incremental_reader_selector {}: created for range: {} with {} sstables",
this,
@@ -431,40 +428,46 @@ public:
incremental_reader_selector(incremental_reader_selector&&) = delete;
incremental_reader_selector& operator=(incremental_reader_selector&&) = delete;
virtual std::vector<mutation_reader> create_new_readers(const dht::token* const t) override {
virtual std::vector<flat_mutation_reader> create_new_readers(const dht::token* const t) override {
dblog.trace("incremental_reader_selector {}: {}({})", this, __FUNCTION__, seastar::lazy_deref(t));
const auto& position = (t ? *t : _selector_position);
const auto& position = (t ? *t : _selector_position.token());
// we only pass _selector_position's token to _selector::select() when T is nullptr
// because it means gap between sstables, and the lower bound of the first interval
// after the gap is guaranteed to be inclusive.
auto selection = _selector.select(position);
if (selection.sstables.empty()) {
// For the lower bound of the token range the _selector
// might not return any sstables, in this case try again
// with next_token unless it's maximum token.
if (!selection.next_token.is_maximum()
if (!selection.next_position.is_max()
&& position == (_pr->start() ? _pr->start()->value().token() : dht::minimum_token())) {
dblog.trace("incremental_reader_selector {}: no sstables intersect with the lower bound, retrying", this);
_selector_position = std::move(selection.next_token);
_selector_position = std::move(selection.next_position);
return create_new_readers(nullptr);
}
_selector_position = dht::maximum_token();
_selector_position = dht::ring_position::max();
return {};
}
_selector_position = std::move(selection.next_token);
_selector_position = std::move(selection.next_position);
dblog.trace("incremental_reader_selector {}: {} new sstables to consider, advancing selector to {}", this, selection.sstables.size(), _selector_position);
return boost::copy_range<std::vector<mutation_reader>>(selection.sstables
return boost::copy_range<std::vector<flat_mutation_reader>>(selection.sstables
| boost::adaptors::filtered([this] (auto& sst) { return _read_sstables.emplace(sst).second; })
| boost::adaptors::transformed([this] (auto& sst) { return this->create_reader(sst); }));
| boost::adaptors::transformed([this] (auto& sst) {
return this->create_reader(sst);
}));
}
virtual std::vector<mutation_reader> fast_forward_to(const dht::partition_range& pr) override {
virtual std::vector<flat_mutation_reader> fast_forward_to(const dht::partition_range& pr) override {
_pr = &pr;
if (_pr->start()->value().token() >= _selector_position) {
dht::ring_position_comparator cmp(*_s);
if (cmp(dht::ring_position_view::for_range_start(*_pr), _selector_position) >= 0) {
return create_new_readers(&_pr->start()->value().token());
}
@@ -472,71 +475,35 @@ public:
}
};
class single_key_sstable_reader final : public mutation_reader::impl {
column_family* _cf;
schema_ptr _schema;
const dht::partition_range& _pr;
sstables::key _key;
std::vector<streamed_mutation> _mutations;
bool _done = false;
lw_shared_ptr<sstables::sstable_set> _sstables;
utils::estimated_histogram& _sstable_histogram;
// Use a pointer instead of copying, so we don't need to regenerate the reader if
// the priority changes.
const io_priority_class& _pc;
const query::partition_slice& _slice;
reader_resource_tracker _resource_tracker;
tracing::trace_state_ptr _trace_state;
streamed_mutation::forwarding _fwd;
public:
single_key_sstable_reader(column_family* cf,
schema_ptr schema,
lw_shared_ptr<sstables::sstable_set> sstables,
utils::estimated_histogram& sstable_histogram,
const dht::partition_range& pr, // must be singular
const query::partition_slice& slice,
const io_priority_class& pc,
reader_resource_tracker resource_tracker,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd)
: _cf(cf)
, _schema(std::move(schema))
, _pr(pr)
, _key(sstables::key::from_partition_key(*_schema, *pr.start()->value().key()))
, _sstables(std::move(sstables))
, _sstable_histogram(sstable_histogram)
, _pc(pc)
, _slice(slice)
, _resource_tracker(std::move(resource_tracker))
, _trace_state(std::move(trace_state))
, _fwd(fwd)
{ }
virtual future<streamed_mutation_opt> operator()() override {
if (_done) {
return make_ready_future<streamed_mutation_opt>();
}
auto candidates = filter_sstable_for_reader(_sstables->select(_pr), *_cf, _schema, _key, _slice);
return parallel_for_each(std::move(candidates),
[this](const sstables::shared_sstable& sstable) {
tracing::trace(_trace_state, "Reading key {} from sstable {}", _pr, seastar::value_of([&sstable] { return sstable->get_filename(); }));
return sstable->read_row(_schema, _pr.start()->value(), _slice, _pc, _resource_tracker, _fwd).then([this](auto smo) {
if (smo) {
_mutations.emplace_back(std::move(*smo));
}
});
}).then([this] () -> streamed_mutation_opt {
_done = true;
if (_mutations.empty()) {
return { };
}
_sstable_histogram.add(_mutations.size());
return merge_mutations(std::move(_mutations));
});
static flat_mutation_reader
create_single_key_sstable_reader(column_family* cf,
schema_ptr schema,
lw_shared_ptr<sstables::sstable_set> sstables,
utils::estimated_histogram& sstable_histogram,
const dht::partition_range& pr, // must be singular
const query::partition_slice& slice,
const io_priority_class& pc,
reader_resource_tracker resource_tracker,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr)
{
auto key = sstables::key::from_partition_key(*schema, *pr.start()->value().key());
auto readers = boost::copy_range<std::vector<flat_mutation_reader>>(
filter_sstable_for_reader(sstables->select(pr), *cf, schema, key, slice)
| boost::adaptors::transformed([&] (const sstables::shared_sstable& sstable) {
tracing::trace(trace_state, "Reading key {} from sstable {}", pr, seastar::value_of([&sstable] { return sstable->get_filename(); }));
return sstable->read_row_flat(schema, pr.start()->value(), slice, pc, resource_tracker, fwd);
})
);
if (readers.empty()) {
return make_empty_flat_reader(schema);
}
};
sstable_histogram.add(readers.size());
return make_combined_reader(schema, std::move(readers), fwd, fwd_mr);
}
mutation_reader
flat_mutation_reader
column_family::make_sstable_reader(schema_ptr s,
lw_shared_ptr<sstables::sstable_set> sstables,
const dht::partition_range& pr,
@@ -545,9 +512,9 @@ column_family::make_sstable_reader(schema_ptr s,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) const {
auto& config = service::get_local_streaming_read_priority().id() == pc.id()
? _config.streaming_read_concurrency_config
: _config.read_concurrency_config;
auto* semaphore = service::get_local_streaming_read_priority().id() == pc.id()
? _config.streaming_read_concurrency_semaphore
: _config.read_concurrency_semaphore;
// CAVEAT: if make_sstable_reader() is called on a single partition
// we want to optimize and read exactly this partition. As a
@@ -556,47 +523,45 @@ column_family::make_sstable_reader(schema_ptr s,
if (pr.is_singular() && pr.start()->value().has_key()) {
const dht::ring_position& pos = pr.start()->value();
if (dht::shard_of(pos.token()) != engine().cpu_id()) {
return make_empty_reader(); // range doesn't belong to this shard
return make_empty_flat_reader(s); // range doesn't belong to this shard
}
if (config.resources_sem) {
auto ms = mutation_source([&config, sstables=std::move(sstables), this] (
if (semaphore) {
auto ms = mutation_source([semaphore, this, sstables=std::move(sstables)] (
schema_ptr s,
const dht::partition_range& pr,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
-mutation_reader::forwarding fwd_mr) {
-return make_mutation_reader<single_key_sstable_reader>(const_cast<column_family*>(this), std::move(s), std::move(sstables),
-_stats.estimated_sstable_per_read, pr, slice, pc, reader_resource_tracker(config.resources_sem), std::move(trace_state), fwd);
+mutation_reader::forwarding fwd_mr,
+reader_resource_tracker tracker) {
+return create_single_key_sstable_reader(const_cast<column_family*>(this), std::move(s), std::move(sstables),
+_stats.estimated_sstable_per_read, pr, slice, pc, tracker, std::move(trace_state), fwd, fwd_mr);
});
-return make_restricted_reader(config, std::move(ms), std::move(s), pr, slice, pc, std::move(trace_state), fwd, fwd_mr);
+return make_restricted_flat_reader(*semaphore, std::move(ms), std::move(s), pr, slice, pc, std::move(trace_state), fwd, fwd_mr);
} else {
-return make_mutation_reader<single_key_sstable_reader>(const_cast<column_family*>(this), std::move(s), std::move(sstables),
-_stats.estimated_sstable_per_read, pr, slice, pc, no_resource_tracking(), std::move(trace_state), fwd);
+return create_single_key_sstable_reader(const_cast<column_family*>(this), std::move(s), std::move(sstables),
+_stats.estimated_sstable_per_read, pr, slice, pc, no_resource_tracking(), std::move(trace_state), fwd, fwd_mr);
}
} else {
-if (config.resources_sem) {
-auto ms = mutation_source([&config, sstables=std::move(sstables)] (
+if (semaphore) {
+auto ms = mutation_source([semaphore, sstables=std::move(sstables)] (
schema_ptr s,
const dht::partition_range& pr,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
-mutation_reader::forwarding fwd_mr) {
-return make_mutation_reader<combined_mutation_reader>(
-std::make_unique<incremental_reader_selector>(std::move(s), std::move(sstables), pr, slice, pc,
-reader_resource_tracker(config.resources_sem), std::move(trace_state), fwd, fwd_mr),
-fwd_mr);
+mutation_reader::forwarding fwd_mr,
+reader_resource_tracker tracker) {
+return make_local_shard_sstable_reader(std::move(s), std::move(sstables), pr, slice, pc,
+tracker, std::move(trace_state), fwd, fwd_mr);
});
-return make_restricted_reader(config, std::move(ms), std::move(s), pr, slice, pc, std::move(trace_state), fwd, fwd_mr);
+return make_restricted_flat_reader(*semaphore, std::move(ms), std::move(s), pr, slice, pc, std::move(trace_state), fwd, fwd_mr);
} else {
-return make_mutation_reader<combined_mutation_reader>(
-std::make_unique<incremental_reader_selector>(std::move(s), std::move(sstables), pr, slice, pc,
-no_resource_tracking(), std::move(trace_state), fwd, fwd_mr),
-fwd_mr);
+return make_local_shard_sstable_reader(std::move(s), std::move(sstables), pr, slice, pc,
+no_resource_tracking(), std::move(trace_state), fwd, fwd_mr);
}
}
}
@@ -605,10 +570,8 @@ column_family::make_sstable_reader(schema_ptr s,
future<column_family::const_mutation_partition_ptr>
column_family::find_partition(schema_ptr s, const dht::decorated_key& key) const {
return do_with(dht::partition_range::make_singular(key), [s = std::move(s), this] (auto& range) {
-return do_with(this->make_reader(s, range), [] (mutation_reader& reader) {
-return reader().then([] (auto sm) {
-return mutation_from_streamed_mutation(std::move(sm));
-}).then([] (mutation_opt&& mo) -> std::unique_ptr<const mutation_partition> {
+return do_with(this->make_reader(s, range), [s] (flat_mutation_reader& reader) {
+return read_mutation_from_flat_mutation_reader(reader).then([] (mutation_opt&& mo) -> std::unique_ptr<const mutation_partition> {
if (!mo) {
return {};
}
@@ -639,7 +602,7 @@ column_family::find_row(schema_ptr s, const dht::decorated_key& partition_key, c
});
}
-mutation_reader
+flat_mutation_reader
column_family::make_reader(schema_ptr s,
const dht::partition_range& range,
const query::partition_slice& slice,
@@ -648,10 +611,10 @@ column_family::make_reader(schema_ptr s,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) const {
if (_virtual_reader) {
-return (*_virtual_reader)(s, range, slice, pc, trace_state, fwd, fwd_mr);
+return (*_virtual_reader).make_flat_mutation_reader(s, range, slice, pc, trace_state, fwd, fwd_mr);
}
-std::vector<mutation_reader> readers;
+std::vector<flat_mutation_reader> readers;
readers.reserve(_memtables->size() + 1);
// We're assuming that cache and memtables are both read atomically
@@ -675,16 +638,16 @@ column_family::make_reader(schema_ptr s,
// https://github.com/scylladb/scylla/issues/185
for (auto&& mt : *_memtables) {
-readers.emplace_back(mt->make_reader(s, range, slice, pc, trace_state, fwd, fwd_mr));
+readers.emplace_back(mt->make_flat_reader(s, range, slice, pc, trace_state, fwd, fwd_mr));
}
if (_config.enable_cache) {
-readers.emplace_back(_cache.make_reader(s, range, slice, pc, std::move(trace_state), fwd, fwd_mr));
+readers.emplace_back(_cache.make_flat_reader(s, range, slice, pc, std::move(trace_state), fwd, fwd_mr));
} else {
readers.emplace_back(make_sstable_reader(s, _sstables, range, slice, pc, std::move(trace_state), fwd, fwd_mr));
}
-return make_combined_reader(std::move(readers), fwd_mr);
+return make_combined_reader(s, std::move(readers), fwd, fwd_mr);
}
flat_mutation_reader
@@ -695,16 +658,16 @@ column_family::make_streaming_reader(schema_ptr s,
auto source = mutation_source([this] (schema_ptr s, const dht::partition_range& range, const query::partition_slice& slice,
const io_priority_class& pc, tracing::trace_state_ptr trace_state, streamed_mutation::forwarding fwd, mutation_reader::forwarding fwd_mr) {
-std::vector<mutation_reader> readers;
+std::vector<flat_mutation_reader> readers;
readers.reserve(_memtables->size() + 1);
for (auto&& mt : *_memtables) {
-readers.emplace_back(mt->make_reader(s, range, slice, pc, trace_state, fwd, fwd_mr));
+readers.emplace_back(mt->make_flat_reader(s, range, slice, pc, trace_state, fwd, fwd_mr));
}
readers.emplace_back(make_sstable_reader(s, _sstables, range, slice, pc, std::move(trace_state), fwd, fwd_mr));
-return make_combined_reader(std::move(readers), fwd_mr);
+return make_combined_reader(s, std::move(readers), fwd, fwd_mr);
});
-return make_flat_multi_range_reader(s, std::move(source), ranges, slice, pc, nullptr, streamed_mutation::forwarding::no, mutation_reader::forwarding::no);
+return make_flat_multi_range_reader(s, std::move(source), ranges, slice, pc, nullptr, mutation_reader::forwarding::no);
}
future<std::vector<locked_cell>> column_family::lock_counter_cells(const mutation& m, timeout_clock::time_point timeout) {
@@ -720,7 +683,7 @@ column_family::for_all_partitions(schema_ptr s, Func&& func) const {
"bad Func signature");
struct iteration_state {
-mutation_reader reader;
+flat_mutation_reader reader;
Func func;
bool ok = true;
bool empty = false;
@@ -734,9 +697,7 @@ column_family::for_all_partitions(schema_ptr s, Func&& func) const {
return do_with(iteration_state(std::move(s), *this, std::move(func)), [] (iteration_state& is) {
return do_until([&is] { return is.done(); }, [&is] {
-return is.reader().then([] (auto sm) {
-return mutation_from_streamed_mutation(std::move(sm));
-}).then([&is](mutation_opt&& mo) {
+return read_mutation_from_flat_mutation_reader(is.reader).then([&is](mutation_opt&& mo) {
if (!mo) {
is.empty = true;
} else {
@@ -915,7 +876,8 @@ column_family::seal_active_streaming_memtable_immediate(flush_permit&& permit) {
adder();
return old->clear_gently();
}
-}).handle_exception([old, permit = std::move(permit)] (auto ep) {
+}).handle_exception([old, permit = std::move(permit), newtab] (auto ep) {
+newtab->mark_for_deletion();
dblog.error("failed to write streamed sstable: {}", ep);
return make_exception_future<>(ep);
});
@@ -954,7 +916,8 @@ future<> column_family::seal_active_streaming_memtable_big(streaming_memtable_bi
auto monitor = seastar::make_shared<permit_monitor>(permit.release_sstable_write_permit());
return write_memtable_to_sstable(*old, newtab, std::move(monitor), incremental_backups_enabled(), priority, true, _config.background_writer_scheduling_group).then([this, newtab, old, &smb, permit = std::move(permit)] {
smb.sstables.emplace_back(newtab);
-}).handle_exception([] (auto ep) {
+}).handle_exception([newtab] (auto ep) {
+newtab->mark_for_deletion();
dblog.error("failed to write streamed sstable: {}", ep);
return make_exception_future<>(ep);
});
@@ -2040,6 +2003,18 @@ database::database(const db::config& cfg)
, _memtable_cpu_controller(make_flush_cpu_controller(*_cfg, &_background_writer_scheduling_group, [this, limit = 2.0f * _dirty_memory_manager.throttle_threshold()] {
return (_dirty_memory_manager.virtual_dirty_memory()) / limit;
}))
+, _read_concurrency_sem(max_count_concurrent_reads,
+max_memory_concurrent_reads(),
+_cfg->read_request_timeout_in_ms() * 1ms,
+max_inactive_queue_length(),
+[this] {
+++_stats->sstable_read_queue_overloaded;
+return std::make_exception_ptr(std::runtime_error("sstable inactive read queue overloaded"));
+})
+// No timeouts or queue length limits - a failure here can kill an entire repair.
+// Trust the caller to limit concurrency.
+, _streaming_concurrency_sem(max_count_streaming_concurrent_reads, max_memory_streaming_concurrent_reads())
+, _system_read_concurrency_sem(max_count_system_concurrent_reads, max_memory_system_concurrent_reads())
, _version(empty_version)
, _compaction_manager(std::make_unique<compaction_manager>())
, _enable_incremental_backups(cfg.incremental_backups())
@@ -2092,9 +2067,7 @@ dirty_memory_manager::setup_collectd(sstring namestr) {
});
}
-static const metrics::label user_label("user");
-static const metrics::label streaming_label("streaming");
-static const metrics::label system_label("system");
+static const metrics::label class_label("class");
void
database::setup_metrics() {
@@ -2104,9 +2077,9 @@ database::setup_metrics() {
namespace sm = seastar::metrics;
-auto user_label_instance = user_label("reads");
-auto streaming_label_instance = streaming_label("reads");
-auto system_label_instance = system_label("reads");
+auto user_label_instance = class_label("user");
+auto streaming_label_instance = class_label("streaming");
+auto system_label_instance = class_label("system");
_metrics.add_group("memory", {
sm::make_gauge("dirty_bytes", [this] { return _dirty_memory_manager.real_dirty_memory() + _system_dirty_memory_manager.real_dirty_memory() + _streaming_dirty_memory_manager.real_dirty_memory(); },
@@ -2173,11 +2146,11 @@ database::setup_metrics() {
sm::description("Counts the number of times the sstable read queue was overloaded. "
"A non-zero value indicates that we have to drop read requests because they arrive faster than we can serve them.")),
-sm::make_gauge("active_reads", [this] { return _stats->active_reads; },
+sm::make_gauge("active_reads", [this] { return max_count_concurrent_reads - _read_concurrency_sem.available_resources().count; },
sm::description("Holds the number of currently active read operations. "),
{user_label_instance}),
-sm::make_gauge("active_reads_memory_consumption", [this] { return max_memory_concurrent_reads() - _read_concurrency_sem.available_units(); },
+sm::make_gauge("active_reads_memory_consumption", [this] { return max_memory_concurrent_reads() - _read_concurrency_sem.available_resources().memory; },
sm::description(seastar::format("Holds the amount of memory consumed by currently active read operations. "
"If this value gets close to {} we are likely to start dropping new read requests. "
"In that case sstable_read_queue_overloads is going to get a non-zero value.", max_memory_concurrent_reads())),
@@ -2187,12 +2160,12 @@ database::setup_metrics() {
sm::description("Holds the number of currently queued read operations."),
{user_label_instance}),
-sm::make_gauge("active_reads", [this] { return _stats->active_reads_streaming; },
+sm::make_gauge("active_reads", [this] { return max_count_streaming_concurrent_reads - _streaming_concurrency_sem.available_resources().count; },
sm::description("Holds the number of currently active read operations issued on behalf of streaming "),
{streaming_label_instance}),
-sm::make_gauge("active_reads_memory_consumption", [this] { return max_memory_streaming_concurrent_reads() - _streaming_concurrency_sem.available_units(); },
+sm::make_gauge("active_reads_memory_consumption", [this] { return max_memory_streaming_concurrent_reads() - _streaming_concurrency_sem.available_resources().memory; },
sm::description(seastar::format("Holds the amount of memory consumed by currently active read operations issued on behalf of streaming "
"If this value gets close to {} we are likely to start dropping new read requests. "
"In that case sstable_read_queue_overloads is going to get a non-zero value.", max_memory_streaming_concurrent_reads())),
@@ -2202,11 +2175,11 @@ database::setup_metrics() {
sm::description("Holds the number of currently queued read operations on behalf of streaming."),
{streaming_label_instance}),
-sm::make_gauge("active_reads", [this] { return _stats->active_reads_system_keyspace; },
+sm::make_gauge("active_reads", [this] { return max_count_system_concurrent_reads - _system_read_concurrency_sem.available_resources().count; },
sm::description("Holds the number of currently active read operations from \"system\" keyspace tables. "),
{system_label_instance}),
-sm::make_gauge("active_reads_memory_consumption", [this] { return max_memory_system_concurrent_reads() - _system_read_concurrency_sem.available_units(); },
+sm::make_gauge("active_reads_memory_consumption", [this] { return max_memory_system_concurrent_reads() - _system_read_concurrency_sem.available_resources().memory; },
sm::description(seastar::format("Holds the amount of memory consumed by currently active read operations from \"system\" keyspace tables. "
"If this value gets close to {} we are likely to start dropping new read requests. "
"In that case sstable_read_queue_overloads is going to get a non-zero value.", max_memory_system_concurrent_reads())),
@@ -2688,8 +2661,8 @@ keyspace::make_column_family_config(const schema& s, const db::config& db_config
cfg.enable_cache = _config.enable_cache;
cfg.dirty_memory_manager = _config.dirty_memory_manager;
cfg.streaming_dirty_memory_manager = _config.streaming_dirty_memory_manager;
-cfg.read_concurrency_config = _config.read_concurrency_config;
-cfg.streaming_read_concurrency_config = _config.streaming_read_concurrency_config;
+cfg.read_concurrency_semaphore = _config.read_concurrency_semaphore;
+cfg.streaming_read_concurrency_semaphore = _config.streaming_read_concurrency_semaphore;
cfg.cf_stats = _config.cf_stats;
cfg.enable_incremental_backups = _config.enable_incremental_backups;
cfg.background_writer_scheduling_group = _config.background_writer_scheduling_group;
@@ -3047,17 +3020,6 @@ void database::register_connection_drop_notifier(netw::messaging_service& ms) {
});
}
-std::ostream& operator<<(std::ostream& out, const atomic_cell_or_collection& c) {
-return out << to_hex(c._data);
-}
-std::ostream& operator<<(std::ostream& os, const mutation& m) {
-const ::schema& s = *m.schema();
-fprint(os, "{%s.%s key %s data ", s.ks_name(), s.cf_name(), m.decorated_key());
-os << m.partition() << "}";
-return os;
-}
std::ostream& operator<<(std::ostream& out, const column_family& cf) {
return fprint(out, "{column_family: %s/%s}", cf._schema->ks_name(), cf._schema->cf_name());
}
@@ -3438,18 +3400,8 @@ database::make_keyspace_config(const keyspace_metadata& ksm) {
}
cfg.dirty_memory_manager = &_dirty_memory_manager;
cfg.streaming_dirty_memory_manager = &_streaming_dirty_memory_manager;
-cfg.read_concurrency_config.resources_sem = &_read_concurrency_sem;
-cfg.read_concurrency_config.active_reads = &_stats->active_reads;
-cfg.read_concurrency_config.timeout = _cfg->read_request_timeout_in_ms() * 1ms;
-cfg.read_concurrency_config.max_queue_length = 100;
-cfg.read_concurrency_config.raise_queue_overloaded_exception = [this] {
-++_stats->sstable_read_queue_overloaded;
-throw std::runtime_error("sstable inactive read queue overloaded");
-};
-// No timeouts or queue length limits - a failure here can kill an entire repair.
-// Trust the caller to limit concurrency.
-cfg.streaming_read_concurrency_config.resources_sem = &_streaming_concurrency_sem;
-cfg.streaming_read_concurrency_config.active_reads = &_stats->active_reads_streaming;
+cfg.read_concurrency_semaphore = &_read_concurrency_sem;
+cfg.streaming_read_concurrency_semaphore = &_streaming_concurrency_sem;
cfg.cf_stats = &_cf_stats;
cfg.enable_incremental_backups = _enable_incremental_backups;
@@ -3946,14 +3898,14 @@ future<> column_family::flush_streaming_mutations(utils::UUID plan_id, dht::part
return _streaming_memtables->seal_active_memtable_delayed().then([this] {
return _streaming_flush_phaser.advance_and_await();
}).then([this, sstables = std::move(sstables), ranges = std::move(ranges)] () mutable {
-return _cache.invalidate([this, sstables = std::move(sstables), ranges = std::move(ranges)] () mutable noexcept {
+return _cache.invalidate([this, sstables = std::move(sstables)] () mutable noexcept {
// FIXME: this is not really noexcept, but we need to provide strong exception guarantees.
for (auto&& sst : sstables) {
// seal_active_streaming_memtable_big() ensures sst is unshared.
this->add_sstable(sst, {engine().cpu_id()});
}
this->try_trigger_compaction();
-});
+}, std::move(ranges));
});
});
});
@@ -4268,7 +4220,7 @@ void column_family::drop_hit_rate(gms::inet_address addr) {
_cluster_cache_hit_rates.erase(addr);
}
-mutation_reader make_range_sstable_reader(schema_ptr s,
+flat_mutation_reader make_local_shard_sstable_reader(schema_ptr s,
lw_shared_ptr<sstables::sstable_set> sstables,
const dht::partition_range& pr,
const query::partition_slice& slice,
@@ -4278,15 +4230,49 @@ mutation_reader make_range_sstable_reader(schema_ptr s,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr)
{
-return make_mutation_reader<combined_mutation_reader>(std::make_unique<incremental_reader_selector>(std::move(s),
-std::move(sstables),
-pr,
-slice,
-pc,
-std::move(resource_tracker),
-std::move(trace_state),
-fwd,
-fwd_mr), fwd_mr);
+auto reader_factory_fn = [s, &slice, &pc, resource_tracker, fwd, fwd_mr] (sstables::shared_sstable& sst, const dht::partition_range& pr) {
+flat_mutation_reader reader = sst->read_range_rows_flat(s, pr, slice, pc, resource_tracker, fwd, fwd_mr);
+if (sst->is_shared()) {
+using sig = bool (&)(const dht::decorated_key&);
+reader = make_filtering_reader(std::move(reader), sig(belongs_to_current_shard));
+}
+return reader;
+};
+auto all_readers = boost::copy_range<std::vector<flat_mutation_reader>>(
+*sstables->all()
+| boost::adaptors::transformed([&] (sstables::shared_sstable sst) -> flat_mutation_reader {
+return reader_factory_fn(sst, pr);
+})
+);
+return make_combined_reader(s,
+std::move(all_readers),
+fwd,
+fwd_mr);
}
+flat_mutation_reader make_range_sstable_reader(schema_ptr s,
+lw_shared_ptr<sstables::sstable_set> sstables,
+const dht::partition_range& pr,
+const query::partition_slice& slice,
+const io_priority_class& pc,
+reader_resource_tracker resource_tracker,
+tracing::trace_state_ptr trace_state,
+streamed_mutation::forwarding fwd,
+mutation_reader::forwarding fwd_mr)
+{
+auto reader_factory_fn = [s, &slice, &pc, resource_tracker, fwd, fwd_mr] (sstables::shared_sstable& sst, const dht::partition_range& pr) {
+return sst->read_range_rows_flat(s, pr, slice, pc, resource_tracker, fwd, fwd_mr);
+};
+auto sstable_readers = boost::copy_range<std::vector<flat_mutation_reader>>(
+*sstables->all()
+| boost::adaptors::transformed([&] (sstables::shared_sstable sst) {
+return reader_factory_fn(sst, pr);
+})
+);
+return make_combined_reader(s,
+std::move(sstable_readers),
+fwd,
+fwd_mr);
+}
future<>
@@ -4300,7 +4286,8 @@ write_memtable_to_sstable(memtable& mt, sstables::shared_sstable sst,
cfg.leave_unsealed = leave_unsealed;
cfg.thread_scheduling_group = tsg;
cfg.monitor = std::move(monitor);
-return sst->write_components(mt.make_flush_reader(mt.schema(), pc), mt.partition_count(), mt.schema(), cfg, pc);
+return sst->write_components(mt.make_flush_reader(mt.schema(), pc),
+mt.partition_count(), mt.schema(), cfg, pc);
}
future<>


@@ -79,7 +79,7 @@
#include "utils/phased_barrier.hh"
#include "cpu_controller.hh"
#include "dirty_memory_manager.hh"
-#include "reader_resource_tracker.hh"
+#include "reader_concurrency_semaphore.hh"
class cell_locker;
class cell_locker_stats;
@@ -296,8 +296,8 @@ public:
bool enable_incremental_backups = false;
::dirty_memory_manager* dirty_memory_manager = &default_dirty_memory_manager;
::dirty_memory_manager* streaming_dirty_memory_manager = &default_dirty_memory_manager;
-restricted_mutation_reader_config read_concurrency_config;
-restricted_mutation_reader_config streaming_read_concurrency_config;
+reader_concurrency_semaphore* read_concurrency_semaphore;
+reader_concurrency_semaphore* streaming_read_concurrency_semaphore;
::cf_stats* cf_stats = nullptr;
seastar::thread_scheduling_group* background_writer_scheduling_group = nullptr;
seastar::thread_scheduling_group* memtable_scheduling_group = nullptr;
@@ -503,7 +503,7 @@ private:
// Caller needs to ensure that column_family remains live (FIXME: relax this).
// The 'range' parameter must be live as long as the reader is used.
// Mutations returned by the reader will all have given schema.
-mutation_reader make_sstable_reader(schema_ptr schema,
+flat_mutation_reader make_sstable_reader(schema_ptr schema,
lw_shared_ptr<sstables::sstable_set> sstables,
const dht::partition_range& range,
const query::partition_slice& slice,
@@ -570,7 +570,7 @@ public:
// Mutations returned by the reader will all have given schema.
// If I/O needs to be issued to read anything in the specified range, the operations
// will be scheduled under the priority class given by pc.
-mutation_reader make_reader(schema_ptr schema,
+flat_mutation_reader make_reader(schema_ptr schema,
const dht::partition_range& range,
const query::partition_slice& slice,
const io_priority_class& pc = default_priority_class(),
@@ -578,7 +578,7 @@ public:
streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const;
-mutation_reader make_reader(schema_ptr schema, const dht::partition_range& range = query::full_partition_range) const {
+flat_mutation_reader make_reader(schema_ptr schema, const dht::partition_range& range = query::full_partition_range) const {
auto& full_slice = schema->full_slice();
return make_reader(std::move(schema), range, full_slice);
}
@@ -841,7 +841,20 @@ public:
friend class distributed_loader;
};
-mutation_reader make_range_sstable_reader(schema_ptr s,
+using sstable_reader_factory_type = std::function<flat_mutation_reader(sstables::shared_sstable&, const dht::partition_range& pr)>;
+// Filters out mutations that don't belong to the current shard.
+flat_mutation_reader make_local_shard_sstable_reader(schema_ptr s,
+lw_shared_ptr<sstables::sstable_set> sstables,
+const dht::partition_range& pr,
+const query::partition_slice& slice,
+const io_priority_class& pc,
+reader_resource_tracker resource_tracker,
+tracing::trace_state_ptr trace_state,
+streamed_mutation::forwarding fwd,
+mutation_reader::forwarding fwd_mr);
+flat_mutation_reader make_range_sstable_reader(schema_ptr s,
lw_shared_ptr<sstables::sstable_set> sstables,
const dht::partition_range& pr,
const query::partition_slice& slice,
@@ -941,8 +954,8 @@ public:
bool enable_incremental_backups = false;
::dirty_memory_manager* dirty_memory_manager = &default_dirty_memory_manager;
::dirty_memory_manager* streaming_dirty_memory_manager = &default_dirty_memory_manager;
-restricted_mutation_reader_config read_concurrency_config;
-restricted_mutation_reader_config streaming_read_concurrency_config;
+reader_concurrency_semaphore* read_concurrency_semaphore;
+reader_concurrency_semaphore* streaming_read_concurrency_semaphore;
::cf_stats* cf_stats = nullptr;
seastar::thread_scheduling_group* background_writer_scheduling_group = nullptr;
seastar::thread_scheduling_group* memtable_scheduling_group = nullptr;
@@ -1028,10 +1041,17 @@ public:
using timeout_clock = lowres_clock;
private:
::cf_stats _cf_stats;
static const size_t max_count_concurrent_reads{100};
static size_t max_memory_concurrent_reads() { return memory::stats().total_memory() * 0.02; }
+// Assume a queued read takes up 10kB of memory, and allow 2% of memory to be filled up with such reads.
+static size_t max_inactive_queue_length() { return memory::stats().total_memory() * 0.02 / 10000; }
// They're rather heavyweight, so limit more
static const size_t max_count_streaming_concurrent_reads{10};
static size_t max_memory_streaming_concurrent_reads() { return memory::stats().total_memory() * 0.02; }
static const size_t max_count_system_concurrent_reads{10};
static size_t max_memory_system_concurrent_reads() { return memory::stats().total_memory() * 0.02; };
static constexpr size_t max_concurrent_sstable_loads() { return 3; }
struct db_stats {
uint64_t total_writes = 0;
uint64_t total_writes_failed = 0;
@@ -1040,10 +1060,6 @@ private:
uint64_t total_reads_failed = 0;
uint64_t sstable_read_queue_overloaded = 0;
-uint64_t active_reads = 0;
-uint64_t active_reads_streaming = 0;
-uint64_t active_reads_system_keyspace = 0;
uint64_t short_data_queries = 0;
uint64_t short_mutation_queries = 0;
};
@@ -1060,11 +1076,9 @@ private:
seastar::thread_scheduling_group _background_writer_scheduling_group;
flush_cpu_controller _memtable_cpu_controller;
-semaphore _read_concurrency_sem{max_memory_concurrent_reads()};
-semaphore _streaming_concurrency_sem{max_memory_streaming_concurrent_reads()};
-restricted_mutation_reader_config _read_concurrency_config;
-semaphore _system_read_concurrency_sem{max_memory_system_concurrent_reads()};
-restricted_mutation_reader_config _system_read_concurrency_config;
+reader_concurrency_semaphore _read_concurrency_sem;
+reader_concurrency_semaphore _streaming_concurrency_sem;
+reader_concurrency_semaphore _system_read_concurrency_sem;
semaphore _sstable_load_concurrency_sem{max_concurrent_sstable_loads()};
@@ -1232,7 +1246,7 @@ public:
std::unordered_set<sstring> get_initial_tokens();
std::experimental::optional<gms::inet_address> get_replace_address();
bool is_replacing();
-semaphore& system_keyspace_read_concurrency_sem() {
+reader_concurrency_semaphore& system_keyspace_read_concurrency_sem() {
return _system_read_concurrency_sem;
}
semaphore& sstable_load_concurrency_sem() {


@@ -718,7 +718,7 @@ public:
*/
auto me = shared_from_this();
auto fp = _file_pos;
-return _pending_ops.wait_for_pending(timeout).then([me = std::move(me), fp, timeout] {
+return _pending_ops.wait_for_pending(timeout).then([me, fp, timeout] {
if (fp != me->_file_pos) {
// some other request already wrote this buffer.
// If so, wait for the operation at our intended file offset


@@ -64,8 +64,11 @@
#include "db/config.hh"
#include "md5_hasher.hh"
#include <seastar/util/noncopyable_function.hh>
#include <boost/algorithm/string/predicate.hpp>
+#include <boost/range/algorithm/copy.hpp>
+#include <boost/range/algorithm/transform.hpp>
#include <boost/range/adaptor/map.hpp>
#include <boost/range/join.hpp>
@@ -126,7 +129,11 @@ static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
std::map<qualified_name, schema_mutations>&& views_before,
std::map<qualified_name, schema_mutations>&& views_after);
-static void merge_types(distributed<service::storage_proxy>& proxy,
+struct user_types_to_drop final {
+seastar::noncopyable_function<void()> drop;
+};
+[[nodiscard]] static user_types_to_drop merge_types(distributed<service::storage_proxy>& proxy,
schema_result&& before,
schema_result&& after);
@@ -832,7 +839,7 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std:
#endif
std::set<sstring> keyspaces_to_drop = merge_keyspaces(proxy, std::move(old_keyspaces), std::move(new_keyspaces)).get0();
-merge_types(proxy, std::move(old_types), std::move(new_types));
+auto types_to_drop = merge_types(proxy, std::move(old_types), std::move(new_types));
merge_tables_and_views(proxy,
std::move(old_column_families), std::move(new_column_families),
std::move(old_views), std::move(new_views));
@@ -840,6 +847,8 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std:
mergeFunctions(oldFunctions, newFunctions);
mergeAggregates(oldAggregates, newAggregates);
#endif
+types_to_drop.drop();
proxy.local().get_db().invoke_on_all([keyspaces_to_drop = std::move(keyspaces_to_drop)] (database& db) {
// it is safe to drop a keyspace only when all nested ColumnFamilies where deleted
return do_for_each(keyspaces_to_drop, [&db] (auto keyspace_to_drop) {
@@ -996,30 +1005,37 @@ static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
}).get();
}
-static inline void collect_types(std::set<sstring>& keys, schema_result& result, std::vector<user_type>& to)
+struct naked_user_type {
+const sstring keyspace;
+const sstring qualified_name;
+};
+static inline void collect_types(std::set<sstring>& keys, schema_result& result, std::vector<naked_user_type>& to)
{
for (auto&& key : keys) {
auto&& value = result[key];
auto types = create_types_from_schema_partition(schema_result_value_type{key, std::move(value)});
-std::move(types.begin(), types.end(), std::back_inserter(to));
+boost::transform(types, std::back_inserter(to), [] (user_type type) {
+return naked_user_type{std::move(type->_keyspace), std::move(type->name())};
+});
}
}
-// see the comments for merge_keyspaces()
-static void merge_types(distributed<service::storage_proxy>& proxy, schema_result&& before, schema_result&& after)
+// see the comments for merge_keyspaces()
+[[nodiscard]] static user_types_to_drop merge_types(distributed<service::storage_proxy>& proxy, schema_result&& before, schema_result&& after)
{
-std::vector<user_type> created, altered, dropped;
+std::vector<naked_user_type> created, altered, dropped;
auto diff = difference(before, after, indirect_equal_to<lw_shared_ptr<query::result_set>>());
collect_types(diff.entries_only_on_left, before, dropped); // Keyspaces with no more types
collect_types(diff.entries_only_on_right, after, created); // New keyspaces with types
-for (auto&& key : diff.entries_differing) {
+for (auto&& keyspace : diff.entries_differing) {
// The user types of this keyspace differ, so diff the current types with the updated ones
-auto current_types = proxy.local().get_db().local().find_keyspace(key).metadata()->user_types()->get_all_types();
+auto current_types = proxy.local().get_db().local().find_keyspace(keyspace).metadata()->user_types()->get_all_types();
decltype(current_types) updated_types;
-auto ts = create_types_from_schema_partition(schema_result_value_type{key, std::move(after[key])});
+auto ts = create_types_from_schema_partition(schema_result_value_type{keyspace, std::move(after[keyspace])});
updated_types.reserve(ts.size());
for (auto&& type : ts) {
updated_types[type->_name] = std::move(type);
@@ -1027,36 +1043,46 @@ static void merge_types(distributed<service::storage_proxy>& proxy, schema_resul
auto delta = difference(current_types, updated_types, indirect_equal_to<user_type>());
-for (auto&& key : delta.entries_only_on_left) {
-dropped.emplace_back(current_types[key]);
+for (auto&& type_name : delta.entries_only_on_left) {
+dropped.emplace_back(naked_user_type{keyspace, current_types[type_name]->name()});
}
-for (auto&& key : delta.entries_only_on_right) {
-created.emplace_back(std::move(updated_types[key]));
+for (auto&& type_name : delta.entries_only_on_right) {
+created.emplace_back(naked_user_type{keyspace, updated_types[type_name]->name()});
}
-for (auto&& key : delta.entries_differing) {
-altered.emplace_back(std::move(updated_types[key]));
+for (auto&& type_name : delta.entries_differing) {
+altered.emplace_back(naked_user_type{keyspace, updated_types[type_name]->name()});
}
}
-proxy.local().get_db().invoke_on_all([&created, &dropped, &altered] (database& db) {
+// Create and update user types before any tables/views are created that potentially
+// use those types. Similarly, defer dropping until after tables/views that may use
+// some of these user types are dropped.
+proxy.local().get_db().invoke_on_all([&created, &altered] (database& db) {
return seastar::async([&] {
for (auto&& type : created) {
-auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type->name()));
+auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type.qualified_name));
db.find_keyspace(user_type->_keyspace).add_user_type(user_type);
service::get_local_migration_manager().notify_create_user_type(user_type).get();
}
-for (auto&& type : dropped) {
-auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type->name()));
-db.find_keyspace(user_type->_keyspace).remove_user_type(user_type);
-service::get_local_migration_manager().notify_drop_user_type(user_type).get();
-}
for (auto&& type : altered) {
-auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type->name()));
+auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type.qualified_name));
db.find_keyspace(user_type->_keyspace).add_user_type(user_type);
service::get_local_migration_manager().notify_update_user_type(user_type).get();
}
});
}).get();
+return user_types_to_drop{[&proxy, dropped = std::move(dropped)] {
+proxy.local().get_db().invoke_on_all([dropped = std::move(dropped)](database& db) {
+return do_for_each(dropped, [&db](auto& user_type_to_drop) {
+auto user_type = dynamic_pointer_cast<const user_type_impl>(
+parse_type(std::move(user_type_to_drop.qualified_name)));
+db.find_keyspace(user_type->_keyspace).remove_user_type(user_type);
+return service::get_local_migration_manager().notify_drop_user_type(user_type);
+});
+}).get();
+}};
}
#if 0
@@ -2209,13 +2235,14 @@ static future<view_ptr> create_view_from_table_row(distributed<service::storage_
*/
future<std::vector<view_ptr>> create_views_from_schema_partition(distributed<service::storage_proxy>& proxy, const schema_result::mapped_type& result)
{
-auto views = make_lw_shared<std::vector<view_ptr>>();
-return parallel_for_each(result->rows().begin(), result->rows().end(), [&proxy, views = std::move(views)] (auto&& row) {
-return create_view_from_table_row(proxy, row).then([views] (auto&& v) {
-views->push_back(std::move(v));
+return do_with(std::vector<view_ptr>(), [&] (auto& views) {
+return parallel_for_each(result->rows().begin(), result->rows().end(), [&proxy, &views] (auto&& row) {
+return create_view_from_table_row(proxy, row).then([&views] (auto&& v) {
+views.push_back(std::move(v));
});
+}).then([&views] {
+return std::move(views);
+});
+});
-}).then([views] {
-return std::move(*views);
-});
}


@@ -42,43 +42,87 @@ namespace db {
namespace size_estimates {
-class size_estimates_mutation_reader final : public mutation_reader::impl {
+class size_estimates_mutation_reader final : public flat_mutation_reader::impl {
struct token_range {
bytes start;
bytes end;
};
schema_ptr _schema;
-const dht::partition_range& _prange;
+const dht::partition_range* _prange;
const query::partition_slice& _slice;
using ks_range = std::vector<sstring>;
stdx::optional<ks_range> _keyspaces;
ks_range::const_iterator _current_partition;
streamed_mutation::forwarding _fwd;
+flat_mutation_reader_opt _partition_reader;
public:
size_estimates_mutation_reader(schema_ptr schema, const dht::partition_range& prange, const query::partition_slice& slice, streamed_mutation::forwarding fwd)
-: _schema(schema)
-, _prange(prange)
+: impl(schema)
+, _schema(std::move(schema))
+, _prange(&prange)
, _slice(slice)
, _fwd(fwd)
{ }
-virtual future<streamed_mutation_opt> operator()() override {
+private:
+future<> get_next_partition() {
// For each specified range, estimate (crudely) mean partition size and partitions count.
auto& db = service::get_local_storage_proxy().get_db().local();
if (!_keyspaces) {
-_keyspaces = get_keyspaces(*_schema, db, _prange);
+_keyspaces = get_keyspaces(*_schema, db, *_prange);
_current_partition = _keyspaces->begin();
}
if (_current_partition == _keyspaces->end()) {
-return make_ready_future<streamed_mutation_opt>();
+_end_of_stream = true;
+return make_ready_future<>();
}
return get_local_ranges().then([&db, this] (auto&& ranges) {
auto estimates = this->estimates_for_current_keyspace(db, std::move(ranges));
auto mutations = db::system_keyspace::make_size_estimates_mutation(*_current_partition, std::move(estimates));
++_current_partition;
-return streamed_mutation_opt(streamed_mutation_from_mutation(std::move(mutations), _fwd));
+std::vector<mutation> ms;
+ms.emplace_back(std::move(mutations));
+_partition_reader = flat_mutation_reader_from_mutations(std::move(ms), _fwd);
});
}
+public:
+virtual future<> fill_buffer() override {
+return do_until([this] { return is_end_of_stream() || is_buffer_full(); }, [this] {
+if (!_partition_reader) {
+return get_next_partition();
+}
+return _partition_reader->consume_pausable([this] (mutation_fragment mf) {
+push_mutation_fragment(std::move(mf));
+return stop_iteration(is_buffer_full());
+}).then([this] {
+if (_partition_reader->is_end_of_stream() && _partition_reader->is_buffer_empty()) {
+_partition_reader = stdx::nullopt;
+}
+});
+});
+}
+virtual void next_partition() override {
+clear_buffer_to_next_partition();
+if (is_buffer_empty()) {
+_partition_reader = stdx::nullopt;
+}
+}
+virtual future<> fast_forward_to(const dht::partition_range& pr) override {
+clear_buffer();
+_prange = &pr;
+_keyspaces = stdx::nullopt;
+_partition_reader = stdx::nullopt;
+_end_of_stream = false;
+return make_ready_future<>();
+}
+virtual future<> fast_forward_to(position_range pr) override {
+forward_buffer_to(pr.start());
+_end_of_stream = false;
+if (_partition_reader) {
+return _partition_reader->fast_forward_to(std::move(pr));
+}
+return make_ready_future<>();
+}
/**
* Returns the primary ranges for the local node.
* Used for testing as well.
@@ -270,14 +314,14 @@ private:
};
struct virtual_reader {
-mutation_reader operator()(schema_ptr schema,
+flat_mutation_reader operator()(schema_ptr schema,
const dht::partition_range& range,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) {
-return make_mutation_reader<size_estimates_mutation_reader>(schema, range, slice, fwd);
+return make_flat_mutation_reader<size_estimates_mutation_reader>(schema, range, slice, fwd);
}
};


@@ -1577,10 +1577,7 @@ void make(database& db, bool durable, bool volatile_testing_only) {
kscfg.enable_commitlog = !volatile_testing_only;
kscfg.enable_cache = true;
// don't make system keyspace reads wait for user reads
-kscfg.read_concurrency_config.resources_sem = &db.system_keyspace_read_concurrency_sem();
-kscfg.read_concurrency_config.active_reads = &db.get_stats().active_reads_system_keyspace;
-kscfg.read_concurrency_config.timeout = {};
-kscfg.read_concurrency_config.max_queue_length = std::numeric_limits<size_t>::max();
+kscfg.read_concurrency_semaphore = &db.system_keyspace_read_concurrency_sem();
// don't make system keyspace writes wait for user writes (if under pressure)
kscfg.dirty_memory_manager = &db._system_dirty_memory_manager;
keyspace _ks{ksm, std::move(kscfg)};


@@ -175,6 +175,31 @@ static bool update_requires_read_before_write(const schema& base,
return false;
}
static bool is_partition_key_empty(
const schema& base,
const schema& view_schema,
const partition_key& base_key,
const clustering_row& update) {
// Empty partition keys are not supported on normal tables - they cannot
// be inserted or queried, so enforce those rules here.
if (view_schema.partition_key_columns().size() > 1) {
// Composite partition keys are different: all components
// are then allowed to be empty.
return false;
}
auto* base_col = base.get_column_definition(view_schema.partition_key_columns().front().name());
switch (base_col->kind) {
case column_kind::partition_key:
return base_key.get_component(base, base_col->position()).empty();
case column_kind::clustering_key:
return update.key().get_component(base, base_col->position()).empty();
default:
// No multi-cell columns in the view's partition key
auto& c = update.cells().cell_at(base_col->id);
return c.as_atomic_cell().value().empty();
}
}
bool matches_view_filter(const schema& base, const view_info& view, const partition_key& key, const clustering_row& update, gc_clock::time_point now) {
return clustering_prefix_matches(base, view, key, update.key())
&& boost::algorithm::all_of(
@@ -330,7 +355,7 @@ static void add_cells_to_view(const schema& base, const schema& view, const row&
* This method checks that the base row does match the view filter before applying anything.
*/
void view_updates::create_entry(const partition_key& base_key, const clustering_row& update, gc_clock::time_point now) {
-if (!matches_view_filter(*_base, _view_info, base_key, update, now)) {
+if (is_partition_key_empty(*_base, *_view, base_key, update) || !matches_view_filter(*_base, _view_info, base_key, update, now)) {
return;
}
deletable_row& r = get_view_row(base_key, update);
@@ -346,7 +371,7 @@ void view_updates::create_entry(const partition_key& base_key, const clustering_
void view_updates::delete_old_entry(const partition_key& base_key, const clustering_row& existing, const row_tombstone& t, gc_clock::time_point now) {
// Before deleting an old entry, make sure it was matching the view filter
// (otherwise there is nothing to delete)
-if (matches_view_filter(*_base, _view_info, base_key, existing, now)) {
+if (!is_partition_key_empty(*_base, *_view, base_key, existing) && matches_view_filter(*_base, _view_info, base_key, existing, now)) {
do_delete_old_entry(base_key, existing, t, now);
}
}
@@ -391,11 +416,11 @@ void view_updates::do_delete_old_entry(const partition_key& base_key, const clus
void view_updates::update_entry(const partition_key& base_key, const clustering_row& update, const clustering_row& existing, gc_clock::time_point now) {
// While we know update and existing correspond to the same view entry,
// they may not match the view filter.
-if (!matches_view_filter(*_base, _view_info, base_key, existing, now)) {
+if (is_partition_key_empty(*_base, *_view, base_key, existing) || !matches_view_filter(*_base, _view_info, base_key, existing, now)) {
create_entry(base_key, update, now);
return;
}
-if (!matches_view_filter(*_base, _view_info, base_key, update, now)) {
+if (is_partition_key_empty(*_base, *_view, base_key, update) || !matches_view_filter(*_base, _view_info, base_key, update, now)) {
do_delete_old_entry(base_key, existing, row_tombstone(), now);
return;
}
@@ -636,8 +661,10 @@ future<stop_iteration> view_update_builder::on_results() {
}
// If we have updates and it's a range tombstone, it removes nothing pre-existing, so we can ignore it
-if (_update && _update->is_clustering_row()) {
-generate_update(std::move(*_update).as_clustering_row(), { });
+if (_update) {
+if (_update->is_clustering_row()) {
+generate_update(std::move(*_update).as_clustering_row(), { });
+}
return advance_updates();
}


@@ -399,6 +399,14 @@ public:
return { maximum_token(), token_bound::end };
}
bool is_min() const {
return _token.is_minimum();
}
bool is_max() const {
return _token.is_maximum();
}
static ring_position starting_at(dht::token token) {
return { std::move(token), token_bound::start };
}
@@ -559,6 +567,12 @@ public:
, _weight(weight)
{ }
explicit ring_position_view(const dht::token& token, int8_t weight = -1)
: _token(&token)
, _key(nullptr)
, _weight(weight)
{ }
const partition_key* key() const { return _key; }
friend std::ostream& operator<<(std::ostream&, ring_position_view);


@@ -300,6 +300,7 @@ future<> range_streamer::do_stream_async() {
unsigned sp_index = 0;
unsigned nr_ranges_streamed = 0;
size_t nr_ranges_total = range_vec.size();
size_t nr_ranges_per_stream_plan = nr_ranges_total / 10;
dht::token_range_vector ranges_to_stream;
auto do_streaming = [&] {
auto sp = stream_plan(sprint("%s-%s-index-%d", description, keyspace, sp_index++));
@@ -318,7 +319,7 @@ future<> range_streamer::do_stream_async() {
ranges_to_stream.push_back(*it);
it = range_vec.erase(it);
nr_ranges_streamed++;
-if (ranges_to_stream.size() < _nr_ranges_per_stream_plan) {
+if (ranges_to_stream.size() < nr_ranges_per_stream_plan) {
continue;
} else {
do_streaming();


@@ -174,8 +174,6 @@ private:
std::unordered_set<std::unique_ptr<i_source_filter>> _source_filters;
stream_plan _stream_plan;
std::unordered_map<sstring, std::vector<sstring>> _column_families;
-// Number of ranges to stream per stream plan
-unsigned _nr_ranges_per_stream_plan = 10;
// Retry the stream plan _nr_max_retry times
unsigned _nr_retried = 0;
unsigned _nr_max_retry = 5;


@@ -43,7 +43,7 @@ done
. /etc/os-release
case "$ID" in
"centos")
-AMI=ami-46bf8a51
+AMI=ami-ae7bfdb8
REGION=us-east-1
SSH_USERNAME=centos
;;


@@ -1 +0,0 @@
options raid0 devices_discard_performance=Y


@@ -323,10 +323,21 @@ fi
CUR_VERSION=`scylla --version` || true
if [ "$CUR_VERSION" != "" ]; then
-NEW_VERSION=`sudo -u scylla /usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid version --version $CUR_VERSION --mode i` || true
+if is_debian_variant; then
+NEW_VERSION=`sudo -u scylla /usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid --repo-files '/etc/apt/sources.list.d/scylla*.list' version --version $CUR_VERSION --mode i` || true
+else
+NEW_VERSION=`sudo -u scylla /usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid --repo-files '/etc/yum.repos.d/scylla*.repo' version --version $CUR_VERSION --mode i` || true
+fi
if [ "$NEW_VERSION" != "" ]; then
echo $NEW_VERSION
fi
else
if is_debian_variant; then
NEW_VERSION=`sudo -u scylla /usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid --repo-files '/etc/apt/sources.list.d/scylla*.list' version --version unknown --mode u` || true
else
NEW_VERSION=`sudo -u scylla /usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid --repo-files '/etc/yum.repos.d/scylla*.repo' version --version unknown --mode u` || true
fi
echo "Scylla executable is not found, check your installation" $NEW_VERSION
fi
# scylla_selinux_setup only supports Red Hat variants


@@ -6,7 +6,7 @@ After=network.target
Type=simple
User=scylla
Group=scylla
-ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid --repo-files '/etc/yum.repos.d/scylla*.repo' -q -c /etc/scylla.d/housekeeping.cfg version --mode d
+ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files @@REPOFILES@@ version --mode d
[Install]
WantedBy=multi-user.target


@@ -6,7 +6,7 @@ After=network.target
Type=simple
User=scylla
Group=scylla
-ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q --repo-files '/etc/yum.repos.d/scylla*.repo' -c /etc/scylla.d/housekeeping.cfg version --mode r
+ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files @@REPOFILES@@ version --mode r
[Install]
WantedBy=multi-user.target


@@ -2,11 +2,11 @@
. /etc/os-release
print_usage() {
-echo "build_deb.sh -target <codename> --dist --rebuild-dep"
+echo "build_deb.sh -target <codename> --dist --rebuild-dep --jobs 2"
echo " --target target distribution codename"
echo " --dist create a public distribution package"
echo " --rebuild-dep rebuild dependency packages"
echo " --no-clean don't rebuild pbuilder tgz"
echo " --jobs specify number of jobs"
exit 1
}
install_deps() {
@@ -17,16 +17,12 @@ install_deps() {
sudo dpkg -P ${DEB_FILE%%_*.deb}
}
REBUILD=0
DIST=0
TARGET=
NO_CLEAN=0
JOBS=0
while [ $# -gt 0 ]; do
case "$1" in
"--rebuild-dep")
REBUILD=1
shift 1
;;
"--dist")
DIST=1
shift 1
@@ -39,6 +35,10 @@ while [ $# -gt 0 ]; do
NO_CLEAN=1
shift 1
;;
"--jobs")
JOBS=$2
shift 2
;;
*)
print_usage
;;
@@ -111,11 +111,6 @@ if [ -z "$TARGET" ]; then
exit 1
fi
fi
-if [ $REBUILD -eq 1 ] && [ "$TARGET" != "$CODENAME" ]; then
-echo "Rebuild dependencies doesn't support cross-build."
-echo "Please run it on following distribution: $TARGET"
-exit 1
-fi
VERSION=$(./SCYLLA-VERSION-GEN)
SCYLLA_VERSION=$(cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/')
@@ -138,8 +133,10 @@ if [ "$TARGET" = "jessie" ]; then
cp dist/debian/scylla-server.cron.d debian/
sed -i -e "s/@@REVISION@@/1~$TARGET/g" debian/changelog
sed -i -e "s/@@DH_INSTALLINIT@@//g" debian/rules
-sed -i -e "s/@@COMPILER@@/g++-5/g" debian/rules
-sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++-5, libunwind-dev, antlr3, libthrift-dev, antlr3-c++-dev/g" debian/control
sed -i -e "s/@@INSTALL_HK_DAILY_INIT@@/dh_installinit --no-start --name scylla-housekeeping-daily/g" debian/rules
sed -i -e "s/@@INSTALL_HK_RESTART_INIT@@/dh_installinit --no-start --name scylla-housekeeping-restart/g" debian/rules
+sed -i -e "s#@@COMPILER@@#/opt/scylladb/bin/g++-7#g" debian/rules
+sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, scylla-gcc72-g++-7, libunwind-dev, scylla-antlr35, scylla-libthrift010-dev, scylla-antlr35-c++-dev, scylla-libboost-program-options163-dev, scylla-libboost-filesystem163-dev, scylla-libboost-system163-dev, scylla-libboost-thread163-dev, scylla-libboost-test163-dev/g" debian/control
sed -i -e "s/@@DEPENDS@@//g" debian/control
sed -i -e "s#@@INSTALL@@##g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER_D@@#dist/common/systemd/scylla-housekeeping-daily.timer /lib/systemd/system#g" debian/scylla-server.install
@@ -148,16 +145,19 @@ if [ "$TARGET" = "jessie" ]; then
sed -i -e "s#@@SYSCTL@@#dist/debian/sysctl.d/99-scylla.conf etc/sysctl.d#g" debian/scylla-server.install
sed -i -e "s#@@SCRIPTS_SAVE_COREDUMP@@#dist/debian/scripts/scylla_save_coredump usr/lib/scylla#g" debian/scylla-server.install
sed -i -e "s#@@SCRIPTS_DELAY_FSTRIM@@#dist/debian/scripts/scylla_delay_fstrim usr/lib/scylla#g" debian/scylla-server.install
-elif [ "$TARGET" = "stretch" ] || [ "$TARGET" = "buster" ] || [ "$TARGET" = "sid" ]; then
+elif [ "$TARGET" = "stretch" ]; then
cp dist/debian/scylla-server.cron.d debian/
sed -i -e "s/@@REVISION@@/1~$TARGET/g" debian/changelog
sed -i -e "s/@@DH_INSTALLINIT@@//g" debian/rules
-sed -i -e "s/@@COMPILER@@/g++/g" debian/rules
-sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++, libunwind8-dev, antlr3, libthrift-dev, antlr3-c++-dev/g" debian/control
sed -i -e "s/@@INSTALL_HK_DAILY_INIT@@/dh_installinit --no-start --name scylla-housekeeping-daily/g" debian/rules
sed -i -e "s/@@INSTALL_HK_RESTART_INIT@@/dh_installinit --no-start --name scylla-housekeeping-restart/g" debian/rules
+sed -i -e "s#@@COMPILER@@#/opt/scylladb/bin/g++-7#g" debian/rules
+sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, scylla-gcc72-g++-7, libunwind-dev, antlr3, scylla-libthrift010-dev, scylla-antlr35-c++-dev, libboost-program-options1.62-dev, libboost-filesystem1.62-dev, libboost-system1.62-dev, libboost-thread1.62-dev, libboost-test1.62-dev/g" debian/control
sed -i -e "s/@@DEPENDS@@//g" debian/control
sed -i -e "s#@@INSTALL@@##g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER_D@@#dist/common/systemd/scylla-housekeeping-daily.timer /lib/systemd/system#g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER_R@@#dist/common/systemd/scylla-housekeeping-restart.timer /lib/systemd/system#g" debian/scylla-server.install
sed -i -e "s#@@FTDOTTIMER@@#dist/common/systemd/scylla-fstrim.timer /lib/systemd/system#g" debian/scylla-server.install
sed -i -e "s#@@SYSCTL@@#dist/debian/sysctl.d/99-scylla.conf etc/sysctl.d#g" debian/scylla-server.install
sed -i -e "s#@@SCRIPTS_SAVE_COREDUMP@@#dist/debian/scripts/scylla_save_coredump usr/lib/scylla#g" debian/scylla-server.install
sed -i -e "s#@@SCRIPTS_DELAY_FSTRIM@@#dist/debian/scripts/scylla_delay_fstrim usr/lib/scylla#g" debian/scylla-server.install
@@ -165,8 +165,10 @@ elif [ "$TARGET" = "trusty" ]; then
cp dist/debian/scylla-server.cron.d debian/
sed -i -e "s/@@REVISION@@/0ubuntu1~$TARGET/g" debian/changelog
sed -i -e "s/@@DH_INSTALLINIT@@/--upstart-only/g" debian/rules
-sed -i -e "s/@@COMPILER@@/g++-7/g" debian/rules
-sed -i -e "s/@@BUILD_DEPENDS@@/g++-7, libunwind8-dev, scylla-antlr35, scylla-libthrift010-dev, scylla-antlr35-c++-dev/g" debian/control
sed -i -e "s/@@INSTALL_HK_DAILY_INIT@@/dh_installinit --no-start --name scylla-housekeeping --upstart-only/g" debian/rules
sed -i -e "s/@@INSTALL_HK_RESTART_INIT@@//g" debian/rules
+sed -i -e "s#@@COMPILER@@#/opt/scylladb/bin/g++-7#g" debian/rules
+sed -i -e "s/@@BUILD_DEPENDS@@/scylla-gcc72-g++-7, libunwind8-dev, scylla-antlr35, scylla-libthrift010-dev, scylla-antlr35-c++-dev, scylla-libboost-program-options163-dev, scylla-libboost-filesystem163-dev, scylla-libboost-system163-dev, scylla-libboost-thread163-dev, scylla-libboost-test163-dev/g" debian/control
sed -i -e "s/@@DEPENDS@@/hugepages, num-utils/g" debian/control
sed -i -e "s#@@INSTALL@@#dist/debian/sudoers.d/scylla etc/sudoers.d#g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER_D@@##g" debian/scylla-server.install
@@ -175,11 +177,43 @@ elif [ "$TARGET" = "trusty" ]; then
sed -i -e "s#@@SYSCTL@@#dist/debian/sysctl.d/99-scylla.conf etc/sysctl.d#g" debian/scylla-server.install
sed -i -e "s#@@SCRIPTS_SAVE_COREDUMP@@#dist/debian/scripts/scylla_save_coredump usr/lib/scylla#g" debian/scylla-server.install
sed -i -e "s#@@SCRIPTS_DELAY_FSTRIM@@#dist/debian/scripts/scylla_delay_fstrim usr/lib/scylla#g" debian/scylla-server.install
-elif [ "$TARGET" = "xenial" ] || [ "$TARGET" = "yakkety" ] || [ "$TARGET" = "zesty" ] || [ "$TARGET" = "artful" ]; then
+elif [ "$TARGET" = "xenial" ]; then
sed -i -e "s/@@REVISION@@/0ubuntu1~$TARGET/g" debian/changelog
sed -i -e "s/@@DH_INSTALLINIT@@//g" debian/rules
sed -i -e "s/@@INSTALL_HK_DAILY_INIT@@/dh_installinit --no-start --name scylla-housekeeping-daily/g" debian/rules
sed -i -e "s/@@INSTALL_HK_RESTART_INIT@@/dh_installinit --no-start --name scylla-housekeeping-restart/g" debian/rules
sed -i -e "s#@@COMPILER@@#/opt/scylladb/bin/g++-7#g" debian/rules
sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, scylla-gcc72-g++-7, libunwind-dev, antlr3, scylla-libthrift010-dev, scylla-antlr35-c++-dev, scylla-libboost-program-options163-dev, scylla-libboost-filesystem163-dev, scylla-libboost-system163-dev, scylla-libboost-thread163-dev, scylla-libboost-test163-dev/g" debian/control
sed -i -e "s/@@DEPENDS@@/hugepages, /g" debian/control
sed -i -e "s#@@INSTALL@@##g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER_D@@#dist/common/systemd/scylla-housekeeping-daily.timer /lib/systemd/system#g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER_R@@#dist/common/systemd/scylla-housekeeping-restart.timer /lib/systemd/system#g" debian/scylla-server.install
sed -i -e "s#@@FTDOTTIMER@@#dist/common/systemd/scylla-fstrim.timer /lib/systemd/system#g" debian/scylla-server.install
sed -i -e "s#@@SYSCTL@@##g" debian/scylla-server.install
sed -i -e "s#@@SCRIPTS_SAVE_COREDUMP@@##g" debian/scylla-server.install
sed -i -e "s#@@SCRIPTS_DELAY_FSTRIM@@##g" debian/scylla-server.install
elif [ "$TARGET" = "bionic" ]; then
sed -i -e "s/@@REVISION@@/0ubuntu1~$TARGET/g" debian/changelog
sed -i -e "s/@@DH_INSTALLINIT@@//g" debian/rules
sed -i -e "s/@@INSTALL_HK_DAILY_INIT@@/dh_installinit --no-start --name scylla-housekeeping-daily/g" debian/rules
sed -i -e "s/@@INSTALL_HK_RESTART_INIT@@/dh_installinit --no-start --name scylla-housekeeping-restart/g" debian/rules
sed -i -e "s#@@COMPILER@@#g++-7#g" debian/rules
sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++, libunwind-dev, antlr3, scylla-libthrift010-dev, scylla-antlr35-c++-dev, libboost-program-options-dev, libboost-filesystem-dev, libboost-system-dev, libboost-thread-dev, libboost-test-dev/g" debian/control
sed -i -e "s/@@DEPENDS@@/hugepages, /g" debian/control
sed -i -e "s#@@INSTALL@@##g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER_D@@#dist/common/systemd/scylla-housekeeping-daily.timer /lib/systemd/system#g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER_R@@#dist/common/systemd/scylla-housekeeping-restart.timer /lib/systemd/system#g" debian/scylla-server.install
sed -i -e "s#@@FTDOTTIMER@@#dist/common/systemd/scylla-fstrim.timer /lib/systemd/system#g" debian/scylla-server.install
sed -i -e "s#@@SYSCTL@@##g" debian/scylla-server.install
sed -i -e "s#@@SCRIPTS_SAVE_COREDUMP@@##g" debian/scylla-server.install
sed -i -e "s#@@SCRIPTS_DELAY_FSTRIM@@##g" debian/scylla-server.install
elif [ "$TARGET" = "yakkety" ] || [ "$TARGET" = "zesty" ] || [ "$TARGET" = "artful" ]; then
sed -i -e "s/@@REVISION@@/0ubuntu1~$TARGET/g" debian/changelog
sed -i -e "s/@@DH_INSTALLINIT@@//g" debian/rules
sed -i -e "s/@@INSTALL_HK_DAILY_INIT@@/dh_installinit --no-start --name scylla-housekeeping-daily/g" debian/rules
sed -i -e "s/@@INSTALL_HK_RESTART_INIT@@/dh_installinit --no-start --name scylla-housekeeping-restart/g" debian/rules
sed -i -e "s/@@COMPILER@@/g++-7/g" debian/rules
-sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++-7, libunwind-dev, antlr3, scylla-libthrift010-dev, scylla-antlr35-c++-dev/g" debian/control
+sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++-7, libunwind-dev, antlr3, scylla-libthrift010-dev, scylla-antlr35-c++-dev, libboost-program-options-dev, libboost-filesystem-dev, libboost-system-dev, libboost-thread-dev, libboost-test-dev/g" debian/control
sed -i -e "s/@@DEPENDS@@/hugepages, /g" debian/control
sed -i -e "s#@@INSTALL@@##g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER_D@@#dist/common/systemd/scylla-housekeeping-daily.timer /lib/systemd/system#g" debian/scylla-server.install
@@ -198,23 +232,26 @@ else
fi
cp dist/common/systemd/scylla-server.service.in debian/scylla-server.service
sed -i -e "s#@@SYSCONFDIR@@#/etc/default#g" debian/scylla-server.service
-cp dist/common/systemd/scylla-housekeeping-daily.service debian/scylla-server.scylla-housekeeping-daily.service
-cp dist/common/systemd/scylla-housekeeping-restart.service debian/scylla-server.scylla-housekeeping-restart.service
+cp dist/common/systemd/scylla-housekeeping-daily.service.in debian/scylla-server.scylla-housekeeping-daily.service
+sed -i -e "s#@@REPOFILES@@#'/etc/apt/sources.list.d/scylla*.list'#g" debian/scylla-server.scylla-housekeeping-daily.service
+cp dist/common/systemd/scylla-housekeeping-restart.service.in debian/scylla-server.scylla-housekeeping-restart.service
+sed -i -e "s#@@REPOFILES@@#'/etc/apt/sources.list.d/scylla*.list'#g" debian/scylla-server.scylla-housekeeping-restart.service
cp dist/common/systemd/scylla-fstrim.service debian/scylla-server.scylla-fstrim.service
cp dist/common/systemd/node-exporter.service debian/scylla-server.node-exporter.service
if [ $REBUILD -eq 1 ]; then
./dist/debian/dep/build_dependency.sh
fi
cp ./dist/debian/pbuilderrc ~/.pbuilderrc
sudo cp ./dist/debian/pbuilderrc ~root/.pbuilderrc
if [ $NO_CLEAN -eq 0 ]; then
sudo rm -fv /var/cache/pbuilder/scylla-server-$TARGET.tgz
-sudo -E DIST=$TARGET REBUILD=$REBUILD /usr/sbin/pbuilder clean
-sudo -E DIST=$TARGET REBUILD=$REBUILD /usr/sbin/pbuilder create
+sudo -H DIST=$TARGET /usr/sbin/pbuilder clean
+sudo -H DIST=$TARGET /usr/sbin/pbuilder create --allow-untrusted
fi
-sudo -E DIST=$TARGET REBUILD=$REBUILD /usr/sbin/pbuilder update
-if [ $REBUILD -eq 1 ]; then
-sudo -E DIST=$TARGET REBUILD=$REBUILD /usr/sbin/pbuilder execute --save-after-exec dist/debian/dep/pbuilder_install_deps.sh
+if [ $JOBS -ne 0 ]; then
+DEB_BUILD_OPTIONS="parallel=$JOBS"
+fi
-sudo -E DIST=$TARGET REBUILD=$REBUILD pdebuild --buildresult build/debs
+sudo -H DIST=$TARGET /usr/sbin/pbuilder update --allow-untrusted
+if [ "$TARGET" = "trusty" ] || [ "$TARGET" = "xenial" ] || [ "$TARGET" = "yakkety" ] || [ "$TARGET" = "zesty" ] || [ "$TARGET" = "artful" ] || [ "$TARGET" = "bionic" ]; then
+sudo -H DIST=$TARGET /usr/sbin/pbuilder execute --save-after-exec dist/debian/ubuntu_enable_ppa.sh
+elif [ "$TARGET" = "jessie" ] || [ "$TARGET" = "stretch" ]; then
+sudo -H DIST=$TARGET /usr/sbin/pbuilder execute --save-after-exec dist/debian/debian_install_gpgkey.sh
+fi
+sudo -H DIST=$TARGET DEB_BUILD_OPTIONS=$DEB_BUILD_OPTIONS pdebuild --buildresult build/debs


@@ -5,7 +5,7 @@ Section: database
Priority: optional
X-Python3-Version: >= 3.4
Standards-Version: 3.9.5
-Build-Depends: python3-setuptools (>= 0.6b3), python3-all, python3-all-dev, debhelper (>= 9), libyaml-cpp-dev, liblz4-dev, libsnappy-dev, libcrypto++-dev, libjsoncpp-dev, libaio-dev, thrift-compiler, ragel, ninja-build, git, scylla-libboost-program-options163-dev | libboost-program-options1.55-dev | libboost-program-options-dev, scylla-libboost-filesystem163-dev | libboost-filesystem1.55-dev | libboost-filesystem-dev, scylla-libboost-system163-dev | libboost-system1.55-dev | libboost-system-dev, scylla-libboost-thread163-dev | libboost-thread1.55-dev | libboost-thread-dev, scylla-libboost-test163-dev | libboost-test1.55-dev | libboost-test-dev, libgnutls28-dev, libhwloc-dev, libnuma-dev, libpciaccess-dev, xfslibs-dev, python3-pyparsing, libxml2-dev, libsctp-dev, python-urwid, pciutils, libprotobuf-dev, protobuf-compiler, systemtap-sdt-dev, cmake, libssl-dev, @@BUILD_DEPENDS@@
+Build-Depends: python3-setuptools, python3-all, python3-all-dev, debhelper (>= 9), libyaml-cpp-dev, liblz4-dev, libsnappy-dev, libcrypto++-dev, libjsoncpp-dev, libaio-dev, thrift-compiler, ragel, ninja-build, git, libgnutls28-dev, libhwloc-dev, libnuma-dev, libpciaccess-dev, xfslibs-dev, python3-pyparsing, libxml2-dev, libsctp-dev, python-urwid, pciutils, libprotobuf-dev, protobuf-compiler, systemtap-sdt-dev, cmake, libssl-dev, @@BUILD_DEPENDS@@
Package: scylla-conf
Architecture: any
@@ -41,7 +41,7 @@ Description: Scylla kernel tuning configuration
Package: scylla
Section: metapackages
Architecture: any
-Depends: scylla-server, scylla-jmx, scylla-tools, scylla-kernel-conf
+Depends: scylla-server, scylla-jmx, scylla-tools, scylla-tools-core, scylla-kernel-conf
Description: Scylla database metapackage
Scylla is a highly scalable, eventually consistent, distributed,
partitioned row DB.

dist/debian/debian_install_gpgkey.sh (new file)

@@ -0,0 +1,14 @@
#!/bin/bash
. /etc/os-release
if [ "$VERSION_ID" = "8" ]; then
apt-get -y install gnupg-curl ca-certificates
apt-key adv --fetch-keys https://download.opensuse.org/repositories/home:/scylladb:/scylla-3rdparty-jessie/Debian_8.0/Release.key
elif [ "$VERSION_ID" = "9" ]; then
apt-get -y install dirmngr curl ca-certificates
curl -fsSL https://download.opensuse.org/repositories/home:/scylladb:/scylla-3rdparty-stretch/Debian_9.0/Release.key | apt-key add -
else
echo "Unsupported distribution."
exit 1
fi
apt-get update


@@ -1,3 +0,0 @@
#!/bin/sh
exec /usr/bin/java -jar /usr/share/java/antlr-3.5.2-complete-no-st3.jar $*


@@ -1,5 +0,0 @@
antlr3 (3.5.2-ubuntu1) trusty; urgency=medium
* Initial release.
-- Takuya ASADA <syuu@scylladb.com> Mon, 24 Aug 2015 09:22:55 +0000


@@ -1 +0,0 @@
9


@@ -1,13 +0,0 @@
Source: antlr3
Maintainer: Takuya ASADA <syuu@scylladb.com>
Section: misc
Priority: optional
Standards-Version: 3.5.2
Build-Depends: debhelper (>= 9)
Package: antlr3
Architecture: all
Depends: ${shlibs:Depends}, ${misc:Depends}, openjdk-7-jre-headless
Replaces: antlr3-tool
Description: language tool for constructing recognizers, compilers etc
A language tool that provides a framework for constructing recognizers, interpreters, compilers, and translators from grammatical descriptions containing actions in a variety of target languages.


@@ -1,16 +0,0 @@
Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: ANTLR
Upstream-Contact: http://www.antlr.org/
Source: https://github.com/antlr/antlr3
Files: antlr-3.5.2-complete-no-st3.jar
Copyright: Copyright (c) 2013 Terence Parr
License: BSD-3-clause
Files: antlr3
Copyright: Copyright (c) 2015 ScyllaDB
License: AGPL-3.0
Files: debian/*
Copyright: Copyright (c) 2015 ScyllaDB
License: AGPL-3.0


@@ -1,12 +0,0 @@
#!/usr/bin/make -f
override_dh_auto_install:
mkdir -p $(CURDIR)/debian/antlr3/usr/share/java
cp $(CURDIR)/antlr-3.5.2-complete-no-st3.jar \
$(CURDIR)/debian/antlr3/usr/share/java
mkdir -p $(CURDIR)/debian/antlr3/usr/bin
cp $(CURDIR)/antlr3 \
$(CURDIR)/debian/antlr3/usr/bin
%:
dh $@


@@ -1,5 +0,0 @@
antlr3-c++-dev (3.5.2-ubuntu1) trusty; urgency=medium
* Initial release.
-- Takuya ASADA <syuu@scylladb.com> Mon, 24 Aug 2015 09:22:55 +0000


@@ -1 +0,0 @@
9


@@ -1,12 +0,0 @@
Source: antlr3-c++-dev
Maintainer: Takuya ASADA <syuu@scylladb.com>
Section: misc
Priority: optional
Standards-Version: 3.5.2
Build-Depends: debhelper (>= 9)
Package: antlr3-c++-dev
Architecture: all
Depends: ${shlibs:Depends}, ${misc:Depends}
Description: language tool for constructing recognizers, compilers etc
A language tool that provides a framework for constructing recognizers, interpreters, compilers, and translators from grammatical descriptions containing actions in a variety of target languages.


@@ -1,12 +0,0 @@
Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: ANTLR
Upstream-Contact: http://www.antlr.org/
Source: https://github.com/antlr/antlr3
Files: *
Copyright: Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB
License: BSD-3-clause
Files: debian/*
Copyright: Copyright (c) 2015 ScyllaDB
License: AGPL-3.0


@@ -1,8 +0,0 @@
#!/usr/bin/make -f
override_dh_auto_install:
mkdir -p $(CURDIR)/debian/antlr3-c++-dev/usr/include
cp $(CURDIR)/runtime/Cpp/include/* \
$(CURDIR)/debian/antlr3-c++-dev/usr/include
%:
dh $@


@@ -1,123 +0,0 @@
#!/bin/bash -e
. /etc/os-release
install_deps() {
echo Y | sudo mk-build-deps
DEB_FILE=`ls *-build-deps*.deb`
sudo gdebi -n $DEB_FILE
sudo rm -f $DEB_FILE
sudo dpkg -P ${DEB_FILE%%_*.deb}
}
CODENAME=`lsb_release -c|awk '{print $2}'`
# workaround fix for #2444
if [ "$CODENAME" = "jessie" ]; then
if [ ! -e /etc/apt/sources.list.d/jessie-backports.list ]; then
sudo sh -c 'echo deb "http://httpredir.debian.org/debian jessie-backports main" > /etc/apt/sources.list.d/jessie-backports.list'
fi
sudo apt-get -y update
sudo apt-get install -t jessie-backports -y texlive
fi
if [ ! -f /usr/bin/gdebi ]; then
sudo apt-get install -y gdebi-core
fi
if [ ! -f /usr/bin/mk-build-deps ]; then
sudo apt-get install -y devscripts
fi
if [ ! -f /usr/bin/equivs ]; then
sudo apt-get install -y equivs
fi
if [ "$CODENAME" = "trusty" ] || [ "$CODENAME" = "jessie" ]; then
if [ ! -f build/antlr3_*.deb ]; then
rm -rf build/antlr3-3.5.2
mkdir -p build/antlr3-3.5.2
cp -a dist/debian/dep/antlr3-3.5.2/* build/antlr3-3.5.2
cd build/antlr3-3.5.2
wget -nv http://www.antlr3.org/download/antlr-3.5.2-complete-no-st3.jar
install_deps
debuild -r fakeroot --no-tgz-check -us -uc
cd -
fi
if [ ! -f build/scylla-env_*.deb ]; then
rm -rf build/scylla-env-1.0
cp -a dist/common/dep/scylla-env-1.0 build/
cd build/scylla-env-1.0
debuild -r fakeroot --no-tgz-check -us -uc
cd -
fi
if [ ! -f build/scylla-gdb_*.deb ]; then
rm -rf build/gdb-7.11
if [ ! -f build/gdb_7.11-0ubuntu1.dsc ]; then
wget -nv -O build/gdb_7.11-0ubuntu1.dsc http://archive.ubuntu.com/ubuntu/pool/main/g/gdb/gdb_7.11-0ubuntu1.dsc
fi
if [ ! -f build/gdb_7.11.orig.tar.xz ]; then
wget -nv -O build/gdb_7.11.orig.tar.xz http://archive.ubuntu.com/ubuntu/pool/main/g/gdb/gdb_7.11.orig.tar.xz
fi
if [ ! -f build/gdb_7.11-0ubuntu1.debian.tar.xz ]; then
wget -nv -O build/gdb_7.11-0ubuntu1.debian.tar.xz http://archive.ubuntu.com/ubuntu/pool/main/g/gdb/gdb_7.11-0ubuntu1.debian.tar.xz
fi
cd build
dpkg-source -x gdb_7.11-0ubuntu1.dsc
mv gdb_7.11.orig.tar.xz scylla-gdb_7.11.orig.tar.xz
cd -
cd build/gdb-7.11
patch -p0 < ../../dist/debian/dep/gdb.diff
install_deps
debuild -r fakeroot --no-tgz-check -us -uc
cd -
fi
fi
if [ ! -f build/antlr3-c++-dev_*.deb ]; then
rm -rf build/antlr3-c++-dev-3.5.2
if [ ! -f build/3.5.2.tar.gz ]; then
wget -nv -O build/3.5.2.tar.gz https://github.com/antlr/antlr3/archive/3.5.2.tar.gz
fi
cd build
tar xpf 3.5.2.tar.gz
mv antlr3-3.5.2 antlr3-c++-dev-3.5.2
cd -
cp -a dist/debian/dep/antlr3-c++-dev-3.5.2/debian build/antlr3-c++-dev-3.5.2
cd build/antlr3-c++-dev-3.5.2
install_deps
debuild -r fakeroot --no-tgz-check -us -uc
cd -
fi
if [ ! -f build/libthrift0_*.deb ]; then
rm -rf build/thrift-0.10.0
if [ ! -f build/thrift-0.10.0.tar.gz ]; then
wget -nv -O build/thrift-0.10.0.tar.gz http://archive.apache.org/dist/thrift/0.10.0/thrift-0.10.0.tar.gz
fi
cd build
tar xpf thrift-0.10.0.tar.gz
cd thrift-0.10.0
patch -p0 < ../../dist/debian/dep/thrift.diff
install_deps
debuild -r fakeroot --no-tgz-check -us -uc
cd ../..
fi
if [ "$CODENAME" = "jessie" ]; then
if [ ! -f build/gcc-5_*.deb ]; then
cd build
wget https://launchpad.net/debian/+archive/primary/+files/gcc-5_5.4.1-5.dsc
wget https://launchpad.net/debian/+archive/primary/+files/gcc-5_5.4.1.orig.tar.gz
wget https://launchpad.net/debian/+archive/primary/+files/gcc-5_5.4.1-5.diff.gz
dpkg-source -x gcc-5_5.4.1-5.dsc
cd gcc-5-5.4.1
# resolve build-time dependencies manually, since mk-build-deps doesn't work for the gcc package
sudo apt-get install -y g++-multilib libc6-dev-i386 lib32gcc1 libc6-dev-x32 libx32gcc1 libc6-dbg m4 libtool autoconf2.64 autogen gawk zlib1g-dev systemtap-sdt-dev gperf bison flex gdb texinfo locales sharutils libantlr-java libffi-dev gnat-4.9 libisl-dev libmpc-dev libmpfr-dev libgmp-dev dejagnu realpath chrpath quilt doxygen graphviz ghostscript texlive-latex-base xsltproc libxml2-utils docbook-xsl-ns
patch -p0 < ../../dist/debian/dep/debian-gcc-5-jessie.diff
./debian/rules control
debuild -r fakeroot -us -uc
cd ../..
fi
fi
rm -rf /var/tmp/pbuilder
mkdir /var/tmp/pbuilder
cp -v build/*.deb /var/tmp/pbuilder/


@@ -1,245 +0,0 @@
diff -Nur ../gcc-5-5.4.1.orig/debian/patches/unwind_dw2_fde_nolock.diff ./debian/patches/unwind_dw2_fde_nolock.diff
--- ../gcc-5-5.4.1.orig/debian/patches/unwind_dw2_fde_nolock.diff 1970-01-01 00:00:00.000000000 +0000
+++ ./debian/patches/unwind_dw2_fde_nolock.diff 2017-08-09 00:23:51.095939513 +0000
@@ -0,0 +1,95 @@
+commit 2e452daf02a37ec310b2431375ceca569d0d6284
+Author: jakub <jakub@138bc75d-0d04-0410-961f-82ee72b054a4>
+Date: Fri Sep 16 19:17:47 2016 +0000
+
+ PR libgcc/71744
+ * unwind-dw2-fde.c (ATOMIC_FDE_FAST_PATH): Define if __register_frame*
+ is not the primary registry and atomics are available.
+ (any_objects_registered): New variable.
+ (__register_frame_info_bases, __register_frame_info_table_bases):
+ Atomically store 1 to any_objects_registered after registering first
+ unwind info.
+ (_Unwind_Find_FDE): Return early if any_objects_registered is 0.
+
+ git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@240193 138bc75d-0d04-0410-961f-82ee72b054a4
+
+diff --git a/src/libgcc/unwind-dw2-fde.c b/src/libgcc/unwind-dw2-fde.c
+index 0bcf516..6ae2377 100644
+--- a/src/libgcc/unwind-dw2-fde.c
++++ b/src/libgcc/unwind-dw2-fde.c
+@@ -35,6 +35,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ #include "unwind-pe.h"
+ #include "unwind-dw2-fde.h"
+ #include "gthr.h"
++#else
++#if (defined(__GTHREAD_MUTEX_INIT) || defined(__GTHREAD_MUTEX_INIT_FUNCTION)) \
++ && defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4)
++#define ATOMIC_FDE_FAST_PATH 1
++#endif
+ #endif
+
+ /* The unseen_objects list contains objects that have been registered
+@@ -43,6 +48,9 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ by decreasing value of pc_begin. */
+ static struct object *unseen_objects;
+ static struct object *seen_objects;
++#ifdef ATOMIC_FDE_FAST_PATH
++static int any_objects_registered;
++#endif
+
+ #ifdef __GTHREAD_MUTEX_INIT
+ static __gthread_mutex_t object_mutex = __GTHREAD_MUTEX_INIT;
+@@ -96,6 +104,16 @@ __register_frame_info_bases (const void *begin, struct object *ob,
+
+ ob->next = unseen_objects;
+ unseen_objects = ob;
++#ifdef ATOMIC_FDE_FAST_PATH
++ /* Set flag that at least one library has registered FDEs.
++ Use relaxed MO here, it is up to the app to ensure that the library
++ loading/initialization happens-before using that library in other
++ threads (in particular unwinding with that library's functions
++ appearing in the backtraces). Calling that library's functions
++ without waiting for the library to initialize would be racy. */
++ if (!any_objects_registered)
++ __atomic_store_n (&any_objects_registered, 1, __ATOMIC_RELAXED);
++#endif
+
+ __gthread_mutex_unlock (&object_mutex);
+ }
+@@ -140,6 +158,16 @@ __register_frame_info_table_bases (void *begin, struct object *ob,
+
+ ob->next = unseen_objects;
+ unseen_objects = ob;
++#ifdef ATOMIC_FDE_FAST_PATH
++ /* Set flag that at least one library has registered FDEs.
++ Use relaxed MO here, it is up to the app to ensure that the library
++ loading/initialization happens-before using that library in other
++ threads (in particular unwinding with that library's functions
++ appearing in the backtraces). Calling that library's functions
++ without waiting for the library to initialize would be racy. */
++ if (!any_objects_registered)
++ __atomic_store_n (&any_objects_registered, 1, __ATOMIC_RELAXED);
++#endif
+
+ __gthread_mutex_unlock (&object_mutex);
+ }
+@@ -1001,6 +1029,19 @@ _Unwind_Find_FDE (void *pc, struct dwarf_eh_bases *bases)
+ struct object *ob;
+ const fde *f = NULL;
+
++#ifdef ATOMIC_FDE_FAST_PATH
++ /* For targets where unwind info is usually not registered through these
++ APIs anymore, avoid taking a global lock.
++ Use relaxed MO here, it is up to the app to ensure that the library
++ loading/initialization happens-before using that library in other
++ threads (in particular unwinding with that library's functions
++ appearing in the backtraces). Calling that library's functions
++ without waiting for the library to initialize would be racy. */
++ if (__builtin_expect (!__atomic_load_n (&any_objects_registered,
++ __ATOMIC_RELAXED), 1))
++ return NULL;
++#endif
++
+ init_object_mutex_once ();
+ __gthread_mutex_lock (&object_mutex);
+
diff -Nur ../gcc-5-5.4.1.orig/debian/rules.conf ./debian/rules.conf
--- ../gcc-5-5.4.1.orig/debian/rules.conf 2017-08-09 00:26:09.000000000 +0000
+++ ./debian/rules.conf 2017-08-09 00:26:51.177254418 +0000
@@ -206,7 +206,7 @@
ifneq (,$(filter $(distrelease),vivid))
BINUTILSBDV = 2.25-3~
else ifneq (,$(filter $(distrelease),jessie))
- BINUTILSBDV = 2.25-7~
+ BINUTILSBDV = 2.25-5~
else ifneq (,$(filter $(distrelease),sid stretch xenial))
BINUTILSBDV = 2.26.1
endif
@@ -386,10 +386,10 @@
MPFR_BUILD_DEP = libmpfr-dev (>= 3.0.0-9~),
endif
-ISL_BUILD_DEP = libisl-dev,
-ifneq (,$(filter $(distrelease),jessie stretch sid experimental))
- ISL_BUILD_DEP = libisl-dev (>= 0.14),
-endif
+#ISL_BUILD_DEP = libisl-dev,
+#ifneq (,$(filter $(distrelease),jessie stretch sid experimental))
+# ISL_BUILD_DEP = libisl-dev (>= 0.14),
+#endif
ifneq (,$(filter $(distrelease),lenny etch squeeze wheezy dapper hardy jaunty karmic lucid maverick natty oneiric precise quantal raring))
MPC_BUILD_DEP = libmpc-dev,
@@ -411,13 +411,6 @@
SDT_BUILD_DEP = systemtap-sdt-dev [linux-any kfreebsd-any hurd-any],
endif
-# ensure that the common libs, built from the next GCC version are available
-ifeq ($(PKGSOURCE),gcc-$(BASE_VERSION))
- ifneq ($(with_common_libs),yes)
- BASE_BUILD_DEP = gcc-6-base,
- endif
-endif
-
ifneq ($(DEB_CROSS),yes)
# all archs for which to create b-d's
any_archs = alpha amd64 armel armhf arm64 i386 mips mipsel mips64 mips64el powerpc ppc64 ppc64el m68k sh4 sparc64 s390x x32
diff -Nur ../gcc-5-5.4.1.orig/debian/rules.defs ./debian/rules.defs
--- ../gcc-5-5.4.1.orig/debian/rules.defs 2017-08-09 00:26:09.000000000 +0000
+++ ./debian/rules.defs 2017-08-09 00:26:51.177254418 +0000
@@ -412,7 +412,7 @@
# gcc versions (fixincludes, libgcj-common) ...
#with_common_pkgs := yes
# ... and some libraries, which do not change (libgcc1, libssp0).
-#with_common_libs := yes
+with_common_libs := yes
# XXX: should with_common_libs be "yes" only if this is the default compiler
# version on the targeted arch?
@@ -560,7 +560,7 @@
# C ---------------------------
enabled_languages := c
-with_jit = yes
+with_jit = no
# FIXME: compiler bug
jit_no_cpus := ia64
@@ -660,6 +660,8 @@
endif
with_ada := $(call envfilt, ada, , , $(with_ada))
+with_ada := no
+
ifeq ($(DEB_STAGE)-$(filter libgnat, $(with_rtlibs)),rtlibs-)
with_ada := disabled for rtlibs stage
@@ -786,6 +788,8 @@
endif
with_java := $(call envfilt, java, , c++, $(with_java))
+with_java := no
+
ifeq ($(DEB_STAGE)-$(filter libgcj, $(with_rtlibs)),rtlibs-)
with_java := disabled for rtlibs stage
endif
@@ -915,6 +919,8 @@
endif
with_go := $(call envfilt, go, , , $(with_go))
+with_go := no
+
# Build all packages needed for Go development
ifneq (,$(findstring gcc, $(PKGSOURCE)))
ifeq ($(with_go),yes)
@@ -961,6 +967,8 @@
endif
with_d := $(call envfilt, d, , , $(with_d))
+with_d := no
+
ifeq ($(with_base_only),yes)
with_d := no
endif
@@ -1016,6 +1024,8 @@
with_fortran := $(call envfilt, fortran, , , $(with_fortran))
+with_fortran := no
+
# Build all packages needed for Fortran development
ifeq ($(with_fortran),yes)
ifeq ($(with_dev),yes)
@@ -1063,6 +1073,8 @@
endif
with_objc := $(call envfilt, objc, obj-c++, , $(with_objc))
+with_objc := no
+
ifeq ($(with_objc),yes)
# the ObjC runtime with garbage collection enabled needs the Boehm GC
with_objc_gc := yes
@@ -1103,6 +1115,8 @@
endif
with_objcxx := $(call envfilt, obj-c++, , c++ objc, $(with_objcxx))
+with_objcxx := no
+
ifeq ($(with_objcxx),yes)
enabled_languages += obj-c++
endif
@@ -1480,6 +1494,9 @@
with_check := disabled for D
endif
with_check := $(call envfilt, check, , , $(with_check))
+
+with_check := no
+
ifdef WITHOUT_CHECK
with_check := disabled by environment
endif
diff -Nur ../gcc-5-5.4.1.orig/debian/rules.patch ./debian/rules.patch
--- ../gcc-5-5.4.1.orig/debian/rules.patch 2017-08-09 00:26:09.000000000 +0000
+++ ./debian/rules.patch 2017-08-09 00:24:35.795274920 +0000
@@ -113,6 +113,7 @@
libjava-mips64el \
libffi-pax \
libffi-race-condition \
+ unwind_dw2_fde_nolock \
# this is still needed on powerpc, e.g. firefox and insighttoolkit4 will ftbfs.
ifneq (,$(filter $(DEB_TARGET_ARCH),powerpc))

dist/debian/dep/gdb.diff vendored

File diff suppressed because it is too large.


@@ -1,38 +0,0 @@
#!/bin/bash -e
sudo apt update
if [ ! -f /usr/bin/gdebi ]; then
sudo apt install -y gdebi-core
fi
if [ ! -f /usr/bin/lsb_release ]; then
sudo apt install -y lsb-release
fi
CODENAME=`lsb_release -c|awk '{print $2}'`
sudo gdebi -n /var/tmp/pbuilder/antlr3-c++-dev_*.deb
sudo gdebi -n /var/tmp/pbuilder/libthrift0_*.deb
sudo gdebi -n /var/tmp/pbuilder/libthrift-dev_*.deb
if [ "$CODENAME" = "trusty" ] || [ "$CODENAME" = "jessie" ]; then
sudo gdebi -n /var/tmp/pbuilder/antlr3_*.deb
fi
if [ "$CODENAME" = "jessie" ]; then
sudo gdebi -n /var/tmp/pbuilder/gcc-5-base_*.deb
sudo gdebi -n /var/tmp/pbuilder/libatomic1_*.deb
sudo gdebi -n /var/tmp/pbuilder/libcilkrts5_*.deb
sudo gdebi -n /var/tmp/pbuilder/libgcc1_*.deb
sudo gdebi -n /var/tmp/pbuilder/libgomp1_*.deb
sudo gdebi -n /var/tmp/pbuilder/libitm1_*.deb
sudo gdebi -n /var/tmp/pbuilder/liblsan0_*.deb
sudo gdebi -n /var/tmp/pbuilder/libstdc++6_*.deb
sudo gdebi -n /var/tmp/pbuilder/libtsan0_*.deb
sudo gdebi -n /var/tmp/pbuilder/libubsan0_*.deb
sudo gdebi -n /var/tmp/pbuilder/libasan2_*.deb
sudo gdebi -n /var/tmp/pbuilder/libcc1-0_*.deb
sudo gdebi -n /var/tmp/pbuilder/libmpx0_*.deb
sudo gdebi -n /var/tmp/pbuilder/libgcc-5-dev_*.deb
sudo gdebi -n /var/tmp/pbuilder/libstdc++-5-dev_*.deb
sudo gdebi -n /var/tmp/pbuilder/cpp-5_*.deb
sudo gdebi -n /var/tmp/pbuilder/gcc-5_*.deb
sudo gdebi -n /var/tmp/pbuilder/g++-5_*.deb
fi


@@ -1,344 +0,0 @@
diff -Nur debian/changelog ../thrift-0.10.0.new/debian/changelog
--- debian/changelog 2016-12-19 20:05:45.000000000 +0000
+++ ../thrift-0.10.0.new/debian/changelog 2017-06-20 19:14:33.902186804 +0000
@@ -1,3 +1,8 @@
+thrift (0.10.0-1) stable; urgency=low
+ * update to 0.10.0
+
+ -- Takuya ASADA <syuu@scylladb.com> Mon, 19 Jun 2017 23:08:43 +0000
+
thrift (0.10.0) stable; urgency=low
* update to 0.10.0
diff -Nur debian/control ../thrift-0.10.0.new/debian/control
--- debian/control 2016-12-21 03:04:19.000000000 +0000
+++ ../thrift-0.10.0.new/debian/control 2017-06-20 19:14:33.902186804 +0000
@@ -1,14 +1,10 @@
Source: thrift
Section: devel
Priority: extra
-Build-Depends: debhelper (>= 9), build-essential, mono-mcs, python-dev, ant,
- mono-devel, libmono-system-web4.0-cil, erlang-base, ruby-dev | ruby1.9.1-dev, ruby-bundler ,autoconf, automake,
+Build-Depends: debhelper (>= 9), build-essential, autoconf, automake,
pkg-config, libtool, bison, flex, libboost-dev | libboost1.53-dev,
- python-all, python-setuptools, python-all-dev, python-all-dbg,
- python3-all, python3-setuptools, python3-all-dev, python3-all-dbg,
- openjdk-7-jdk | openjdk-8-jdk | default-jdk,
- libboost-test-dev | libboost-test1.53-dev, libevent-dev, libssl-dev, perl (>= 5.8.0-7),
- php5 | php7.0, php5-dev | php7.0-dev, libglib2.0-dev, qtchooser, qtbase5-dev-tools
+ libboost-test-dev | libboost-test1.53-dev, libevent-dev, libssl-dev,
+ libglib2.0-dev, qtchooser, qtbase5-dev-tools
Maintainer: Thrift Developer's <dev@thrift.apache.org>
Homepage: http://thrift.apache.org/
Vcs-Git: https://git-wip-us.apache.org/repos/asf/thrift.git
@@ -17,144 +13,6 @@
X-Python-Version: >= 2.6
X-Python3-Version: >= 3.3
-Package: thrift-compiler
-Architecture: any
-Depends: ${shlibs:Depends}, ${misc:Depends}
-Description: Compiler for Thrift definition files
- Thrift is a software framework for scalable cross-language services
- development. It combines a software stack with a code generation engine to
- build services that work efficiently and seamlessly.
- .
- This package contains the Thrift compiler that is used for translating
- from .thrift files (containing the definitions) to the language binding
- for the supported languages.
-
-Package: python-thrift
-Architecture: any
-Section: python
-Depends: ${python:Depends}, ${shlibs:Depends}, ${misc:Depends}, python-six
-Recommends: python-twisted-web, python-backports.ssl-match-hostname, python-ipaddress
-Provides: ${python:Provides}
-Description: Python bindings for Thrift (Python 2)
- Thrift is a software framework for scalable cross-language services
- development. It combines a software stack with a code generation engine to
- build services that work efficiently and seamlessly.
- .
- This package contains the Python bindings for Thrift. You will need the thrift
- tool (in the thrift-compiler package) to compile your definition to Python
- classes, and then the modules in this package will allow you to use those
- classes in your programs.
- .
- This package installs the library for Python 2.
-
-Package: python-thrift-dbg
-Architecture: any
-Section: debug
-Depends: ${shlibs:Depends}, ${misc:Depends}, python-thrift (= ${binary:Version}), python-all-dbg
-Provides: ${python:Provides}
-Description: Python bindings for Thrift (debug version)
- Thrift is a software framework for scalable cross-language services
- development. It combines a software stack with a code generation engine to
- build services that work efficiently and seamlessly.
- .
- This package contains the Python bindings for Thrift with debugging symbols.
- You will need the thrift tool (in the thrift-compiler package) to compile your
- definition to Python classes, and then the modules in this package will allow
- you to use those classes in your programs.
-
-Package: python3-thrift
-Architecture: any
-Section: python
-Depends: ${python3:Depends}, ${shlibs:Depends}, ${misc:Depends}, python3-six
-Recommends: python3-twisted-web
-Provides: ${python:Provides}
-Description: Python bindings for Thrift (Python 3)
- Thrift is a software framework for scalable cross-language services
- development. It combines a software stack with a code generation engine to
- build services that work efficiently and seamlessly.
- .
- This package contains the Python bindings for Thrift. You will need the thrift
- tool (in the thrift-compiler package) to compile your definition to Python
- classes, and then the modules in this package will allow you to use those
- classes in your programs.
- .
- This package installs the library for Python 3.
-
-Package: python3-thrift-dbg
-Architecture: any
-Section: debug
-Depends: ${shlibs:Depends}, ${misc:Depends}, python3-thrift (= ${binary:Version}), python3-all-dbg
-Provides: ${python:Provides}
-Description: Python bindings for Thrift (debug version)
- Thrift is a software framework for scalable cross-language services
- development. It combines a software stack with a code generation engine to
- build services that work efficiently and seamlessly.
- .
- This package contains the Python bindings for Thrift with debugging symbols.
- You will need the thrift tool (in the thrift-compiler package) to compile your
- definition to Python classes, and then the modules in this package will allow
- you to use those classes in your programs.
-
-Package: ruby-thrift
-Architecture: all
-Section: libs
-Depends: ruby | ruby-interpreter, ${shlibs:Depends}, ${misc:Depends}
-Provides: libthrift-ruby
-Replaces: libthrift-ruby
-Breaks: libthrift-ruby
-Description: Ruby bindings for Thrift
- Thrift is a software framework for scalable cross-language services
- development. It combines a software stack with a code generation engine to
- build services that work efficiently and seamlessly.
- .
- This package contains the Ruby bindings for Thrift. You will need the thrift
- tool (in the thrift-compiler package) to compile your definition to Ruby
- classes, and then the modules in this package will allow you to use those
- classes in your programs.
-
-Package: libthrift-java
-Architecture: all
-Section: java
-Depends: ${misc:Depends}
-Description: Java bindings for Thrift
- Thrift is a software framework for scalable cross-language services
- development. It combines a software stack with a code generation engine to
- build services that work efficiently and seamlessly.
- .
- This package contains the Java bindings for Thrift. You will need the thrift
- tool (in the thrift-compiler package) to compile your definition to Java
- classes, and then the modules in this package will allow you to use those
- classes in your programs.
-
-Package: libthrift-cil
-Architecture: all
-Section: cli-mono
-Depends: cli-common, libmono-corlib4.0-cil (>= 2.10) | libmono-corlib4.5-cil (>=3.2), libmono-system4.0-cil (>= 2.10),
- libmono-system-web4.0-cil (>= 2.10), ${misc:Depends}
-Description: CLI bindings for Thrift
- Thrift is a software framework for scalable cross-language services
- development. It combines a software stack with a code generation engine to
- build services that work efficiently and seamlessly.
- .
- This package contains the CLI bindings for Thrift. You will need the thrift
- tool (in the thrift-compiler package) to compile your definition to C#
- classes, and then the modules in this package will allow you to use those
- classes in your programs.
-
-Package: libthrift-perl
-Architecture: all
-Section: perl
-Depends: perl (>= 5.8.0-7), ${misc:Depends}
-Description: Perl bindings for Thrift
- Thrift is a software framework for scalable cross-language services
- development. It combines a software stack with a code generation engine to
- build services that work efficiently and seamlessly.
- .
- This package contains the Perl bindings for Thrift. You will need the thrift
- tool (in the thrift-compiler package) to compile your definition to Perl
- classes, and then the modules in this package will allow you to use those
- classes in your programs.
-
Package: libthrift0
Architecture: any
Depends: ${shlibs:Depends}, ${misc:Depends}
@@ -177,15 +35,3 @@
.
This package contains the development libraries required for writing C++
applications using Thrift.
-
-Package: php5-thrift
-Architecture: any
-Section: php
-Depends: ${php:Depends}, ${shlibs:Depends}, ${misc:Depends}
-Provides: ${php:Provides}
-Description: PHP bindings for Thrift
- Thrift is a software framework for scalable cross-language services
- development. It combines a software stack with a code generation engine to
- build services that work efficiently and seamlessly.
- .
- This package contains the PHP bindings for Thrift.
diff -Nur debian/libthrift0.install ../thrift-0.10.0.new/debian/libthrift0.install
--- debian/libthrift0.install 2016-04-03 17:19:43.000000000 +0000
+++ ../thrift-0.10.0.new/debian/libthrift0.install 2017-06-20 19:22:46.321957388 +0000
@@ -1,5 +1,4 @@
usr/lib/libthrift.so*
usr/lib/libthrift-*.so*
usr/lib/libthriftnb*.so*
-usr/lib/libthriftqt*.so*
usr/lib/libthriftz*.so*
diff -Nur debian/rules ../thrift-0.10.0.new/debian/rules
--- debian/rules 2016-12-19 20:05:45.000000000 +0000
+++ ../thrift-0.10.0.new/debian/rules 2017-06-20 19:22:48.285956469 +0000
@@ -51,18 +51,6 @@
# Compile C (glib) library
$(MAKE) -C $(CURDIR)/lib/c_glib
- # Python library
- cd $(CURDIR)/lib/py && \
- for py in $(PYVERS); do \
- $$py setup.py build; \
- $$py-dbg setup.py build; \
- done
-
- # PHP
- cd $(CURDIR)/lib/php/src/ext/thrift_protocol && \
- phpize && \
- ./configure && $(MAKE)
-
touch $@
build-indep: build-indep-stamp
@@ -71,19 +59,6 @@
# Add here commands to compile the indep part of the package.
#$(MAKE) doc
- # Java
- cd $(CURDIR)/lib/java && \
- ant
-
- # C#
- $(MAKE) -C $(CURDIR)/lib/csharp
-
- # Ruby
- $(MAKE) -C $(CURDIR)/lib/rb
-
- # Perl
- $(MAKE) -C $(CURDIR)/lib/perl INSTALLDIRS=vendor
-
touch $@
clean:
@@ -96,8 +71,6 @@
# Add here commands to clean up after the build process.
-$(MAKE) clean
- $(CURDIR)/cleanup.sh
-
dh_clean
install: install-indep install-arch
@@ -111,29 +84,6 @@
# debian/<package>-doc.
#INSTALLDOC#
- # Java
- mkdir -p $(CURDIR)/debian/libthrift-java/usr/share/java/ && \
- cp $(CURDIR)/lib/java/build/libthrift*.jar \
- $(CURDIR)/debian/libthrift-java/usr/share/java/
-
- # Ruby
- mkdir -p $(CURDIR)/debian/ruby-thrift/usr/lib/ruby/1.9.1 && \
- cp $(CURDIR)/lib/rb/lib/thrift.rb \
- $(CURDIR)/debian/ruby-thrift/usr/lib/ruby/1.9.1
- cp -r $(CURDIR)/lib/rb/lib/thrift \
- $(CURDIR)/debian/ruby-thrift/usr/lib/ruby/1.9.1
-
- # C#
- mkdir -p $(CURDIR)/debian/libthrift-cil/usr/lib/cli/thrift/ && \
- cp $(CURDIR)/lib/csharp/Thrift.dll \
- $(CURDIR)/debian/libthrift-cil/usr/lib/cli/thrift/Thrift.dll
-
- # Perl
- $(MAKE) -C $(CURDIR)/lib/perl install DESTDIR=$(CURDIR)/debian/libthrift-perl
- mv $(CURDIR)/debian/libthrift-perl/usr/local/lib/perl5 $(CURDIR)/debian/libthrift-perl/usr/lib
- rmdir $(CURDIR)/debian/libthrift-perl/usr/local/lib
- rmdir $(CURDIR)/debian/libthrift-perl/usr/local
-
dh_install -i
install-arch:
@@ -146,40 +96,6 @@
# debian/tmp.
#$(MAKE) DESTDIR=$(CURDIR)/debian/thrift install
- # Compiler
- mkdir -p $(CURDIR)/debian/thrift-compiler/usr/bin && \
- cp $(CURDIR)/compiler/cpp/thrift \
- $(CURDIR)/debian/thrift-compiler/usr/bin/thrift && \
- rmdir $(CURDIR)/debian/thrift-compiler/usr/sbin
-
- # Python
- cd $(CURDIR)/lib/py && \
- python2 setup.py install --install-layout=deb --no-compile --root=$(CURDIR)/debian/python-thrift && \
- python2-dbg setup.py install --install-layout=deb --no-compile --root=$(CURDIR)/debian/python-thrift-dbg && \
- python3 setup.py install --install-layout=deb --no-compile --root=$(CURDIR)/debian/python3-thrift && \
- python3-dbg setup.py install --install-layout=deb --no-compile --root=$(CURDIR)/debian/python3-thrift-dbg
-
- find $(CURDIR)/debian/python-thrift -name "*.py[co]" -print0 | xargs -0 rm -f
- find $(CURDIR)/debian/python-thrift -name "__pycache__" -print0 | xargs -0 rm -fr
- find $(CURDIR)/debian/python-thrift-dbg -name "__pycache__" -print0 | xargs -0 rm -fr
- find $(CURDIR)/debian/python-thrift-dbg -name "*.py[co]" -print0 | xargs -0 rm -f
- find $(CURDIR)/debian/python-thrift-dbg -name "*.py" -print0 | xargs -0 rm -f
- find $(CURDIR)/debian/python-thrift-dbg -name "*.egg-info" -print0 | xargs -0 rm -rf
- find $(CURDIR)/debian/python-thrift-dbg -depth -type d -empty -exec rmdir {} \;
-
- find $(CURDIR)/debian/python3-thrift -name "*.py[co]" -print0 | xargs -0 rm -f
- find $(CURDIR)/debian/python3-thrift -name "__pycache__" -print0 | xargs -0 rm -fr
- find $(CURDIR)/debian/python3-thrift-dbg -name "__pycache__" -print0 | xargs -0 rm -fr
- find $(CURDIR)/debian/python3-thrift-dbg -name "*.py[co]" -print0 | xargs -0 rm -f
- find $(CURDIR)/debian/python3-thrift-dbg -name "*.py" -print0 | xargs -0 rm -f
- find $(CURDIR)/debian/python3-thrift-dbg -name "*.egg-info" -print0 | xargs -0 rm -rf
- find $(CURDIR)/debian/python3-thrift-dbg -depth -type d -empty -exec rmdir {} \;
-
- # PHP
- mkdir -p $(CURDIR)/debian/php5-thrift
- cd $(CURDIR)/lib/php && \
- $(MAKE) DESTDIR=$(CURDIR)/debian/php5-thrift install
-
# C++ and C (glib)
mkdir -p $(CURDIR)/debian/tmp; \
cd $(CURDIR)/lib/cpp && \
@@ -201,9 +117,8 @@
dh_installexamples
dh_installman
dh_link
- dh_strip -ppython-thrift --dbg-package=python-thrift-dbg
- dh_strip -ppython3-thrift --dbg-package=python3-thrift-dbg
dh_strip -pthrift-compiler -plibthrift0
+ dh_strip
dh_compress
dh_fixperms
dh_makeshlibs
@@ -219,7 +134,6 @@
# Build architecture dependent packages using the common target.
binary-arch: build-arch install-arch
- echo "php:Depends=phpapi-$(shell php-config5 --phpapi)" > $(CURDIR)/debian/substvars
$(MAKE) -f debian/rules DH_OPTIONS=-s binary-common
binary: binary-arch binary-indep


@@ -4,24 +4,23 @@ BASETGZ="/var/cache/pbuilder/scylla-server-$DIST.tgz"
DISTRIBUTION="$DIST"
BUILDRESULT="/var/cache/pbuilder/scylla-server-$DIST/result/"
APTCACHE="/var/cache/pbuilder/scylla-server-$DIST/aptcache/"
ALLOWUNTRUSTED=yes
EXTRAPACKAGES="sudo"
if [ $REBUILD -eq 1 ]; then
BINDMOUNTS="/var/tmp/pbuilder"
fi
if [ "$DIST" = "trusty" ] || [ "$DIST" = "xenial" ] || [ "$DIST" = "yakkety" ] || [ "$DIST" = "zesty" ] || [ "$DIST" = "artful" ]; then
if [ "$DIST" = "trusty" ] || [ "$DIST" = "xenial" ] || [ "$DIST" = "yakkety" ] || [ "$DIST" = "zesty" ] || [ "$DIST" = "artful" ] || [ "$DIST" = "bionic" ]; then
MIRRORSITE="http://archive.ubuntu.com/ubuntu/"
COMPONENTS="main restricted universe multiverse"
DEBOOTSTRAPOPTS="--keyring=/usr/share/keyrings/ubuntu-archive-keyring.gpg"
OTHERMIRROR="deb http://archive.ubuntu.com/ubuntu/ $DIST-updates main restricted universe multiverse|deb http://ppa.launchpad.net/ubuntu-toolchain-r/test/ubuntu $DIST main|deb [arch=amd64] http://ppa.launchpad.net/scylladb/ppa/ubuntu $DIST main"
elif [ "$DIST" = "jessie" ] || [ "$DIST" = "stretch" ] || [ "$DIST" = "buster" ] || [ "$DIST" = "sid" ]; then
OTHERMIRROR="deb http://archive.ubuntu.com/ubuntu/ $DIST-updates main restricted universe multiverse"
elif [ "$DIST" = "jessie" ]; then
MIRRORSITE="http://deb.debian.org/debian/"
COMPONENTS="main contrib non-free"
DEBOOTSTRAPOPTS="--keyring=/usr/share/keyrings/debian-archive-keyring.gpg"
if [ $REBUILD -eq 0 ]; then
OTHERMIRROR="deb [arch=amd64] http://downloads.scylladb.com/deb/3rdparty/$DIST $DIST scylladb/non-free"
fi
OTHERMIRROR="deb [arch=amd64] http://download.opensuse.org/repositories/home:/scylladb:/scylla-3rdparty-jessie/Debian_8.0/ ./"
elif [ "$DIST" = "stretch" ]; then
MIRRORSITE="http://deb.debian.org/debian/"
COMPONENTS="main contrib non-free"
DEBOOTSTRAPOPTS="--keyring=/usr/share/keyrings/debian-archive-keyring.gpg"
OTHERMIRROR="deb [arch=amd64] http://download.opensuse.org/repositories/home:/scylladb:/scylla-3rdparty-stretch/Debian_9.0/ ./"
else
echo "Unknown distribution: $DIST"
exit 1

dist/debian/rules.in vendored

@@ -1,10 +1,13 @@
#!/usr/bin/make -f
export PYBUILD_DISABLE=1
jobs := $(shell echo $$DEB_BUILD_OPTIONS | sed -r "s/.*parallel=([0-9]+).*/-j\1/")
override_dh_auto_configure:
./configure.py --enable-dpdk --mode=release --static-stdc++ --static-thrift --static-boost --compiler=@@COMPILER@@ --cflags="-I/opt/scylladb/include -L/opt/scylladb/lib" --ldflags="-L/opt/scylladb/lib/x86_64-linux-gnu/"
./configure.py --enable-dpdk --mode=release --static-thrift --static-boost --compiler=@@COMPILER@@ --cflags="-I/opt/scylladb/include -L/opt/scylladb/lib/x86-linux-gnu/" --ldflags="-Wl,-rpath=/opt/scylladb/lib"
override_dh_auto_build:
PATH="/opt/scylladb/bin:$$PATH" ninja
PATH="/opt/scylladb/bin:$$PATH" ninja $(jobs)
override_dh_auto_clean:
rm -rf build/release seastar/build
@@ -13,8 +16,8 @@ override_dh_auto_clean:
override_dh_installinit:
dh_installinit --no-start @@DH_INSTALLINIT@@
dh_installinit --no-start --name scylla-housekeeping-daily @@DH_INSTALLINIT@@
dh_installinit --no-start --name scylla-housekeeping-restart @@DH_INSTALLINIT@@
@@INSTALL_HK_DAILY_INIT@@
@@INSTALL_HK_RESTART_INIT@@
dh_installinit --no-start --name scylla-fstrim @@DH_INSTALLINIT@@
dh_installinit --no-start --name node-exporter @@DH_INSTALLINIT@@
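The `jobs :=` line added to rules.in follows the usual debhelper convention of honoring a `parallel=N` token in `DEB_BUILD_OPTIONS`. A standalone sketch of the same sed extraction (the sample options string is illustrative; note that when `parallel=` is absent the pattern does not match and sed passes the input through unchanged, so dh only sets it when parallelism was requested):

```shell
# Turn "parallel=N" inside DEB_BUILD_OPTIONS into a ninja -jN flag,
# mirroring the sed invocation in dist/debian/rules.in.
DEB_BUILD_OPTIONS="nocheck parallel=4 nostrip"
jobs=$(echo "$DEB_BUILD_OPTIONS" | sed -r "s/.*parallel=([0-9]+).*/-j\1/")
echo "$jobs"
```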

dist/debian/ubuntu_enable_ppa.sh (new file)

@@ -0,0 +1,4 @@
#!/bin/bash
apt install -y software-properties-common
add-apt-repository -y ppa:scylladb/ppa
apt update


@@ -14,8 +14,10 @@ ADD etc/sysconfig/scylla-server /etc/sysconfig/scylla-server
# Supervisord configuration:
ADD etc/supervisord.conf /etc/supervisord.conf
ADD etc/supervisord.conf.d/scylla-server.conf /etc/supervisord.conf.d/scylla-server.conf
ADD etc/supervisord.conf.d/scylla-housekeeping.conf /etc/supervisord.conf.d/scylla-housekeeping.conf
ADD etc/supervisord.conf.d/scylla-jmx.conf /etc/supervisord.conf.d/scylla-jmx.conf
ADD scylla-service.sh /scylla-service.sh
ADD scylla-housekeeping-service.sh /scylla-housekeeping-service.sh
ADD scylla-jmx-service.sh /scylla-jmx-service.sh
# Docker image startup scripts:
@@ -24,7 +26,7 @@ ADD commandlineparser.py /commandlineparser.py
ADD docker-entrypoint.py /docker-entrypoint.py
# Install Scylla:
RUN curl http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo -o /etc/yum.repos.d/scylla.repo && \
RUN curl http://downloads.scylladb.com/rpm/centos/scylla-2.1.repo -o /etc/yum.repos.d/scylla.repo && \
yum -y install epel-release && \
yum -y clean expire-cache && \
yum -y update && \


@@ -14,4 +14,5 @@ def parse():
parser.add_argument('--broadcast-address', default=None, dest='broadcastAddress')
parser.add_argument('--broadcast-rpc-address', default=None, dest='broadcastRpcAddress')
parser.add_argument('--api-address', default=None, dest='apiAddress')
parser.add_argument('--disable-version-check', default=False, action='store_true', dest='disable_housekeeping', help="Disable version check")
return parser.parse_args()
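The flag added above uses argparse's `store_true` action with an inverted `dest`: passing `--disable-version-check` sets `disable_housekeeping` to True. A minimal standalone sketch of the same pattern (the parser object here is illustrative, but the argument definition matches the diff):

```python
import argparse

# Same store_true/dest pattern as the --disable-version-check flag above.
parser = argparse.ArgumentParser()
parser.add_argument('--disable-version-check', default=False,
                    action='store_true', dest='disable_housekeeping',
                    help="Disable version check")

print(parser.parse_args([]).disable_housekeeping)                           # False
print(parser.parse_args(['--disable-version-check']).disable_housekeeping)  # True
```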


@@ -15,6 +15,7 @@ try:
setup.io()
setup.cqlshrc()
setup.arguments()
setup.set_housekeeping()
os.system("/usr/bin/supervisord -c /etc/supervisord.conf")
except:
logging.exception('failed!')


@@ -0,0 +1,6 @@
[program:scylla-housekeeping]
command=/scylla-housekeeping-service.sh
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0


@@ -0,0 +1,8 @@
#!/bin/bash
sleep 5
/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid --repo-files '/etc/yum.repos.d/scylla*.repo' -q version --mode cr || true
while true; do
sleep 1d
/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid --repo-files '/etc/yum.repos.d/scylla*.repo' -q version --mode cd || true
done
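Both housekeeping invocations above end in `|| true`, the common supervised-service idiom: a transient failure (no network, repo unreachable) must not abort the loop or trip errexit, and the command's exit status is forced to zero. A minimal sketch of the idiom, with `fail` as a stand-in for the housekeeping call:

```shell
# '|| true' masks a non-zero exit so the script keeps running,
# even with 'set -e' in effect, as in scylla-housekeeping-service.sh.
set -e
fail() { return 1; }
fail || true
echo "still running"
```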


@@ -15,6 +15,7 @@ class ScyllaSetup:
self._smp = arguments.smp
self._memory = arguments.memory
self._overprovisioned = arguments.overprovisioned
self._housekeeping = not arguments.disable_housekeeping
self._experimental = arguments.experimental
def _run(self, *args, **kwargs):
@@ -38,6 +39,14 @@ class ScyllaSetup:
with open("%s/.cqlshrc" % home, "w") as cqlshrc:
cqlshrc.write("[connection]\nhostname = %s\n" % hostname)
def set_housekeeping(self):
with open("/etc/scylla.d/housekeeping.cfg", "w") as f:
f.write("[housekeeping]\ncheck-version: ")
if self._housekeeping:
f.write("True\n")
else:
f.write("False\n")
def arguments(self):
args = []
if self._memory is not None:
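The `set_housekeeping()` method added above renders a two-line INI fragment into `/etc/scylla.d/housekeeping.cfg`. A minimal sketch of just the rendering, with `housekeeping_cfg` as an illustrative helper name so the output can be inspected without touching the filesystem:

```python
# Render the housekeeping.cfg contents that set_housekeeping() writes:
# check-version is True unless --disable-version-check was passed.
def housekeeping_cfg(enabled: bool) -> str:
    return "[housekeeping]\ncheck-version: %s\n" % ("True" if enabled else "False")

print(housekeeping_cfg(True))
```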


@@ -102,11 +102,11 @@ fi
if [ $JOBS -gt 0 ]; then
SRPM_OPTS="$SRPM_OPTS --define='_smp_mflags -j$JOBS'"
RPM_JOBS_OPTS=(--define="_smp_mflags -j$JOBS")
fi
sudo mock --buildsrpm --root=$TARGET --resultdir=`pwd`/build/srpms --spec=build/scylla.spec --sources=build/scylla-$VERSION.tar $SRPM_OPTS
sudo mock --buildsrpm --root=$TARGET --resultdir=`pwd`/build/srpms --spec=build/scylla.spec --sources=build/scylla-$VERSION.tar $SRPM_OPTS "${RPM_JOBS_OPTS[@]}"
if [ "$TARGET" = "epel-7-x86_64" ]; then
TARGET=scylla-$TARGET
RPM_OPTS="$RPM_OPTS --configdir=dist/redhat/mock"
fi
sudo mock --rebuild --root=$TARGET --resultdir=`pwd`/build/rpms $RPM_OPTS build/srpms/scylla-$VERSION*.src.rpm
sudo mock --rebuild --root=$TARGET --resultdir=`pwd`/build/rpms $RPM_OPTS "${RPM_JOBS_OPTS[@]}" build/srpms/scylla-$VERSION*.src.rpm
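The switch from appending to the flat `$SRPM_OPTS` string to the `RPM_JOBS_OPTS` bash array matters because the define's value contains a space: `"${RPM_JOBS_OPTS[@]}"` keeps each element a single argument and expands to nothing when the array is empty. A sketch of why the array form is the safe one (variable names reused for illustration):

```shell
# An embedded space in "_smp_mflags -jN" survives "${OPTS[@]}" intact,
# whereas plain word splitting of a string would break it into two args.
JOBS=4
OPTS=()
if [ "$JOBS" -gt 0 ]; then
    OPTS=(--define="_smp_mflags -j$JOBS")
fi
# Each array element stays one argument:
printf '<%s>' "${OPTS[@]}"
```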


@@ -21,53 +21,34 @@ mdpolicy=group:primary
best=1
# repos
[base]
[scylla-centos-base]
name=BaseOS
mirrorlist=http://mirrorlist.centos.org/?release=7&arch=x86_64&repo=os
failovermethod=priority
gpgkey=file:///usr/share/distribution-gpg-keys/centos/RPM-GPG-KEY-CentOS-7
gpgkey=http://vault.centos.org/RPM-GPG-KEY-CentOS-7
gpgcheck=1
[updates]
[scylla-centos-updates]
name=updates
enabled=1
mirrorlist=http://mirrorlist.centos.org/?release=7&arch=x86_64&repo=updates
failovermethod=priority
gpgkey=file:///usr/share/distribution-gpg-keys/centos/RPM-GPG-KEY-CentOS-7
gpgkey=http://vault.centos.org/RPM-GPG-KEY-CentOS-7
gpgcheck=1
[epel]
name=epel
mirrorlist=http://mirrors.fedoraproject.org/mirrorlist?repo=epel-7&arch=x86_64
failovermethod=priority
gpgkey=file:///usr/share/distribution-gpg-keys/epel/RPM-GPG-KEY-EPEL-7
gpgcheck=1
[extras]
[scylla-centos-extras]
name=extras
mirrorlist=http://mirrorlist.centos.org/?release=7&arch=x86_64&repo=extras
failovermethod=priority
gpgkey=file:///usr/share/distribution-gpg-keys/centos/RPM-GPG-KEY-CentOS-7
gpgkey=http://vault.centos.org/RPM-GPG-KEY-CentOS-7
gpgcheck=1
[testing]
name=epel-testing
enabled=0
mirrorlist=http://mirrors.fedoraproject.org/mirrorlist?repo=testing-epel7&arch=x86_64
[scylla-epel]
name=epel
mirrorlist=http://mirrors.fedoraproject.org/mirrorlist?repo=epel-7&arch=x86_64
failovermethod=priority
[local]
name=local
baseurl=https://kojipkgs.fedoraproject.org/repos/epel7-build/latest/x86_64/
cost=2000
enabled=0
[epel-debuginfo]
name=epel-debug
mirrorlist=http://mirrors.fedoraproject.org/mirrorlist?repo=epel-debug-7&arch=x86_64
failovermethod=priority
enabled=0
gpgkey=https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-7
gpgcheck=1
[scylladb-scylla-3rdparty]
name=Copr repo for scylla-3rdparty owned by scylladb


@@ -7,14 +7,14 @@ Group: Applications/Databases
License: AGPLv3
URL: http://www.scylladb.com/
Source0: %{name}-@@VERSION@@-@@RELEASE@@.tar
Requires: scylla-server = @@VERSION@@ scylla-jmx = @@VERSION@@ scylla-tools = @@VERSION@@ scylla-kernel-conf = @@VERSION@@ scylla-libgcc72 scylla-libstdc++72
Requires: scylla-server = @@VERSION@@ scylla-jmx = @@VERSION@@ scylla-tools = @@VERSION@@ scylla-tools-core = @@VERSION@@ scylla-kernel-conf = @@VERSION@@ scylla-libgcc72 scylla-libstdc++72
Obsoletes: scylla-server < 1.1
%description
Scylla is a highly scalable, eventually consistent, distributed,
partitioned row DB.
This package installs all required packages for ScyllaDB, including
scylla-server, scylla-jmx, scylla-tools.
scylla-server, scylla-jmx, scylla-tools, scylla-tools-core.
# this is needed to prevent python compilation error on CentOS (#2235)
%if 0%{?rhel}
@@ -78,6 +78,10 @@ python3.4 ./configure.py --enable-dpdk --mode=release --static-boost --compiler=
ninja-build %{?_smp_mflags} build/release/scylla build/release/iotune
cp dist/common/systemd/scylla-server.service.in build/scylla-server.service
sed -i -e "s#@@SYSCONFDIR@@#/etc/sysconfig#g" build/scylla-server.service
cp dist/common/systemd/scylla-housekeeping-restart.service.in build/scylla-housekeeping-restart.service
sed -i -e "s#@@REPOFILES@@#'/etc/yum.repos.d/scylla*.repo'#g" build/scylla-housekeeping-restart.service
cp dist/common/systemd/scylla-housekeeping-daily.service.in build/scylla-housekeeping-daily.service
sed -i -e "s#@@REPOFILES@@#'/etc/yum.repos.d/scylla*.repo'#g" build/scylla-housekeeping-daily.service
%install
rm -rf $RPM_BUILD_ROOT
@@ -88,9 +92,6 @@ mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/security/limits.d/
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/collectd.d/
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/scylla.d/
%if 0%{?rhel}
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/
%endif
mkdir -p $RPM_BUILD_ROOT%{_sysctldir}/
mkdir -p $RPM_BUILD_ROOT%{_docdir}/scylla/
mkdir -p $RPM_BUILD_ROOT%{_unitdir}
@@ -101,9 +102,6 @@ install -m644 dist/common/limits.d/scylla.conf $RPM_BUILD_ROOT%{_sysconfdir}/sec
install -m644 dist/common/collectd.d/scylla.conf $RPM_BUILD_ROOT%{_sysconfdir}/collectd.d/
install -m644 dist/common/scylla.d/*.conf $RPM_BUILD_ROOT%{_sysconfdir}/scylla.d/
install -m644 dist/common/sysctl.d/*.conf $RPM_BUILD_ROOT%{_sysctldir}/
%if 0%{?rhel}
install -m644 dist/common/modprobe.d/*.conf $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/
%endif
install -d -m755 $RPM_BUILD_ROOT%{_sysconfdir}/scylla
install -m644 conf/scylla.yaml $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
install -m644 conf/cassandra-rackdc.properties $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
@@ -296,18 +294,9 @@ if Scylla is the main application on your server and you wish to optimize its la
# We cannot use the sysctl_apply rpm macro because it is not present in 7.0
# following is a "manual" expansion
/usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
# Write modprobe.d params when module already loaded
%if 0%{?rhel}
if [ -e /sys/module/raid0/parameters/devices_discard_performance ]; then
echo Y > /sys/module/raid0/parameters/devices_discard_performance
fi
%endif
%files kernel-conf
%defattr(-,root,root)
%if 0%{?rhel}
%config(noreplace) %{_sysconfdir}/modprobe.d/*.conf
%endif
%{_sysctldir}/*.conf
%changelog


@@ -208,6 +208,12 @@ $ docker run --name some-scylla -d scylladb/scylla --experimental 1
**Since: 2.0**
### `--disable-version-check`
The `--disable-version-check` option disables the version validation check.
**Since: 2.2**
# User Feedback
## Issues


@@ -21,6 +21,7 @@
#include "flat_mutation_reader.hh"
#include "mutation_reader.hh"
#include "seastar/util/reference_wrapper.hh"
#include <algorithm>
#include <boost/range/adaptor/transformed.hpp>
@@ -43,6 +44,132 @@ void flat_mutation_reader::impl::clear_buffer_to_next_partition() {
_buffer_size = boost::accumulate(_buffer | boost::adaptors::transformed(std::mem_fn(&mutation_fragment::memory_usage)), size_t(0));
}
flat_mutation_reader flat_mutation_reader::impl::reverse_partitions(flat_mutation_reader::impl& original) {
// FIXME: #1413 Full partitions get accumulated in memory.
class partition_reversing_mutation_reader final : public flat_mutation_reader::impl {
flat_mutation_reader::impl* _source;
range_tombstone_list _range_tombstones;
std::stack<mutation_fragment> _mutation_fragments;
mutation_fragment_opt _partition_end;
private:
stop_iteration emit_partition() {
auto emit_range_tombstone = [&] {
auto it = std::prev(_range_tombstones.tombstones().end());
auto& rt = *it;
_range_tombstones.tombstones().erase(it);
auto rt_owner = alloc_strategy_unique_ptr<range_tombstone>(&rt);
push_mutation_fragment(mutation_fragment(std::move(rt)));
};
position_in_partition::less_compare cmp(*_source->_schema);
while (!_mutation_fragments.empty() && !is_buffer_full()) {
auto& mf = _mutation_fragments.top();
if (!_range_tombstones.empty() && !cmp(_range_tombstones.tombstones().rbegin()->end_position(), mf.position())) {
emit_range_tombstone();
} else {
push_mutation_fragment(std::move(mf));
_mutation_fragments.pop();
}
}
while (!_range_tombstones.empty() && !is_buffer_full()) {
emit_range_tombstone();
}
if (is_buffer_full()) {
return stop_iteration::yes;
}
push_mutation_fragment(*std::exchange(_partition_end, stdx::nullopt));
return stop_iteration::no;
}
future<stop_iteration> consume_partition_from_source() {
if (_source->is_buffer_empty()) {
if (_source->is_end_of_stream()) {
_end_of_stream = true;
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
return _source->fill_buffer().then([] { return stop_iteration::no; });
}
while (!_source->is_buffer_empty() && !is_buffer_full()) {
auto mf = _source->pop_mutation_fragment();
if (mf.is_partition_start() || mf.is_static_row()) {
push_mutation_fragment(std::move(mf));
} else if (mf.is_end_of_partition()) {
_partition_end = std::move(mf);
if (emit_partition()) {
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
} else if (mf.is_range_tombstone()) {
_range_tombstones.apply(*_source->_schema, std::move(mf.as_range_tombstone()));
} else {
_mutation_fragments.emplace(std::move(mf));
}
}
return make_ready_future<stop_iteration>(is_buffer_full());
}
public:
explicit partition_reversing_mutation_reader(flat_mutation_reader::impl& mr)
: flat_mutation_reader::impl(mr._schema)
, _source(&mr)
, _range_tombstones(*mr._schema)
{ }
virtual future<> fill_buffer() override {
return repeat([&] {
if (_partition_end) {
// We have consumed full partition from source, now it is
// time to emit it.
auto stop = emit_partition();
if (stop) {
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
}
return consume_partition_from_source();
});
}
virtual void next_partition() override {
clear_buffer_to_next_partition();
if (is_buffer_empty() && !is_end_of_stream()) {
while (!_mutation_fragments.empty()) {
_mutation_fragments.pop();
}
_range_tombstones.clear();
_partition_end = stdx::nullopt;
_source->next_partition();
}
}
virtual future<> fast_forward_to(const dht::partition_range&) override {
throw std::bad_function_call();
}
virtual future<> fast_forward_to(position_range) override {
throw std::bad_function_call();
}
};
return make_flat_mutation_reader<partition_reversing_mutation_reader>(original);
}
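The `partition_reversing_mutation_reader` above keeps `partition_start` and the static row in their original positions, stacks the clustered fragments, and replays them in reverse order when the partition ends. A simplified Python sketch of that ordering (range-tombstone handling and buffer limits omitted; fragments are plain strings for illustration):

```python
# Reverse the clustered fragments of one partition: partition_start and
# the static row stay first, clustered rows come back in descending order.
def reverse_partition(fragments):
    out, stack = [], []
    for f in fragments:
        if f in ("partition_start", "static_row"):
            out.append(f)          # emitted first, unreversed
        elif f == "partition_end":
            while stack:           # replay clustered fragments in reverse
                out.append(stack.pop())
            out.append(f)
        else:
            stack.append(f)        # defer until the partition ends
    return out

print(reverse_partition(
    ["partition_start", "static_row", "row:a", "row:b", "partition_end"]))
```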
template<typename Source>
future<bool> flat_mutation_reader::impl::fill_buffer_from(Source& source) {
if (source.is_buffer_empty()) {
if (source.is_end_of_stream()) {
return make_ready_future<bool>(true);
}
return source.fill_buffer().then([this, &source] {
return fill_buffer_from(source);
});
} else {
while (!source.is_buffer_empty() && !is_buffer_full()) {
push_mutation_fragment(source.pop_mutation_fragment());
}
return make_ready_future<bool>(source.is_end_of_stream() && source.is_buffer_empty());
}
}
template future<bool> flat_mutation_reader::impl::fill_buffer_from<streamed_mutation>(streamed_mutation&);
template future<bool> flat_mutation_reader::impl::fill_buffer_from<flat_mutation_reader>(flat_mutation_reader&);
flat_mutation_reader flat_mutation_reader_from_mutation_reader(schema_ptr s, mutation_reader&& legacy_reader, streamed_mutation::forwarding fwd) {
class converting_reader final : public flat_mutation_reader::impl {
mutation_reader _legacy_reader;
@@ -77,21 +204,11 @@ flat_mutation_reader flat_mutation_reader_from_mutation_reader(schema_ptr s, mut
if (!_sm) {
return get_next_sm();
} else {
if (_sm->is_buffer_empty()) {
if (_sm->is_end_of_stream()) {
on_sm_finished();
return make_ready_future<>();
}
return _sm->fill_buffer();
} else {
while (!_sm->is_buffer_empty() && !is_buffer_full()) {
this->push_mutation_fragment(_sm->pop_mutation_fragment());
}
if (_sm->is_end_of_stream() && _sm->is_buffer_empty()) {
return fill_buffer_from(*_sm).then([this] (bool sm_finished) {
if (sm_finished) {
on_sm_finished();
}
return make_ready_future<>();
}
});
}
});
}
@@ -119,13 +236,45 @@ flat_mutation_reader flat_mutation_reader_from_mutation_reader(schema_ptr s, mut
if (_sm) {
return _sm->fast_forward_to(std::move(cr));
} else {
throw std::runtime_error("fast forward needs _sm to be set");
_end_of_stream = true;
return make_ready_future<>();
}
};
};
return make_flat_mutation_reader<converting_reader>(std::move(s), std::move(legacy_reader), fwd);
}
flat_mutation_reader make_delegating_reader(flat_mutation_reader& r) {
class reader : public flat_mutation_reader::impl {
reference_wrapper<flat_mutation_reader> _underlying;
public:
reader(flat_mutation_reader& r) : impl(r.schema()), _underlying(ref(r)) { }
virtual future<> fill_buffer() override {
return fill_buffer_from(_underlying.get()).then([this] (bool underlying_finished) {
_end_of_stream = underlying_finished;
});
}
virtual future<> fast_forward_to(position_range pr) override {
_end_of_stream = false;
forward_buffer_to(pr.start());
return _underlying.get().fast_forward_to(std::move(pr));
}
virtual void next_partition() override {
clear_buffer_to_next_partition();
if (is_buffer_empty()) {
_underlying.get().next_partition();
}
_end_of_stream = _underlying.get().is_end_of_stream() && _underlying.get().is_buffer_empty();
}
virtual future<> fast_forward_to(const dht::partition_range& pr) override {
_end_of_stream = false;
clear_buffer();
return _underlying.get().fast_forward_to(pr);
}
};
return make_flat_mutation_reader<reader>(r);
}
flat_mutation_reader make_forwardable(flat_mutation_reader m) {
class reader : public flat_mutation_reader::impl {
flat_mutation_reader _underlying;
@@ -190,14 +339,78 @@ flat_mutation_reader make_forwardable(flat_mutation_reader m) {
};
}
virtual future<> fast_forward_to(const dht::partition_range& pr) override {
_end_of_stream = false;
clear_buffer();
_next = {};
_current = {
position_in_partition(position_in_partition::partition_start_tag_t()),
position_in_partition(position_in_partition::after_static_row_tag_t())
};
return _underlying.fast_forward_to(pr);
}
};
return make_flat_mutation_reader<reader>(std::move(m));
}
flat_mutation_reader make_nonforwardable(flat_mutation_reader r, bool single_partition) {
class reader : public flat_mutation_reader::impl {
flat_mutation_reader _underlying;
bool _single_partition;
bool _static_row_done = false;
bool is_end_end_of_underlying_stream() const {
return _underlying.is_buffer_empty() && _underlying.is_end_of_stream();
}
future<> on_end_of_underlying_stream() {
if (!_static_row_done) {
_static_row_done = true;
return _underlying.fast_forward_to(position_range::all_clustered_rows());
}
push_mutation_fragment(partition_end());
if (_single_partition) {
_end_of_stream = true;
return make_ready_future<>();
}
_underlying.next_partition();
_static_row_done = false;
return _underlying.fill_buffer().then([this] {
_end_of_stream = is_end_end_of_underlying_stream();
});
}
public:
reader(flat_mutation_reader r, bool single_partition)
: impl(r.schema())
, _underlying(std::move(r))
, _single_partition(single_partition)
{ }
virtual future<> fill_buffer() override {
return do_until([this] { return is_end_of_stream() || is_buffer_full(); }, [this] {
return fill_buffer_from(_underlying).then([this] (bool underlying_finished) {
if (underlying_finished) {
return on_end_of_underlying_stream();
}
return make_ready_future<>();
});
});
}
virtual future<> fast_forward_to(position_range pr) override {
throw std::bad_function_call();
}
virtual void next_partition() override {
clear_buffer_to_next_partition();
if (is_buffer_empty()) {
_underlying.next_partition();
}
_end_of_stream = is_end_end_of_underlying_stream();
}
virtual future<> fast_forward_to(const dht::partition_range& pr) override {
_end_of_stream = false;
clear_buffer();
return _underlying.fast_forward_to(pr);
}
};
return make_flat_mutation_reader<reader>(std::move(r), single_partition);
}
class empty_flat_reader final : public flat_mutation_reader::impl {
public:
empty_flat_reader(schema_ptr s) : impl(std::move(s)) { _end_of_stream = true; }
@@ -373,12 +586,12 @@ private:
public:
flat_multi_range_mutation_reader(schema_ptr s, mutation_source source, const ranges_vector& ranges,
const query::partition_slice& slice, const io_priority_class& pc,
tracing::trace_state_ptr trace_state, streamed_mutation::forwarding fwd,
tracing::trace_state_ptr trace_state,
mutation_reader::forwarding fwd_mr)
: impl(s)
, _ranges(ranges)
, _current_range(_ranges.begin())
, _reader(source.make_flat_mutation_reader(s, *_current_range, slice, pc, trace_state, fwd,
, _reader(source.make_flat_mutation_reader(s, *_current_range, slice, pc, trace_state, streamed_mutation::forwarding::no,
_ranges.size() > 1 ? mutation_reader::forwarding::yes : fwd_mr))
{
}
@@ -412,20 +625,23 @@ public:
}
virtual future<> fast_forward_to(position_range pr) override {
return _reader.fast_forward_to(std::move(pr));
throw std::bad_function_call();
}
virtual void next_partition() override {
return _reader.next_partition();
clear_buffer_to_next_partition();
if (is_buffer_empty() && !is_end_of_stream()) {
_reader.next_partition();
}
}
};
flat_mutation_reader
make_flat_multi_range_reader(schema_ptr s, mutation_source source, const dht::partition_range_vector& ranges,
const query::partition_slice& slice, const io_priority_class& pc,
tracing::trace_state_ptr trace_state, streamed_mutation::forwarding fwd,
tracing::trace_state_ptr trace_state,
mutation_reader::forwarding fwd_mr)
{
return make_flat_mutation_reader<flat_multi_range_mutation_reader>(std::move(s), std::move(source), ranges,
slice, pc, std::move(trace_state), fwd, fwd_mr);
slice, pc, std::move(trace_state), fwd_mr);
}
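The refactoring in this file funnels several hand-rolled buffer-draining loops through the one `fill_buffer_from()` helper. A synchronous Python sketch of the contract it implements, under the simplifying assumption that the source's buffer is already filled (the real version also awaits the source's `fill_buffer()` when its buffer is empty):

```python
# Move fragments from a source buffer into ours until we hit capacity,
# and report whether the source is now completely drained.
def fill_buffer_from(dest, source, dest_cap):
    while source and len(dest) < dest_cap:
        dest.append(source.pop(0))
    return not source  # True means the source finished

buf, src = [], ["a", "b", "c"]
done = fill_buffer_from(buf, src, dest_cap=2)
print(buf, done)
```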


@@ -52,6 +52,11 @@ GCC6_CONCEPT(
obj.consume_end_of_partition();
};
}
template<typename T>
concept bool PartitionFilter = requires(T filter, const dht::decorated_key& dk) {
{ filter(dk) } -> bool;
};
)
/*
@@ -68,18 +73,31 @@ GCC6_CONCEPT(
*/
class flat_mutation_reader final {
public:
// Causes a stream of reversed mutations to be emitted.
// 1. Static row is still emitted first.
// 2. Range tombstones are ordered by their end position.
// 3. Clustered rows and range tombstones are emitted in descending order.
// Because of 2 and 3 the guarantee that a range tombstone is emitted before
// any mutation fragment affected by it still holds.
// Ordering of partitions themselves remains unchanged.
using consume_reversed_partitions = seastar::bool_class<class consume_reversed_partitions_tag>;
class impl {
private:
circular_buffer<mutation_fragment> _buffer;
size_t _buffer_size = 0;
bool _consume_done = false;
protected:
static constexpr size_t max_buffer_size_in_bytes = 8 * 1024;
size_t max_buffer_size_in_bytes = 8 * 1024;
bool _end_of_stream = false;
schema_ptr _schema;
friend class flat_mutation_reader;
template <typename Source>
friend future<bool> fill_buffer_from(flat_mutation_reader::impl&, Source&);
protected:
template<typename... Args>
void push_mutation_fragment(Args&&... args) {
seastar::memory::on_alloc_point(); // for exception safety tests
_buffer.emplace_back(std::forward<Args>(args)...);
_buffer_size += _buffer.back().memory_usage();
}
@@ -89,6 +107,16 @@ public:
}
void forward_buffer_to(const position_in_partition& pos);
void clear_buffer_to_next_partition();
template<typename Source>
future<bool> fill_buffer_from(Source&);
// When succeeds, makes sure that the next push_mutation_fragment() will not fail.
void reserve_one() {
if (_buffer.capacity() == _buffer.size()) {
_buffer.reserve(_buffer.size() * 2 + 1);
}
}
private:
static flat_mutation_reader reverse_partitions(flat_mutation_reader::impl&);
public:
impl(schema_ptr s) : _schema(std::move(s)) { }
virtual ~impl() {}
@@ -135,6 +163,79 @@ public:
});
}
template<typename Consumer, typename Filter>
GCC6_CONCEPT(
requires FlatMutationReaderConsumer<Consumer>() && PartitionFilter<Filter>
)
// A variant of consume_pausable() that expects to be run in
// a seastar::thread.
// Partitions for which filter(decorated_key) returns false are skipped
// entirely and never reach the consumer.
void consume_pausable_in_thread(Consumer consumer, Filter filter) {
while (true) {
if (is_buffer_empty()) {
if (is_end_of_stream()) {
return;
}
fill_buffer().get();
continue;
}
auto mf = pop_mutation_fragment();
if (mf.is_partition_start() && !filter(mf.as_partition_start().key())) {
next_partition();
continue;
}
if (consumer(std::move(mf)) == stop_iteration::yes) {
return;
}
}
};
private:
template<typename Consumer>
struct consumer_adapter {
flat_mutation_reader::impl& _reader;
stdx::optional<dht::decorated_key> _decorated_key;
Consumer _consumer;
consumer_adapter(flat_mutation_reader::impl& reader, Consumer c)
: _reader(reader)
, _consumer(std::move(c))
{ }
stop_iteration operator()(mutation_fragment&& mf) {
return std::move(mf).consume(*this);
}
stop_iteration consume(static_row&& sr) {
return handle_result(_consumer.consume(std::move(sr)));
}
stop_iteration consume(clustering_row&& cr) {
return handle_result(_consumer.consume(std::move(cr)));
}
stop_iteration consume(range_tombstone&& rt) {
return handle_result(_consumer.consume(std::move(rt)));
}
stop_iteration consume(partition_start&& ps) {
_decorated_key.emplace(std::move(ps.key()));
_consumer.consume_new_partition(*_decorated_key);
if (ps.partition_tombstone()) {
_consumer.consume(ps.partition_tombstone());
}
return stop_iteration::no;
}
stop_iteration consume(partition_end&& pe) {
return _consumer.consume_end_of_partition();
}
private:
stop_iteration handle_result(stop_iteration si) {
if (si) {
if (_consumer.consume_end_of_partition()) {
return stop_iteration::yes;
}
_reader.next_partition();
}
return stop_iteration::no;
}
};
public:
template<typename Consumer>
GCC6_CONCEPT(
requires FlattenedConsumer<Consumer>()
@@ -144,64 +245,46 @@ public:
// When consumer returns stop_iteration::yes from methods other than consume_end_of_partition then the read
// of the current partition is ended, consume_end_of_partition is called and if it returns stop_iteration::no
// then the read moves to the next partition.
// Reference to the decorated key that is passed to consume_new_partition() remains valid until after
// the call to consume_end_of_partition().
//
// This method is useful because most current consumers use this semantic.
//
//
// This method returns whatever is returned from Consumer::consume_end_of_stream().
auto consume(Consumer consumer) {
struct consumer_adapter {
flat_mutation_reader::impl& _reader;
Consumer _consumer;
consumer_adapter(flat_mutation_reader::impl& reader, Consumer c)
: _reader(reader)
, _consumer(std::move(c))
{ }
stop_iteration operator()(mutation_fragment&& mf) {
return std::move(mf).consume(*this);
}
stop_iteration consume(static_row&& sr) {
return handle_result(_consumer.consume(std::move(sr)));
}
stop_iteration consume(clustering_row&& cr) {
return handle_result(_consumer.consume(std::move(cr)));
}
stop_iteration consume(range_tombstone&& rt) {
return handle_result(_consumer.consume(std::move(rt)));
}
stop_iteration consume(partition_start&& ps) {
_consumer.consume_new_partition(ps.key());
if (ps.partition_tombstone()) {
_consumer.consume(ps.partition_tombstone());
}
return stop_iteration::no;
}
stop_iteration consume(partition_end&& pe) {
return _consumer.consume_end_of_partition();
}
private:
stop_iteration handle_result(stop_iteration si) {
if (si) {
if (_consumer.consume_end_of_partition()) {
return stop_iteration::yes;
}
_reader.next_partition();
}
return stop_iteration::no;
}
};
return do_with(consumer_adapter(*this, std::move(consumer)), [this] (consumer_adapter& adapter) {
return do_with(consumer_adapter<Consumer>(*this, std::move(consumer)), [this] (consumer_adapter<Consumer>& adapter) {
return consume_pausable(std::ref(adapter)).then([this, &adapter] {
return adapter._consumer.consume_end_of_stream();
});
});
}
template<typename Consumer, typename Filter>
GCC6_CONCEPT(
requires FlattenedConsumer<Consumer>() && PartitionFilter<Filter>
)
// A variant of consume() that expects to be run in a seastar::thread.
// Partitions for which filter(decorated_key) returns false are skipped
// entirely and never reach the consumer.
auto consume_in_thread(Consumer consumer, Filter filter) {
auto adapter = consumer_adapter<Consumer>(*this, std::move(consumer));
consume_pausable_in_thread(std::ref(adapter), std::move(filter));
return adapter._consumer.consume_end_of_stream();
};
/*
* fast_forward_to is forbidden on flat_mutation_reader created for a single partition.
*/
virtual future<> fast_forward_to(const dht::partition_range&) = 0;
virtual future<> fast_forward_to(position_range) = 0;
};
private:
std::unique_ptr<impl> _impl;
flat_mutation_reader() = default;
explicit operator bool() const noexcept { return bool(_impl); }
friend class optimized_optional<flat_mutation_reader>;
public:
// Documented in mutation_reader::forwarding in mutation_reader.hh.
class partition_range_forwarding_tag;
@@ -225,10 +308,31 @@ public:
GCC6_CONCEPT(
requires FlattenedConsumer<Consumer>()
)
auto consume(Consumer consumer) {
auto consume(Consumer consumer, consume_reversed_partitions reversed = consume_reversed_partitions::no) {
if (reversed) {
return do_with(impl::reverse_partitions(*_impl), [&] (auto& reversed_partition_stream) {
return reversed_partition_stream._impl->consume(std::move(consumer));
});
}
return _impl->consume(std::move(consumer));
}
template<typename Consumer, typename Filter>
GCC6_CONCEPT(
requires FlattenedConsumer<Consumer>() && PartitionFilter<Filter>
)
auto consume_in_thread(Consumer consumer, Filter filter) {
return _impl->consume_in_thread(std::move(consumer), std::move(filter));
}
template<typename Consumer>
GCC6_CONCEPT(
requires FlattenedConsumer<Consumer>()
)
auto consume_in_thread(Consumer consumer) {
return consume_in_thread(std::move(consumer), [] (const dht::decorated_key&) { return true; });
}
void next_partition() { _impl->next_partition(); }
future<> fill_buffer() { return _impl->fill_buffer(); }
@@ -271,8 +375,17 @@ public:
bool is_buffer_full() const { return _impl->is_buffer_full(); }
mutation_fragment pop_mutation_fragment() { return _impl->pop_mutation_fragment(); }
const schema_ptr& schema() const { return _impl->_schema; }
void set_max_buffer_size(size_t size) {
_impl->max_buffer_size_in_bytes = size;
}
};
template<>
struct move_constructor_disengages<flat_mutation_reader> {
enum { value = true };
};
using flat_mutation_reader_opt = optimized_optional<flat_mutation_reader>;
template<typename Impl, typename... Args>
flat_mutation_reader make_flat_mutation_reader(Args &&... args) {
return flat_mutation_reader(std::make_unique<Impl>(std::forward<Args>(args)...));
@@ -280,10 +393,89 @@ flat_mutation_reader make_flat_mutation_reader(Args &&... args) {
class mutation_reader;
// Consumes mutation fragments until StopCondition is true.
// The consumer will stop iff StopCondition returns true, in particular
// reaching the end of stream alone won't stop the reader.
template<typename StopCondition, typename ConsumeMutationFragment, typename ConsumeEndOfStream>
GCC6_CONCEPT(requires requires(StopCondition stop, ConsumeMutationFragment consume_mf, ConsumeEndOfStream consume_eos, mutation_fragment mf) {
{ stop() } -> bool;
{ consume_mf(std::move(mf)) } -> void;
{ consume_eos() } -> future<>;
})
future<> consume_mutation_fragments_until(flat_mutation_reader& r, StopCondition&& stop,
ConsumeMutationFragment&& consume_mf, ConsumeEndOfStream&& consume_eos) {
return do_until([stop] { return stop(); }, [&r, stop, consume_mf, consume_eos] {
while (!r.is_buffer_empty()) {
consume_mf(r.pop_mutation_fragment());
if (stop()) {
return make_ready_future<>();
}
}
if (r.is_end_of_stream()) {
return consume_eos();
}
return r.fill_buffer();
});
}
// Creates a stream which is like r but with transformation applied to the elements.
template<typename T>
GCC6_CONCEPT(
requires StreamedMutationTranformer<T>()
)
flat_mutation_reader transform(flat_mutation_reader r, T t) {
class transforming_reader : public flat_mutation_reader::impl {
flat_mutation_reader _reader;
T _t;
struct consumer {
transforming_reader* _owner;
stop_iteration operator()(mutation_fragment&& mf) {
_owner->push_mutation_fragment(_owner->_t(std::move(mf)));
return stop_iteration(_owner->is_buffer_full());
}
};
public:
transforming_reader(flat_mutation_reader&& r, T&& t)
: impl(t(r.schema()))
, _reader(std::move(r))
, _t(std::move(t))
{}
virtual future<> fill_buffer() override {
if (_end_of_stream) {
return make_ready_future<>();
}
return _reader.consume_pausable(consumer{this}).then([this] {
if (_reader.is_end_of_stream() && _reader.is_buffer_empty()) {
_end_of_stream = true;
}
});
}
virtual void next_partition() override {
clear_buffer_to_next_partition();
if (is_buffer_empty()) {
_reader.next_partition();
}
}
virtual future<> fast_forward_to(const dht::partition_range& pr) override {
clear_buffer();
_end_of_stream = false;
return _reader.fast_forward_to(pr);
}
virtual future<> fast_forward_to(position_range pr) override {
throw std::bad_function_call();
}
};
return make_flat_mutation_reader<transforming_reader>(std::move(r), std::move(t));
}
flat_mutation_reader flat_mutation_reader_from_mutation_reader(schema_ptr, mutation_reader&&, streamed_mutation::forwarding);
flat_mutation_reader make_delegating_reader(flat_mutation_reader&);
flat_mutation_reader make_forwardable(flat_mutation_reader m);
flat_mutation_reader make_nonforwardable(flat_mutation_reader, bool);
flat_mutation_reader make_empty_flat_reader(schema_ptr s);
flat_mutation_reader flat_mutation_reader_from_mutations(std::vector<mutation>, streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no);
@@ -291,5 +483,27 @@ flat_mutation_reader flat_mutation_reader_from_mutations(std::vector<mutation>,
flat_mutation_reader
make_flat_multi_range_reader(schema_ptr s, mutation_source source, const dht::partition_range_vector& ranges,
const query::partition_slice& slice, const io_priority_class& pc = default_priority_class(),
tracing::trace_state_ptr trace_state = nullptr, streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
tracing::trace_state_ptr trace_state = nullptr,
flat_mutation_reader::partition_range_forwarding fwd_mr = flat_mutation_reader::partition_range_forwarding::yes);
// Calls the consumer for each element of the reader's stream until end of stream
// is reached or the consumer requests iteration to stop by returning stop_iteration::yes.
// The consumer should accept a mutation&& argument and return stop_iteration.
// The returned future<> resolves when consumption ends.
template <typename Consumer>
inline
future<> consume_partitions(flat_mutation_reader& reader, Consumer consumer) {
static_assert(std::is_same<future<stop_iteration>, futurize_t<std::result_of_t<Consumer(mutation&&)>>>::value, "bad Consumer signature");
using futurator = futurize<std::result_of_t<Consumer(mutation&&)>>;
return do_with(std::move(consumer), [&reader] (Consumer& c) -> future<> {
return repeat([&reader, &c] () {
return read_mutation_from_flat_mutation_reader(reader).then([&c] (mutation_opt&& mo) -> future<stop_iteration> {
if (!mo) {
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
return futurator::apply(c, std::move(*mo));
});
});
});
}
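The repeat-until-nullopt shape of `consume_partitions` can be sketched synchronously; here `int` stands in for `mutation` and `partition_source` for the reader, both illustrative names:

```cpp
#include <optional>
#include <queue>

// Synchronous sketch of consume_partitions: read one "mutation" at a
// time until the source is exhausted or the consumer asks to stop.
enum class stop_iteration { no, yes };

struct partition_source {
    std::queue<int> parts;
    std::optional<int> read_mutation() {
        if (parts.empty()) {
            return std::nullopt;   // no more partitions
        }
        int v = parts.front();
        parts.pop();
        return v;
    }
};

template <typename Consumer>
void consume_partitions(partition_source& src, Consumer consumer) {
    for (;;) {
        auto mo = src.read_mutation();
        if (!mo) {
            return;                // end of stream
        }
        if (consumer(std::move(*mo)) == stop_iteration::yes) {
            return;                // consumer requested stop
        }
    }
}
```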


@@ -149,9 +149,6 @@ stop_iteration streamed_mutation_freezer::consume(clustering_row&& cr) {
}
stop_iteration streamed_mutation_freezer::consume(range_tombstone&& rt) {
if (_reversed) {
rt.flip();
}
_rts.apply(_schema, std::move(rt));
return stop_iteration::no;
}


@@ -461,7 +461,8 @@ future<> gossiper::apply_state_locally(std::map<inet_address, endpoint_state> ma
int local_generation = local_ep_state_ptr.get_heart_beat_state().get_generation();
int remote_generation = remote_state.get_heart_beat_state().get_generation();
logger.trace("{} local generation {}, remote generation {}", ep, local_generation, remote_generation);
if (local_generation != 0 && remote_generation > local_generation + MAX_GENERATION_DIFFERENCE) {
// A node that was removed with nodetool removenode can have a generation of 2
if (local_generation > 2 && remote_generation > local_generation + MAX_GENERATION_DIFFERENCE) {
// assume some peer has corrupted memory and is broadcasting an unbelievable generation about another peer (or itself)
logger.warn("received an invalid gossip generation for peer {}; local generation = {}, received generation = {}",
ep, local_generation, remote_generation);
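The guard above can be isolated as a small predicate. The `MAX_GENERATION_DIFFERENCE` value and the `is_generation_believable` helper below are illustrative, not scylla's actual API:

```cpp
// Sketch of the generation sanity check: reject a remote generation
// implausibly far ahead of the local one, but only when the local
// generation is high enough to be meaningful. A node removed with
// `nodetool removenode` can legitimately sit at generation 2, so
// local generations <= 2 are always accepted.
constexpr int MAX_GENERATION_DIFFERENCE = 86400 * 365;  // illustrative value

bool is_generation_believable(int local_generation, int remote_generation) {
    return !(local_generation > 2 &&
             remote_generation > local_generation + MAX_GENERATION_DIFFERENCE);
}
```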
@@ -832,6 +833,7 @@ int gossiper::get_max_endpoint_state_version(endpoint_state state) {
// Runs inside seastar::async context
void gossiper::evict_from_membership(inet_address endpoint) {
auto permit = lock_endpoint(endpoint).get0();
_unreachable_endpoints.erase(endpoint);
container().invoke_on_all([endpoint] (auto& g) {
g.endpoint_state_map.erase(endpoint);


@@ -105,6 +105,7 @@ private:
public:
intrusive_set_external_comparator() { algo::init_header(_header.this_ptr()); }
intrusive_set_external_comparator(intrusive_set_external_comparator&& o) {
algo::init_header(_header.this_ptr());
algo::swap_tree(_header.this_ptr(), node_ptr(o._header.this_ptr()));
}
iterator begin() { return iterator(algo::begin_node(_header.this_ptr()), priv_value_traits_ptr()); }

json.hh

@@ -22,34 +22,59 @@
#pragma once
#include "core/sstring.hh"
#include "core/print.hh"
#include <json/json.h>
namespace seastar { // FIXME: not ours
namespace json {
inline sstring to_sstring(const Json::Value& value) {
#if defined(JSONCPP_VERSION_HEXA) && (JSONCPP_VERSION_HEXA >= 0x010400) // >= 1.4.0
Json::StreamWriterBuilder wbuilder;
wbuilder.settings_["indentation"] = "";
auto str = Json::writeString(wbuilder, value);
#else
Json::FastWriter writer;
// Json::FastWriter unnecessarily adds a newline at the end of the string.
// There is a method omitEndingLineFeed() which prevents that, but it seems
// to be too recent an addition, so, at least for now, a workaround is needed.
auto str = writer.write(value);
if (str.length() && str.back() == '\n') {
str.pop_back();
}
#endif
return str;
}
template<typename Map>
inline sstring to_json(const Map& map) {
Json::Value root(Json::objectValue);
for (auto&& kv : map) {
root[kv.first] = Json::Value(kv.second);
}
Json::FastWriter writer;
// Json::FastWriter unnecessarily adds a newline at the end of the string.
// There is a method omitEndingLineFeed() which prevents that, but it seems
// to be too recent an addition, so, at least for now, a workaround is needed.
auto str = writer.write(root);
if (str.length() && str.back() == '\n') {
str.pop_back();
return to_sstring(root);
}
inline Json::Value to_json_value(const sstring& raw) {
Json::Value root;
#if defined(JSONCPP_VERSION_HEXA) && (JSONCPP_VERSION_HEXA >= 0x010400) // >= 1.4.0
Json::CharReaderBuilder rbuilder;
std::unique_ptr<Json::CharReader> reader(rbuilder.newCharReader());
bool result = reader->parse(raw.begin(), raw.end(), &root, NULL);
if (!result) {
throw std::runtime_error(sprint("Failed to parse JSON: %s", raw));
}
return str;
#else
Json::Reader reader;
reader.parse(std::string{raw}, root);
#endif
return root;
}
template<typename Map>
inline Map to_map(const sstring& raw, Map&& map) {
Json::Value root;
Json::Reader reader;
reader.parse(std::string{raw}, root);
Json::Value root = to_json_value(raw);
for (auto&& member : root.getMemberNames()) {
map.emplace(member, root[member].asString());
}


@@ -100,7 +100,6 @@ future<> ec2_multi_region_snitch::gossiper_starting() {
// Note: currently gossiper "main" instance always runs on CPU0 therefore
// this function will be executed on CPU0 only.
//
ec2_snitch::gossiper_starting();
using namespace gms;
auto& g = get_local_gossiper();

main.cc

@@ -276,19 +276,25 @@ int main(int ac, char** av) {
app_template app(std::move(app_cfg));
auto cfg = make_lw_shared<db::config>();
bool help_version = false;
auto init = app.get_options_description().add_options();
// If --version is requested, print it out and exit immediately to avoid
// Seastar-specific warnings that may occur when running the app
init("version", bpo::bool_switch(), "print version number and exit");
bpo::variables_map vm;
bpo::store(bpo::command_line_parser(ac, av).options(app.get_options_description()).allow_unregistered().run(), vm);
if (vm["version"].as<bool>()) {
print("%s\n", scylla_version());
return 0;
}
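The early-exit idea above can be sketched without boost::program_options: scan argv for `--version` before any heavier option parsing or reactor startup runs, so the flag works even when later parsing would warn or fail. `wants_version` is an illustrative helper, not scylla's actual API:

```cpp
#include <cstring>

// Scan raw argv for --version before full option parsing.
bool wants_version(int argc, const char* const* argv) {
    for (int i = 1; i < argc; ++i) {
        if (std::strcmp(argv[i], "--version") == 0) {
            return true;
        }
    }
    return false;
}
```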
// TODO : default, always read?
init("options-file", bpo::value<sstring>(), "configuration file (i.e. <SCYLLA_HOME>/conf/scylla.yaml)");
cfg->add_options(init);
for (configurable& c : configurables()) {
c.append_options(init);
}
init // TODO : default, always read?
("options-file", bpo::value<sstring>(), "configuration file (i.e. <SCYLLA_HOME>/conf/scylla.yaml)")
("version", bpo::bool_switch(&help_version), "print version number and exit")
;
distributed<database> db;
seastar::sharded<service::cache_hitrate_calculator> cf_cache_hitrate_calculator;
debug::db = &db;
@@ -301,11 +307,7 @@ int main(int ac, char** av) {
directories dirs;
return app.run_deprecated(ac, av, [&] {
if (help_version) {
print("%s\n", scylla_version());
engine().exit(0);
return make_ready_future<>();
}
print("Scylla version %s starting ...\n", scylla_version());
auto&& opts = app.configuration();
@@ -499,7 +501,7 @@ int main(int ac, char** av) {
auto prio = get_or_default(ssl_opts, "priority_string", sstring());
auto clauth = is_true(get_or_default(ssl_opts, "require_client_auth", "false"));
if (cluster_name.empty()) {
cluster_name = "ScyllaDB Cluster";
cluster_name = "Test Cluster";
startlog.warn("Using default cluster name is not recommended. Using a unique cluster name will reduce the chance of adding nodes to the wrong cluster by mistake");
}
init_ms_fd_gossiper(listen_address


@@ -24,6 +24,7 @@
#include "frozen_mutation.hh"
#include "stdx.hh"
#include "partition_snapshot_reader.hh"
#include "schema_upgrader.hh"
memtable::memtable(schema_ptr schema, dirty_memory_manager& dmm, memtable_list* memtable_list)
: logalloc::region(dmm.region_group())
@@ -213,12 +214,14 @@ protected:
}
}
void advance() {
memtable_entry& e = *_i;
_last = e.key();
void advance_iterator() {
++_i;
}
void update_last(dht::decorated_key last) {
_last = std::move(last);
}
logalloc::allocating_section& read_section() {
return _memtable->_read_section;
}
@@ -309,20 +312,36 @@ public:
if (_delegate_range) {
_delegate = delegate_reader(*_delegate_range, _slice, _pc, streamed_mutation::forwarding::no, _fwd_mr);
} else {
read_section()(region(), [&] {
with_linearized_managed_bytes([&] {
auto key_and_snp = read_section()(region(), [&] {
return with_linearized_managed_bytes([&] () -> stdx::optional<std::pair<dht::decorated_key, lw_shared_ptr<partition_snapshot>>> {
memtable_entry *e = fetch_entry();
if (!e) {
_end_of_stream = true;
return { };
} else {
// FIXME: Introduce a memtable specific reader that will be returned from
// memtable_entry::read and will allow filling the buffer without the overhead of
// virtual calls, intermediate buffers and futures.
_delegate = e->read(mtbl(), schema(), _slice, streamed_mutation::forwarding::no);
advance();
auto key = e->key();
auto snp = e->snapshot(*mtbl());
advance_iterator();
return std::make_pair(std::move(key), std::move(snp));
}
});
});
if (key_and_snp) {
update_last(key_and_snp->first);
auto cr = query::clustering_key_filter_ranges::get_ranges(*schema(), _slice, key_and_snp->first.key());
auto snp_schema = key_and_snp->second->schema();
auto mpsr = make_partition_snapshot_flat_reader(snp_schema, std::move(key_and_snp->first), std::move(cr),
std::move(key_and_snp->second), region(), read_section(), mtbl(), streamed_mutation::forwarding::no);
if (snp_schema->version() != schema()->version()) {
_delegate = transform(std::move(mpsr), schema_upgrader(schema()));
} else {
_delegate = std::move(mpsr);
}
} else {
_end_of_stream = true;
}
}
}
@@ -386,11 +405,9 @@ public:
~flush_memory_accounter() {
assert(_mt._flushed_memory <= _mt.occupancy().used_space());
}
void account_component(memtable_entry& e) {
update_bytes_read(e.size_in_allocator_without_rows(_mt.allocator()));
}
void account_component(partition_snapshot& snp) {
update_bytes_read(_mt.allocator().object_memory_size_in_allocator(&*snp.version()));
uint64_t compute_size(memtable_entry& e, partition_snapshot& snp) {
return e.size_in_allocator_without_rows(_mt.allocator())
+ _mt.allocator().object_memory_size_in_allocator(&*snp.version());
}
};
@@ -428,53 +445,89 @@ public:
}
};
class flush_reader final : public mutation_reader::impl, private iterator_reader {
class flush_reader final : public flat_mutation_reader::impl, private iterator_reader {
// FIXME: Similarly to scanning_reader we have an underlying
// flat_mutation_reader for each partition. This is suboptimal.
// Partition snapshot reader should be devirtualised and called directly
// without using any intermediate buffers.
flat_mutation_reader_opt _partition_reader;
flush_memory_accounter _flushed_memory;
public:
flush_reader(schema_ptr s, lw_shared_ptr<memtable> m)
: iterator_reader(std::move(s), m, query::full_partition_range)
: impl(s)
, iterator_reader(std::move(s), m, query::full_partition_range)
, _flushed_memory(*m)
{}
flush_reader(const flush_reader&) = delete;
flush_reader(flush_reader&&) = delete;
flush_reader& operator=(flush_reader&&) = delete;
flush_reader& operator=(const flush_reader&) = delete;
virtual future<streamed_mutation_opt> operator()() override {
return read_section()(region(), [&] {
return with_linearized_managed_bytes([&] {
private:
void get_next_partition() {
uint64_t component_size = 0;
auto key_and_snp = read_section()(region(), [&] {
return with_linearized_managed_bytes([&] () -> stdx::optional<std::pair<dht::decorated_key, lw_shared_ptr<partition_snapshot>>> {
memtable_entry* e = fetch_entry();
if (!e) {
return make_ready_future<streamed_mutation_opt>(stdx::nullopt);
} else {
auto cr = query::clustering_key_filter_ranges::get_ranges(*schema(), schema()->full_slice(), e->key().key());
auto snp = e->partition().read(region(), schema());
auto mpsr = make_partition_snapshot_reader<partition_snapshot_accounter>(schema(), e->key(), std::move(cr),
snp, region(), read_section(), mtbl(), streamed_mutation::forwarding::no, _flushed_memory);
_flushed_memory.account_component(*e);
_flushed_memory.account_component(*snp);
auto ret = make_ready_future<streamed_mutation_opt>(std::move(mpsr));
advance();
return ret;
if (e) {
auto dk = e->key();
auto snp = e->snapshot(*mtbl());
component_size = _flushed_memory.compute_size(*e, *snp);
advance_iterator();
return std::make_pair(std::move(dk), std::move(snp));
}
return { };
});
});
if (key_and_snp) {
_flushed_memory.update_bytes_read(component_size);
update_last(key_and_snp->first);
auto cr = query::clustering_key_filter_ranges::get_ranges(*schema(), schema()->full_slice(), key_and_snp->first.key());
auto snp_schema = key_and_snp->second->schema();
auto mpsr = make_partition_snapshot_flat_reader<partition_snapshot_accounter>(snp_schema, std::move(key_and_snp->first), std::move(cr),
std::move(key_and_snp->second), region(), read_section(), mtbl(), streamed_mutation::forwarding::no, _flushed_memory);
if (snp_schema->version() != schema()->version()) {
_partition_reader = transform(std::move(mpsr), schema_upgrader(schema()));
} else {
_partition_reader = std::move(mpsr);
}
}
}
public:
virtual future<> fill_buffer() override {
return do_until([this] { return is_end_of_stream() || is_buffer_full(); }, [this] {
if (!_partition_reader) {
get_next_partition();
if (!_partition_reader) {
_end_of_stream = true;
return make_ready_future<>();
}
}
return _partition_reader->consume_pausable([this] (mutation_fragment mf) {
push_mutation_fragment(std::move(mf));
return stop_iteration(is_buffer_full());
}).then([this] {
if (_partition_reader->is_end_of_stream() && _partition_reader->is_buffer_empty()) {
_partition_reader = stdx::nullopt;
}
});
});
}
virtual future<> fast_forward_to(const dht::partition_range& pr) override {
return iterator_reader::fast_forward_to(pr);
virtual void next_partition() override {
clear_buffer_to_next_partition();
if (is_buffer_empty()) {
_partition_reader = stdx::nullopt;
}
}
virtual future<> fast_forward_to(const dht::partition_range&) override {
throw std::bad_function_call();
}
virtual future<> fast_forward_to(position_range) override {
throw std::bad_function_call();
}
};
mutation_reader
memtable::make_reader(schema_ptr s,
const dht::partition_range& range,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state_ptr,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) {
return mutation_reader_from_flat_mutation_reader(
make_flat_reader(std::move(s), range, slice, pc, std::move(trace_state_ptr), fwd, fwd_mr));
lw_shared_ptr<partition_snapshot> memtable_entry::snapshot(memtable& mtbl) {
return _pe.read(mtbl.region(), _schema);
}
flat_mutation_reader
@@ -487,16 +540,28 @@ memtable::make_flat_reader(schema_ptr s,
mutation_reader::forwarding fwd_mr) {
if (query::is_single_partition(range)) {
const query::ring_position& pos = range.start()->value();
return _read_section(*this, [&] {
managed_bytes::linearization_context_guard lcg;
auto i = partitions.find(pos, memtable_entry::compare(_schema));
if (i != partitions.end()) {
upgrade_entry(*i);
return i->read(shared_from_this(), s, slice, fwd);
} else {
auto snp = _read_section(*this, [&] () -> lw_shared_ptr<partition_snapshot> {
managed_bytes::linearization_context_guard lcg;
auto i = partitions.find(pos, memtable_entry::compare(_schema));
if (i != partitions.end()) {
upgrade_entry(*i);
return i->snapshot(*this);
} else {
return { };
}
});
if (!snp) {
return make_empty_flat_reader(std::move(s));
}
});
auto dk = pos.as_decorated_key();
auto cr = query::clustering_key_filter_ranges::get_ranges(*s, slice, dk.key());
auto snp_schema = snp->schema();
auto rd = make_partition_snapshot_flat_reader(snp_schema, std::move(dk), std::move(cr), std::move(snp), *this, _read_section, shared_from_this(), fwd);
if (snp_schema->version() != s->version()) {
return transform(std::move(rd), schema_upgrader(s));
} else {
return rd;
}
} else {
auto res = make_flat_mutation_reader<scanning_reader>(std::move(s), shared_from_this(), range, slice, pc, fwd_mr);
if (fwd == streamed_mutation::forwarding::yes) {
@@ -507,14 +572,14 @@ memtable::make_flat_reader(schema_ptr s,
}
}
mutation_reader
flat_mutation_reader
memtable::make_flush_reader(schema_ptr s, const io_priority_class& pc) {
if (group()) {
return make_mutation_reader<flush_reader>(std::move(s), shared_from_this());
return make_flat_mutation_reader<flush_reader>(s, shared_from_this());
} else {
auto& full_slice = s->full_slice();
return mutation_reader_from_flat_mutation_reader(make_flat_mutation_reader<scanning_reader>(std::move(s), shared_from_this(),
query::full_partition_range, full_slice, pc, mutation_reader::forwarding::no));
return make_flat_mutation_reader<scanning_reader>(std::move(s), shared_from_this(),
query::full_partition_range, full_slice, pc, mutation_reader::forwarding::no);
}
}
@@ -529,8 +594,8 @@ memtable::update(db::rp_handle&& h) {
future<>
memtable::apply(memtable& mt) {
return do_with(mt.make_reader(_schema), [this] (auto&& rd) mutable {
return consume(rd, [self = this->shared_from_this(), &rd] (mutation&& m) {
return do_with(mt.make_flat_reader(_schema), [this] (auto&& rd) mutable {
return consume_partitions(rd, [self = this->shared_from_this(), &rd] (mutation&& m) {
self->apply(m);
return stop_iteration::no;
});
@@ -602,21 +667,6 @@ bool memtable::is_flushed() const {
return bool(_underlying);
}
flat_mutation_reader
memtable_entry::read(lw_shared_ptr<memtable> mtbl,
const schema_ptr& target_schema,
const query::partition_slice& slice,
streamed_mutation::forwarding fwd) {
auto cr = query::clustering_key_filter_ranges::get_ranges(*_schema, slice, _key.key());
if (_schema->version() != target_schema->version()) {
auto mp = mutation_partition(_pe.squashed(_schema, target_schema), *target_schema, std::move(cr));
mutation m = mutation(target_schema, _key, std::move(mp));
return flat_mutation_reader_from_mutations({std::move(m)}, fwd);
}
auto snp = _pe.read(mtbl->region(), _schema);
return make_partition_snapshot_flat_reader(_schema, _key, std::move(cr), snp, *mtbl, mtbl->_read_section, mtbl, fwd);
}
void memtable::upgrade_entry(memtable_entry& e) {
if (e._schema != _schema) {
assert(!reclaiming_enabled());
@@ -632,3 +682,12 @@ void memtable::upgrade_entry(memtable_entry& e) {
void memtable::set_schema(schema_ptr new_schema) noexcept {
_schema = std::move(new_schema);
}
std::ostream& operator<<(std::ostream& out, memtable& mt) {
logalloc::reclaim_lock rl(mt);
return out << "{memtable: [" << ::join(",\n", mt.partitions) << "]}";
}
std::ostream& operator<<(std::ostream& out, const memtable_entry& mt) {
return out << "{" << mt.key() << ": " << mt.partition() << "}";
}


@@ -23,6 +23,7 @@
#include <map>
#include <memory>
#include <iosfwd>
#include "database_fwd.hh"
#include "dht/i_partitioner.hh"
#include "schema.hh"
@@ -60,7 +61,7 @@ public:
partition_entry& partition() { return _pe; }
const schema_ptr& schema() const { return _schema; }
schema_ptr& schema() { return _schema; }
flat_mutation_reader read(lw_shared_ptr<memtable> mtbl, const schema_ptr&, const query::partition_slice&, streamed_mutation::forwarding);
lw_shared_ptr<partition_snapshot> snapshot(memtable& mtbl);
size_t external_memory_usage_without_rows() const {
return _key.key().external_memory_usage();
@@ -105,6 +106,8 @@ public:
return _c(k1, k2._key);
}
};
friend std::ostream& operator<<(std::ostream&, const memtable_entry&);
};
class dirty_memory_manager;
@@ -194,19 +197,6 @@ public:
// The 'range' parameter must be live as long as the reader is being used
//
// Mutations returned by the reader will all have given schema.
mutation_reader make_reader(schema_ptr,
const dht::partition_range& range,
const query::partition_slice& slice,
const io_priority_class& pc = default_priority_class(),
tracing::trace_state_ptr trace_state_ptr = nullptr,
streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes);
mutation_reader make_reader(schema_ptr s, const dht::partition_range& range = query::full_partition_range) {
auto& full_slice = s->full_slice();
return make_reader(s, range, full_slice);
}
flat_mutation_reader make_flat_reader(schema_ptr,
const dht::partition_range& range,
const query::partition_slice& slice,
@@ -221,7 +211,7 @@ public:
return make_flat_reader(s, range, full_slice);
}
mutation_reader make_flush_reader(schema_ptr, const io_priority_class& pc);
flat_mutation_reader make_flush_reader(schema_ptr, const io_priority_class& pc);
mutation_source as_data_source();
@@ -242,4 +232,6 @@ public:
dirty_memory_manager& get_dirty_memory_manager() {
return _dirty_mgr;
}
friend std::ostream& operator<<(std::ostream&, memtable&);
};


@@ -514,7 +514,6 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
}();
auto remote_addr = ipv4_addr(get_preferred_ip(id.addr).raw_addr(), must_encrypt ? _ssl_port : _port);
auto local_addr = ipv4_addr{_listen_address.raw_addr(), 0};
rpc::client_options opts;
// send keepalive messages each minute if connection is idle, drop connection after 10 failures
@@ -526,9 +525,9 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
auto client = must_encrypt ?
::make_shared<rpc_protocol_client_wrapper>(*_rpc, std::move(opts),
remote_addr, local_addr, _credentials) :
remote_addr, ipv4_addr(), _credentials) :
::make_shared<rpc_protocol_client_wrapper>(*_rpc, std::move(opts),
remote_addr, local_addr);
remote_addr);
it = _clients[idx].emplace(id, shard_info(std::move(client))).first;
uint32_t src_cpu_id = engine().cpu_id();


@@ -212,9 +212,7 @@ mutation& mutation::operator+=(mutation&& other) {
}
mutation mutation::sliced(const query::clustering_row_ranges& ranges) const {
auto m = mutation(schema(), decorated_key(), mutation_partition(partition(), *schema(), ranges));
m.partition().row_tombstones().trim(*schema(), ranges);
return m;
return mutation(schema(), decorated_key(), partition().sliced(*schema(), ranges));
}
class mutation_rebuilder {
@@ -269,13 +267,13 @@ future<mutation> mutation_from_streamed_mutation(streamed_mutation& sm) {
});
}
future<mutation_opt> read_mutation_from_flat_mutation_reader(schema_ptr s, flat_mutation_reader& r) {
future<mutation_opt> read_mutation_from_flat_mutation_reader(flat_mutation_reader& r) {
if (r.is_buffer_empty()) {
if (r.is_end_of_stream()) {
return make_ready_future<mutation_opt>();
}
return r.fill_buffer().then([&r, s = std::move(s)] {
return read_mutation_from_flat_mutation_reader(std::move(s), r);
return r.fill_buffer().then([&r] {
return read_mutation_from_flat_mutation_reader(r);
});
}
// r.is_buffer_empty() is always false at this point
@@ -320,5 +318,12 @@ future<mutation_opt> read_mutation_from_flat_mutation_reader(schema_ptr s, flat_
return _builder->consume_end_of_stream();
}
};
return r.consume(adapter(std::move(s)));
}
return r.consume(adapter(r.schema()));
}
std::ostream& operator<<(std::ostream& os, const mutation& m) {
const ::schema& s = *m.schema();
fprint(os, "{%s.%s key %s data ", s.ks_name(), s.cf_name(), m.decorated_key());
os << m.partition() << "}";
return os;
}


@@ -194,4 +194,4 @@ future<mutation> mutation_from_streamed_mutation(streamed_mutation& sm);
class flat_mutation_reader;
// Reads a single partition from a reader. Returns empty optional if there are no more partitions to be read.
future<mutation_opt> read_mutation_from_flat_mutation_reader(schema_ptr, flat_mutation_reader&);
future<mutation_opt> read_mutation_from_flat_mutation_reader(flat_mutation_reader&);


@@ -859,6 +859,11 @@ mutation_partition::query_compacted(query::result::partition_writer& pw, const s
}
}
std::ostream&
operator<<(std::ostream& out, const atomic_cell_or_collection& c) {
return out << to_hex(c._data);
}
std::ostream&
operator<<(std::ostream& os, const std::pair<column_id, const atomic_cell_or_collection&>& c) {
return fprint(os, "{column: %s %s}", c.first, c.second);
@@ -1014,6 +1019,12 @@ bool mutation_partition::equal_continuity(const schema& s, const mutation_partit
});
}
mutation_partition mutation_partition::sliced(const schema& s, const query::clustering_row_ranges& ranges) const {
auto p = mutation_partition(*this, s, ranges);
p.row_tombstones().trim(s, ranges);
return p;
}
void
apply_reversibly(const column_definition& def, atomic_cell_or_collection& dst, atomic_cell_or_collection& src) {
// Must be run via with_linearized_managed_bytes() context, but assume it is
@@ -1997,8 +2008,11 @@ future<> data_query(
auto cfq = make_stable_flattened_mutations_consumer<compact_for_query<emit_only_live_rows::yes, query_result_builder>>(
*s, query_time, slice, row_limit, partition_limit, std::move(qrb));
auto reader = source(s, range, slice, service::get_local_sstable_query_read_priority(), std::move(trace_ptr));
return consume_flattened(std::move(reader), std::move(cfq), is_reversed);
return do_with(source.make_flat_mutation_reader(s, range, slice, service::get_local_sstable_query_read_priority(), std::move(trace_ptr),
streamed_mutation::forwarding::no, mutation_reader::forwarding::no),
[cfq = std::move(cfq), is_reversed] (flat_mutation_reader& reader) mutable {
return reader.consume(std::move(cfq), flat_mutation_reader::consume_reversed_partitions(is_reversed));
});
}
class reconcilable_result_builder {
@@ -2101,8 +2115,11 @@ static do_mutation_query(schema_ptr s,
auto cfq = make_stable_flattened_mutations_consumer<compact_for_query<emit_only_live_rows::no, reconcilable_result_builder>>(
*s, query_time, slice, row_limit, partition_limit, std::move(rrb));
auto reader = source(s, range, slice, service::get_local_sstable_query_read_priority(), std::move(trace_ptr));
return consume_flattened(std::move(reader), std::move(cfq), is_reversed);
return do_with(source.make_flat_mutation_reader(s, range, slice, service::get_local_sstable_query_read_priority(), std::move(trace_ptr),
streamed_mutation::forwarding::no, mutation_reader::forwarding::no),
[cfq = std::move(cfq), is_reversed] (flat_mutation_reader& reader) mutable {
return reader.consume(std::move(cfq), flat_mutation_reader::consume_reversed_partitions(is_reversed));
});
}
static thread_local auto mutation_query_stage = seastar::make_execution_stage("mutation_query", do_mutation_query);
@@ -2209,7 +2226,7 @@ void mutation_partition::evict() noexcept {
}
bool
mutation_partition::check_continuity(const schema& s, const position_range& r, is_continuous cont) {
mutation_partition::check_continuity(const schema& s, const position_range& r, is_continuous cont) const {
auto less = rows_entry::compare(s);
auto i = _rows.lower_bound(r.start(), less);
auto end = _rows.lower_bound(r.end(), less);
@@ -2248,12 +2265,29 @@ future<mutation_opt> counter_write_query(schema_ptr s, const mutation_source& so
const query::partition_slice& slice,
tracing::trace_state_ptr trace_ptr)
{
return do_with(dht::partition_range::make_singular(dk), [&] (auto& prange) {
auto cwqrb = counter_write_query_result_builder(*s);
auto cfq = make_stable_flattened_mutations_consumer<compact_for_query<emit_only_live_rows::yes, counter_write_query_result_builder>>(
*s, gc_clock::now(), slice, query::max_rows, query::max_rows, std::move(cwqrb));
auto reader = source(s, prange, slice,
service::get_local_sstable_query_read_priority(), std::move(trace_ptr));
return consume_flattened(std::move(reader), std::move(cfq), false);
});
struct range_and_reader {
dht::partition_range range;
flat_mutation_reader reader;
range_and_reader(range_and_reader&&) = delete;
range_and_reader(const range_and_reader&) = delete;
range_and_reader(schema_ptr s, const mutation_source& source,
const dht::decorated_key& dk,
const query::partition_slice& slice,
tracing::trace_state_ptr trace_ptr)
: range(dht::partition_range::make_singular(dk))
, reader(source.make_flat_mutation_reader(s, range, slice, service::get_local_sstable_query_read_priority(),
std::move(trace_ptr), streamed_mutation::forwarding::no,
mutation_reader::forwarding::no))
{ }
};
// do_with() doesn't support immovable objects
auto r_a_r = std::make_unique<range_and_reader>(s, source, dk, slice, std::move(trace_ptr));
auto cwqrb = counter_write_query_result_builder(*s);
auto cfq = make_stable_flattened_mutations_consumer<compact_for_query<emit_only_live_rows::yes, counter_write_query_result_builder>>(
*s, gc_clock::now(), slice, query::max_rows, query::max_rows, std::move(cwqrb));
auto f = r_a_r->reader.consume(std::move(cfq), flat_mutation_reader::consume_reversed_partitions::no);
return f.finally([r_a_r = std::move(r_a_r)] { });
}


@@ -46,6 +46,7 @@
#include "utils/with_relational_operators.hh"
class mutation_fragment;
class clustering_row;
//
// Container for cells of a row. Cells are identified by column_id.
@@ -853,8 +854,6 @@ private:
friend class mutation_partition_applier;
friend class converting_mutation_partition_applier;
bool check_continuity(const schema&, const position_range&, is_continuous);
public:
struct copy_comparators_only {};
struct incomplete_tag {};
@@ -900,6 +899,8 @@ public:
bool fully_continuous(const schema&, const position_range&);
// Returns true iff all keys from given range are marked as not continuous and range is not empty.
bool fully_discontinuous(const schema&, const position_range&);
// Returns true iff all keys from given range have continuity membership as specified by is_continuous.
bool check_continuity(const schema&, const position_range&, is_continuous) const;
// Removes all data, marking affected ranges as discontinuous.
void evict() noexcept;
// Applies mutation_fragment.
@@ -996,6 +997,10 @@ public:
// This and other must both be governed by the same schema s.
mutation_partition difference(schema_ptr s, const mutation_partition& other) const;
// Returns a subset of this mutation holding only information relevant for given clustering ranges.
// Range tombstones will be trimmed to the boundaries of the clustering ranges.
mutation_partition sliced(const schema& s, const query::clustering_row_ranges&) const;
// Returns true if there is no live data or tombstones.
bool empty() const;
public:


@@ -27,18 +27,250 @@
#include "core/future-util.hh"
#include "utils/move.hh"
#include "stdx.hh"
#include "reader_resource_tracker.hh"
#include "flat_mutation_reader.hh"
GCC6_CONCEPT(
template<typename Producer>
concept bool FragmentProducer = requires(Producer p, dht::partition_range part_range, position_range pos_range) {
// The returned fragments are expected to have the same
// position_in_partition. Iterators and references are expected
// to be valid until the next call to operator()().
{ p() } -> future<boost::iterator_range<std::vector<mutation_fragment>::iterator>>;
// These have the same semantics as their
// flat_mutation_reader counterparts.
{ p.next_partition() };
{ p.fast_forward_to(part_range) } -> future<>;
{ p.fast_forward_to(pos_range) } -> future<>;
};
)
/**
* Merge mutation-fragments produced by producer.
*
* Merge a non-decreasing stream of mutation-fragments into a strictly
* increasing stream. The merger is stateful; it is intended to be kept
* around *at least* for merging an entire partition. That is, creating
* a new instance for each batch of fragments will produce incorrect
* results.
*
* Call operator() to get the next mutation fragment. It consumes
* fragments from the producer via the producer's operator()().
* Any fast-forwarding has to be communicated to the merger object using
* fast_forward_to() and next_partition(), as appropriate.
*/
template<class Producer>
GCC6_CONCEPT(
requires FragmentProducer<Producer>
)
class mutation_fragment_merger {
using iterator = std::vector<mutation_fragment>::iterator;
const schema_ptr _schema;
Producer _producer;
range_tombstone_stream _deferred_tombstones;
iterator _it;
iterator _end;
bool _end_of_stream = false;
void apply(mutation_fragment& to, mutation_fragment&& frag) {
if (to.is_range_tombstone()) {
if (auto remainder = to.as_mutable_range_tombstone().apply(*_schema, std::move(frag).as_range_tombstone())) {
_deferred_tombstones.apply(std::move(*remainder));
}
} else {
to.apply(*_schema, std::move(frag));
}
}
future<> fetch() {
if (!empty()) {
return make_ready_future<>();
}
return _producer().then([this] (boost::iterator_range<iterator> fragments) {
_it = fragments.begin();
_end = fragments.end();
if (empty()) {
_end_of_stream = true;
}
});
}
bool empty() const {
return _it == _end;
}
const mutation_fragment& top() const {
return *_it;
}
mutation_fragment pop() {
return std::move(*_it++);
}
public:
mutation_fragment_merger(schema_ptr schema, Producer&& producer)
: _schema(std::move(schema))
, _producer(std::move(producer))
, _deferred_tombstones(*_schema) {
}
future<mutation_fragment_opt> operator()() {
if (_end_of_stream) {
return make_ready_future<mutation_fragment_opt>(_deferred_tombstones.get_next());
}
return fetch().then([this] () -> mutation_fragment_opt {
if (empty()) {
return _deferred_tombstones.get_next();
}
auto current = [&] {
if (auto rt = _deferred_tombstones.get_next(top())) {
return std::move(*rt);
}
return pop();
}();
const auto equal = position_in_partition::equal_compare(*_schema);
// Position of current is always either < or == than those
// of the batch. In the former case there is nothing further
// to do.
if (empty() || !equal(current.position(), top().position())) {
return current;
}
while (!empty()) {
apply(current, pop());
}
return current;
});
}
void next_partition() {
_deferred_tombstones.reset();
_end_of_stream = false;
_producer.next_partition();
}
future<> fast_forward_to(const dht::partition_range& pr) {
_deferred_tombstones.reset();
_end_of_stream = false;
return _producer.fast_forward_to(pr);
}
future<> fast_forward_to(position_range pr) {
_deferred_tombstones.forward_to(pr.start());
_end_of_stream = false;
return _producer.fast_forward_to(std::move(pr));
}
};
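The merging rule the class above implements — collapse all fragments that share a position into one, emit positions in strictly increasing order — can be illustrated with a toy model. This is not Scylla code; it stands in (key, value) pairs for fragments and integer addition for mutation application, purely for illustration:

```cpp
#include <cassert>
#include <utility>
#include <vector>

// Toy model of the merging rule: turn a non-decreasing stream of
// (key, value) pairs into a strictly increasing one by combining
// ("applying") all entries that share a key, here by summing values.
std::vector<std::pair<int, int>> merge_equal_keys(const std::vector<std::pair<int, int>>& in) {
    std::vector<std::pair<int, int>> out;
    for (const auto& [key, value] : in) {
        if (!out.empty() && out.back().first == key) {
            out.back().second += value; // same position: apply onto the current entry
        } else {
            out.emplace_back(key, value); // strictly greater position: emit a new entry
        }
    }
    return out;
}
```

Note that, like the real merger, this only works if the input stream is non-decreasing; splitting the stream into independently-merged batches would miss equal keys that straddle a batch boundary.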
// Merges the output of the sub-readers into a single non-decreasing
// stream of mutation-fragments.
class mutation_reader_merger {
public:
struct reader_and_fragment {
flat_mutation_reader* reader;
mutation_fragment fragment;
reader_and_fragment(flat_mutation_reader* r, mutation_fragment f)
: reader(r)
, fragment(std::move(f)) {
}
};
struct reader_and_last_fragment_kind {
flat_mutation_reader* reader = nullptr;
mutation_fragment::kind last_kind = mutation_fragment::kind::partition_end;
reader_and_last_fragment_kind() = default;
reader_and_last_fragment_kind(flat_mutation_reader* r, mutation_fragment::kind k)
: reader(r)
, last_kind(k) {
}
};
using mutation_fragment_batch = boost::iterator_range<std::vector<mutation_fragment>::iterator>;
private:
struct reader_heap_compare;
struct fragment_heap_compare;
std::unique_ptr<reader_selector> _selector;
// We need a list because we need stable addresses across additions
// and removals.
std::list<flat_mutation_reader> _all_readers;
// Readers positioned at a partition, different from the one we are
// reading from now. For these readers the attached fragment is
// always partition_start. Used to pick the next partition.
std::vector<reader_and_fragment> _reader_heap;
// Readers and their current fragments, belonging to the current
// partition.
std::vector<reader_and_fragment> _fragment_heap;
std::vector<reader_and_last_fragment_kind> _next;
// Readers that reached EOS.
std::vector<reader_and_last_fragment_kind> _halted_readers;
std::vector<mutation_fragment> _current;
// Optimisation for cases where only a single reader emits a particular
// partition. If _single_reader.reader is not null that reader is
// guaranteed to be the only one having relevant data until the partition
// end, a call to next_partition() or a call to
// fast_forward_to(dht::partition_range).
reader_and_last_fragment_kind _single_reader;
dht::decorated_key_opt _key;
const schema_ptr _schema;
streamed_mutation::forwarding _fwd_sm;
mutation_reader::forwarding _fwd_mr;
private:
const dht::token* current_position() const;
void maybe_add_readers(const dht::token* const t);
void add_readers(std::vector<flat_mutation_reader> new_readers);
future<> prepare_next();
// Collect all forwardable readers into _next, and remove them from
// their previous containers (_halted_readers and _fragment_heap).
void prepare_forwardable_readers();
public:
mutation_reader_merger(schema_ptr schema,
std::unique_ptr<reader_selector> selector,
streamed_mutation::forwarding fwd_sm,
mutation_reader::forwarding fwd_mr);
// Produces the next batch of mutation-fragments of the same
// position.
future<mutation_fragment_batch> operator()();
void next_partition();
future<> fast_forward_to(const dht::partition_range& pr);
future<> fast_forward_to(position_range pr);
};
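The heap discipline used by mutation_reader_merger — push_heap builds a max-heap, so the comparators above invert their arguments to surface the *smallest* head first, and each pop is followed by refilling from the reader just consumed — is the classic k-way merge. A self-contained sketch over plain ints (not the actual Scylla types) of that pattern:

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// Each heap entry tracks the head value of one source and which source it
// came from, mirroring reader_and_fragment.
struct entry { int value; size_t source; };

std::vector<int> k_way_merge(std::vector<std::vector<int>> sources) {
    auto cmp = [](const entry& a, const entry& b) {
        return b.value < a.value; // inverted: max-heap machinery, min-heap behavior
    };
    std::vector<entry> heap;
    std::vector<size_t> pos(sources.size(), 0);
    for (size_t i = 0; i < sources.size(); ++i) {
        if (!sources[i].empty()) {
            heap.push_back({sources[i][0], i});
            std::push_heap(heap.begin(), heap.end(), cmp);
            pos[i] = 1;
        }
    }
    std::vector<int> out;
    while (!heap.empty()) {
        std::pop_heap(heap.begin(), heap.end(), cmp);
        entry e = heap.back();
        heap.pop_back();
        out.push_back(e.value);
        // Refill from the source we just consumed, like _next readers
        // being re-polled in prepare_next().
        if (pos[e.source] < sources[e.source].size()) {
            heap.push_back({sources[e.source][pos[e.source]++], e.source});
            std::push_heap(heap.begin(), heap.end(), cmp);
        }
    }
    return out;
}
```

The real merger runs two such heaps (one keyed on partition keys, one on positions within a partition) and re-polls readers asynchronously, but the invariant is the same: the heap front is always the globally smallest unconsumed element.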
// Combines multiple mutation_readers into one.
class combined_mutation_reader : public flat_mutation_reader::impl {
mutation_fragment_merger<mutation_reader_merger> _producer;
streamed_mutation::forwarding _fwd_sm;
public:
// The specified streamed_mutation::forwarding and
// mutation_reader::forwarding tag must be the same for all included
// readers.
combined_mutation_reader(schema_ptr schema,
std::unique_ptr<reader_selector> selector,
streamed_mutation::forwarding fwd_sm,
mutation_reader::forwarding fwd_mr);
virtual future<> fill_buffer() override;
virtual void next_partition() override;
virtual future<> fast_forward_to(const dht::partition_range& pr) override;
virtual future<> fast_forward_to(position_range pr) override;
};
// Dumb selector implementation for combined_mutation_reader that simply
// forwards its list of readers.
class list_reader_selector : public reader_selector {
std::vector<mutation_reader> _readers;
std::vector<flat_mutation_reader> _readers;
public:
explicit list_reader_selector(std::vector<mutation_reader> readers)
: _readers(std::move(readers)) {
_selector_position = dht::minimum_token();
explicit list_reader_selector(schema_ptr s, std::vector<flat_mutation_reader> readers)
: reader_selector(s, dht::ring_position::min())
, _readers(std::move(readers)) {
}
list_reader_selector(const list_reader_selector&) = delete;
@@ -47,17 +279,17 @@ public:
list_reader_selector(list_reader_selector&&) = default;
list_reader_selector& operator=(list_reader_selector&&) = default;
virtual std::vector<mutation_reader> create_new_readers(const dht::token* const) override {
_selector_position = dht::maximum_token();
virtual std::vector<flat_mutation_reader> create_new_readers(const dht::token* const) override {
_selector_position = dht::ring_position::max();
return std::exchange(_readers, {});
}
virtual std::vector<mutation_reader> fast_forward_to(const dht::partition_range&) override {
virtual std::vector<flat_mutation_reader> fast_forward_to(const dht::partition_range&) override {
return {};
}
};
void combined_mutation_reader::maybe_add_readers(const dht::token* const t) {
void mutation_reader_merger::maybe_add_readers(const dht::token* const t) {
if (!_selector->has_new_readers(t)) {
return;
}
@@ -65,103 +297,330 @@ void combined_mutation_reader::maybe_add_readers(const dht::token* const t) {
add_readers(_selector->create_new_readers(t));
}
void combined_mutation_reader::add_readers(std::vector<mutation_reader> new_readers) {
void mutation_reader_merger::add_readers(std::vector<flat_mutation_reader> new_readers) {
for (auto&& new_reader : new_readers) {
_all_readers.emplace_back(std::move(new_reader));
auto* r = &_all_readers.back();
_next.emplace_back(r);
_next.emplace_back(r, mutation_fragment::kind::partition_end);
}
}
const dht::token* combined_mutation_reader::current_position() const {
if (_ptables.empty()) {
const dht::token* mutation_reader_merger::current_position() const {
if (!_key) {
return nullptr;
}
return &_ptables.front().m.decorated_key().token();
return &_key->token();
}
future<> combined_mutation_reader::prepare_next() {
maybe_add_readers(current_position());
struct mutation_reader_merger::reader_heap_compare {
const schema& s;
return parallel_for_each(_next, [this] (mutation_reader* mr) {
return (*mr)().then([this, mr] (streamed_mutation_opt next) {
if (next) {
_ptables.emplace_back(mutation_and_reader { std::move(*next), mr });
boost::range::push_heap(_ptables, &heap_compare);
explicit reader_heap_compare(const schema& s)
: s(s) {
}
bool operator()(const mutation_reader_merger::reader_and_fragment& a, const mutation_reader_merger::reader_and_fragment& b) {
// Invert comparison as this is a max-heap.
return b.fragment.as_partition_start().key().less_compare(s, a.fragment.as_partition_start().key());
}
};
struct mutation_reader_merger::fragment_heap_compare {
position_in_partition::less_compare cmp;
explicit fragment_heap_compare(const schema& s)
: cmp(s) {
}
bool operator()(const mutation_reader_merger::reader_and_fragment& a, const mutation_reader_merger::reader_and_fragment& b) {
// Invert comparison as this is a max-heap.
return cmp(b.fragment.position(), a.fragment.position());
}
};
future<> mutation_reader_merger::prepare_next() {
return parallel_for_each(_next, [this] (reader_and_last_fragment_kind rk) {
return (*rk.reader)().then([this, rk] (mutation_fragment_opt mfo) {
if (mfo) {
if (mfo->is_partition_start()) {
_reader_heap.emplace_back(rk.reader, std::move(*mfo));
boost::push_heap(_reader_heap, reader_heap_compare(*_schema));
} else {
_fragment_heap.emplace_back(rk.reader, std::move(*mfo));
boost::range::push_heap(_fragment_heap, fragment_heap_compare(*_schema));
}
} else if (_fwd_sm == streamed_mutation::forwarding::yes && rk.last_kind != mutation_fragment::kind::partition_end) {
// When in streamed_mutation::forwarding mode we need
// to keep track of readers that returned
// end-of-stream to know which readers to fast-forward. We can't
// just fast-forward all readers as we might drop fragments from
// partitions we haven't even read yet.
// Readers whose last emitted fragment was a partition
// end have no more data for the current range.
_halted_readers.push_back(rk);
} else if (_fwd_mr == mutation_reader::forwarding::no) {
_all_readers.remove_if([mr] (auto& r) { return &r == mr; });
_all_readers.remove_if([mr = rk.reader] (auto& r) { return &r == mr; });
}
});
}).then([this] {
_next.clear();
// We are either crossing a partition boundary or have run out of
// readers. If there are halted readers then we are just
// waiting for a fast-forward so there is nothing to do.
if (_fragment_heap.empty() && _halted_readers.empty()) {
if (_reader_heap.empty()) {
_key = {};
} else {
_key = _reader_heap.front().fragment.as_partition_start().key();
}
maybe_add_readers(current_position());
}
});
}
future<streamed_mutation_opt> combined_mutation_reader::next() {
if ((_current.empty() && !_next.empty()) || _selector->has_new_readers(current_position())) {
return prepare_next().then([this] { return next(); });
void mutation_reader_merger::prepare_forwardable_readers() {
_next.reserve(_halted_readers.size() + _fragment_heap.size() + _next.size());
std::move(_halted_readers.begin(), _halted_readers.end(), std::back_inserter(_next));
if (_single_reader.reader) {
_next.emplace_back(std::exchange(_single_reader.reader, {}), _single_reader.last_kind);
}
if (_ptables.empty()) {
return make_ready_future<streamed_mutation_opt>();
for (auto& df : _fragment_heap) {
_next.emplace_back(df.reader, df.fragment.mutation_fragment_kind());
}
while (!_ptables.empty()) {
boost::range::pop_heap(_ptables, &heap_compare);
auto& candidate = _ptables.back();
streamed_mutation& m = candidate.m;
_halted_readers.clear();
_fragment_heap.clear();
}
_current.emplace_back(std::move(m));
_next.emplace_back(candidate.read);
_ptables.pop_back();
mutation_reader_merger::mutation_reader_merger(schema_ptr schema,
std::unique_ptr<reader_selector> selector,
streamed_mutation::forwarding fwd_sm,
mutation_reader::forwarding fwd_mr)
: _selector(std::move(selector))
, _schema(std::move(schema))
, _fwd_sm(fwd_sm)
, _fwd_mr(fwd_mr) {
maybe_add_readers(nullptr);
}
if (_ptables.empty() || !_current.back().decorated_key().equal(*_current.back().schema(), _ptables.front().m.decorated_key())) {
// key has changed, so emit accumulated mutation
break;
future<mutation_reader_merger::mutation_fragment_batch> mutation_reader_merger::operator()() {
// Avoid merging-related logic if we know that only a single reader owns
// the current partition.
if (_single_reader.reader) {
if (_single_reader.reader->is_buffer_empty()) {
if (_single_reader.reader->is_end_of_stream()) {
_current.clear();
return make_ready_future<mutation_fragment_batch>(_current);
}
return _single_reader.reader->fill_buffer().then([this] { return operator()(); });
}
_current.clear();
_current.emplace_back(_single_reader.reader->pop_mutation_fragment());
_single_reader.last_kind = _current.back().mutation_fragment_kind();
if (_current.back().is_end_of_partition()) {
_next.emplace_back(std::exchange(_single_reader.reader, {}), mutation_fragment::kind::partition_end);
}
return make_ready_future<mutation_fragment_batch>(_current);
}
if (!_next.empty()) {
return prepare_next().then([this] { return (*this)(); });
}
_current.clear();
// If we ran out of fragments for the current partition, select the
// readers for the next one.
if (_fragment_heap.empty()) {
if (!_halted_readers.empty() || _reader_heap.empty()) {
return make_ready_future<mutation_fragment_batch>(_current);
}
auto key = [] (const std::vector<reader_and_fragment>& heap) -> const dht::decorated_key& {
return heap.front().fragment.as_partition_start().key();
};
do {
boost::range::pop_heap(_reader_heap, reader_heap_compare(*_schema));
// All fragments here are partition_start so no need to
// heap-sort them.
_fragment_heap.emplace_back(std::move(_reader_heap.back()));
_reader_heap.pop_back();
}
while (!_reader_heap.empty() && key(_fragment_heap).equal(*_schema, key(_reader_heap)));
if (_fragment_heap.size() == 1) {
_single_reader = { _fragment_heap.back().reader, mutation_fragment::kind::partition_start };
_current.emplace_back(_fragment_heap.back().fragment);
_fragment_heap.clear();
return make_ready_future<mutation_fragment_batch>(_current);
}
}
if (_current.size() == 1) {
auto m = std::move(_current.back());
_current.pop_back();
return make_ready_future<streamed_mutation_opt>(std::move(m));
const auto equal = position_in_partition::equal_compare(*_schema);
do {
boost::range::pop_heap(_fragment_heap, fragment_heap_compare(*_schema));
auto& n = _fragment_heap.back();
const auto kind = n.fragment.mutation_fragment_kind();
_current.emplace_back(std::move(n.fragment));
_next.emplace_back(n.reader, kind);
_fragment_heap.pop_back();
}
return make_ready_future<streamed_mutation_opt>(merge_mutations(std::exchange(_current, {})));
while (!_fragment_heap.empty() && equal(_current.back().position(), _fragment_heap.front().fragment.position()));
return make_ready_future<mutation_fragment_batch>(_current);
}
combined_mutation_reader::combined_mutation_reader(std::unique_ptr<reader_selector> selector, mutation_reader::forwarding fwd_mr)
: _selector(std::move(selector))
, _fwd_mr(fwd_mr)
{
void mutation_reader_merger::next_partition() {
prepare_forwardable_readers();
for (auto& rk : _next) {
rk.last_kind = mutation_fragment::kind::partition_end;
rk.reader->next_partition();
}
}
future<> combined_mutation_reader::fast_forward_to(const dht::partition_range& pr) {
_ptables.clear();
auto rs = _all_readers | boost::adaptors::transformed([] (auto& r) { return &r; });
_next.assign(rs.begin(), rs.end());
future<> mutation_reader_merger::fast_forward_to(const dht::partition_range& pr) {
_single_reader = { };
_next.clear();
_halted_readers.clear();
_fragment_heap.clear();
_reader_heap.clear();
return parallel_for_each(_next, [this, &pr] (mutation_reader* mr) {
return mr->fast_forward_to(pr);
return parallel_for_each(_all_readers, [this, &pr] (flat_mutation_reader& mr) {
_next.emplace_back(&mr, mutation_fragment::kind::partition_end);
return mr.fast_forward_to(pr);
}).then([this, &pr] {
add_readers(_selector->fast_forward_to(pr));
});
}
future<streamed_mutation_opt> combined_mutation_reader::operator()() {
return next();
future<> mutation_reader_merger::fast_forward_to(position_range pr) {
prepare_forwardable_readers();
return parallel_for_each(_next, [this, pr = std::move(pr)] (reader_and_last_fragment_kind rk) {
return rk.reader->fast_forward_to(pr);
});
}
combined_mutation_reader::combined_mutation_reader(schema_ptr schema,
std::unique_ptr<reader_selector> selector,
streamed_mutation::forwarding fwd_sm,
mutation_reader::forwarding fwd_mr)
: impl(std::move(schema))
, _producer(_schema, mutation_reader_merger(_schema, std::move(selector), fwd_sm, fwd_mr))
, _fwd_sm(fwd_sm) {
}
future<> combined_mutation_reader::fill_buffer() {
return repeat([this] {
return _producer().then([this] (mutation_fragment_opt mfo) {
if (!mfo) {
_end_of_stream = true;
return stop_iteration::yes;
}
push_mutation_fragment(std::move(*mfo));
if (is_buffer_full()) {
return stop_iteration::yes;
}
return stop_iteration::no;
});
});
}
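The fill_buffer() loop above follows a standard pattern: repeatedly pull from the producer, pushing fragments into the buffer, stopping on end-of-stream or a full buffer. A synchronous toy sketch of that loop (the real code returns seastar futures; the names here are illustrative):

```cpp
#include <cassert>
#include <functional>
#include <optional>
#include <vector>

// Synchronous sketch of the fill-buffer loop: keep pulling from the
// producer until it reports end-of-stream (nullopt) or the buffer is full.
struct toy_reader {
    std::vector<int> buffer;
    size_t max_buffer = 4;
    bool end_of_stream = false;

    void fill_buffer(std::function<std::optional<int>()> producer) {
        for (;;) {
            auto v = producer();
            if (!v) {
                end_of_stream = true; // producer exhausted: mark EOS, stop
                return;
            }
            buffer.push_back(*v);
            if (buffer.size() >= max_buffer) {
                return; // buffer full: stop without touching end_of_stream
            }
        }
    }
};
```

The distinction matters downstream: a full buffer means "call fill_buffer() again later", while end_of_stream means no further fragments will arrive until a fast-forward.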
void combined_mutation_reader::next_partition() {
if (_fwd_sm == streamed_mutation::forwarding::yes) {
clear_buffer();
_end_of_stream = false;
_producer.next_partition();
} else {
clear_buffer_to_next_partition();
// If the buffer is empty at this point then all fragments in it
// belonged to the current partition, so either:
// * All (forwardable) readers are still positioned inside
// the current partition, or
// * They are between the current one and the next one.
// Either way we need to call next_partition on them.
if (is_buffer_empty()) {
_producer.next_partition();
}
}
}
future<> combined_mutation_reader::fast_forward_to(const dht::partition_range& pr) {
clear_buffer();
_end_of_stream = false;
return _producer.fast_forward_to(pr);
}
future<> combined_mutation_reader::fast_forward_to(position_range pr) {
forward_buffer_to(pr.start());
_end_of_stream = false;
return _producer.fast_forward_to(std::move(pr));
}
mutation_reader
make_combined_reader(std::vector<mutation_reader> readers, mutation_reader::forwarding fwd_mr) {
return make_mutation_reader<combined_mutation_reader>(std::make_unique<list_reader_selector>(std::move(readers)), fwd_mr);
make_combined_reader(schema_ptr schema,
std::vector<mutation_reader> readers,
streamed_mutation::forwarding fwd_sm,
mutation_reader::forwarding fwd_mr) {
std::vector<flat_mutation_reader> flat_readers;
flat_readers.reserve(readers.size());
for (auto& reader : readers) {
flat_readers.emplace_back(flat_mutation_reader_from_mutation_reader(schema, std::move(reader), fwd_sm));
}
return mutation_reader_from_flat_mutation_reader(make_flat_mutation_reader<combined_mutation_reader>(
schema,
std::make_unique<list_reader_selector>(schema, std::move(flat_readers)),
fwd_sm,
fwd_mr));
}
mutation_reader
make_combined_reader(mutation_reader&& a, mutation_reader&& b, mutation_reader::forwarding fwd_mr) {
make_combined_reader(schema_ptr schema,
mutation_reader&& a,
mutation_reader&& b,
streamed_mutation::forwarding fwd_sm,
mutation_reader::forwarding fwd_mr) {
std::vector<mutation_reader> v;
v.reserve(2);
v.push_back(std::move(a));
v.push_back(std::move(b));
return make_combined_reader(std::move(v), fwd_mr);
return make_combined_reader(std::move(schema), std::move(v), fwd_sm, fwd_mr);
}
flat_mutation_reader make_combined_reader(schema_ptr schema,
std::unique_ptr<reader_selector> selectors,
streamed_mutation::forwarding fwd_sm,
mutation_reader::forwarding fwd_mr) {
return make_flat_mutation_reader<combined_mutation_reader>(schema,
std::move(selectors),
fwd_sm,
fwd_mr);
}
flat_mutation_reader make_combined_reader(schema_ptr schema,
std::vector<flat_mutation_reader> readers,
streamed_mutation::forwarding fwd_sm,
mutation_reader::forwarding fwd_mr) {
return make_flat_mutation_reader<combined_mutation_reader>(schema,
std::make_unique<list_reader_selector>(schema, std::move(readers)),
fwd_sm,
fwd_mr);
}
flat_mutation_reader make_combined_reader(schema_ptr schema,
flat_mutation_reader&& a,
flat_mutation_reader&& b,
streamed_mutation::forwarding fwd_sm,
mutation_reader::forwarding fwd_mr) {
std::vector<flat_mutation_reader> v;
v.reserve(2);
v.push_back(std::move(a));
v.push_back(std::move(b));
return make_combined_reader(std::move(schema), std::move(v), fwd_sm, fwd_mr);
}
class reader_returning final : public mutation_reader::impl {
@@ -255,23 +714,55 @@ mutation_reader make_empty_reader() {
return make_mutation_reader<empty_reader>();
}
const reader_concurrency_semaphore::timeout_clock::duration
reader_concurrency_semaphore::no_timeout{reader_concurrency_semaphore::timeout_clock::duration::max()};
void reader_concurrency_semaphore::signal(const resources& r) {
_resources += r;
while (!_wait_list.empty() && has_available_units(_wait_list.front().res)) {
auto& x = _wait_list.front();
_resources -= x.res;
x.pr.set_value(make_lw_shared<reader_permit>(*this, x.res));
_wait_list.pop_front();
}
}
future<lw_shared_ptr<reader_concurrency_semaphore::reader_permit>> reader_concurrency_semaphore::wait_admission(size_t memory) {
if (_wait_list.size() >= _max_queue_length) {
return make_exception_future<lw_shared_ptr<reader_permit>>(_make_queue_overloaded_exception());
}
auto r = resources(1, static_cast<ssize_t>(memory));
if (may_proceed(r)) {
_resources -= r;
return make_ready_future<lw_shared_ptr<reader_permit>>(make_lw_shared<reader_permit>(*this, r));
}
promise<lw_shared_ptr<reader_permit>> pr;
auto fut = pr.get_future();
if (_timeout == no_timeout) {
_wait_list.push_back(entry(std::move(pr), r));
} else {
_wait_list.push_back(entry(std::move(pr), r), timeout_clock::now() + _timeout);
}
return fut;
}
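The admission logic in signal()/wait_admission() above can be modeled synchronously: a reader is admitted immediately if both the reader count and the memory budget allow it; otherwise it queues, and signal() drains the wait list front-to-back as resources return. A toy sketch under those assumptions (the real class hands out futures and permits; all names here are illustrative):

```cpp
#include <cassert>
#include <deque>
#include <functional>

// Synchronous toy model of reader admission: count is the number of
// concurrent readers allowed, memory the byte budget.
struct toy_semaphore {
    int count;
    long memory;
    struct waiter { int c; long m; std::function<void()> run; };
    std::deque<waiter> wait_list;

    bool may_proceed(int c, long m) const { return count >= c && memory >= m; }

    void wait_admission(long mem, std::function<void()> run) {
        if (wait_list.empty() && may_proceed(1, mem)) {
            count -= 1;
            memory -= mem;
            run(); // admitted immediately
        } else {
            wait_list.push_back({1, mem, std::move(run)}); // queue until signal()
        }
    }

    void signal(int c, long m) {
        count += c;
        memory += m;
        // Admit queued readers in FIFO order while resources suffice.
        while (!wait_list.empty() && may_proceed(wait_list.front().c, wait_list.front().m)) {
            auto w = std::move(wait_list.front());
            wait_list.pop_front();
            count -= w.c;
            memory -= w.m;
            w.run();
        }
    }
};
```

Draining strictly from the front preserves fairness: a large request at the head blocks smaller ones behind it rather than being starved by them.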
// A file that tracks the memory usage of buffers resulting from read
// operations.
class tracking_file_impl : public file_impl {
file _tracked_file;
semaphore* _semaphore;
lw_shared_ptr<reader_concurrency_semaphore::reader_permit> _permit;
// Shouldn't be called if semaphore is NULL.
temporary_buffer<uint8_t> make_tracked_buf(temporary_buffer<uint8_t> buf) {
return seastar::temporary_buffer<uint8_t>(buf.get_write(),
buf.size(),
make_deleter(buf.release(), std::bind(&semaphore::signal, _semaphore, buf.size())));
make_deleter(buf.release(), std::bind(&reader_concurrency_semaphore::reader_permit::signal_memory, _permit, buf.size())));
}
public:
tracking_file_impl(file file, reader_resource_tracker resource_tracker)
: _tracked_file(std::move(file))
, _semaphore(resource_tracker.get_semaphore()) {
, _permit(resource_tracker.get_permit()) {
}
tracking_file_impl(const tracking_file_impl&) = delete;
@@ -333,9 +824,9 @@ public:
virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc) override {
return get_file_impl(_tracked_file)->dma_read_bulk(offset, range_size, pc).then([this] (temporary_buffer<uint8_t> buf) {
if (_semaphore) {
if (_permit) {
buf = make_tracked_buf(std::move(buf));
_semaphore->consume(buf.size());
_permit->consume_memory(buf.size());
}
return make_ready_future<temporary_buffer<uint8_t>>(std::move(buf));
});
@@ -348,7 +839,7 @@ file reader_resource_tracker::track(file f) const {
}
class restricting_mutation_reader : public mutation_reader::impl {
class restricting_mutation_reader : public flat_mutation_reader::impl {
struct mutation_source_and_params {
mutation_source _ms;
schema_ptr _s;
@@ -359,34 +850,44 @@ class restricting_mutation_reader : public mutation_reader::impl {
streamed_mutation::forwarding _fwd;
mutation_reader::forwarding _fwd_mr;
mutation_reader operator()() {
return _ms(std::move(_s), _range.get(), _slice.get(), _pc.get(), std::move(_trace_state), _fwd, _fwd_mr);
flat_mutation_reader operator()(reader_resource_tracker tracker) {
return _ms.make_flat_mutation_reader(std::move(_s), _range.get(), _slice.get(), _pc.get(), std::move(_trace_state), _fwd, _fwd_mr, tracker);
}
};
const restricted_mutation_reader_config& _config;
boost::variant<mutation_source_and_params, mutation_reader> _reader_or_mutation_source;
struct pending_state {
reader_concurrency_semaphore* semaphore;
mutation_source_and_params reader_factory;
};
struct admitted_state {
lw_shared_ptr<reader_concurrency_semaphore::reader_permit> permit;
flat_mutation_reader reader;
};
boost::variant<pending_state, admitted_state> _state;
static const std::size_t new_reader_base_cost{16 * 1024};
future<> create_reader() {
auto f = _config.timeout.count() != 0
? _config.resources_sem->wait(_config.timeout, new_reader_base_cost)
: _config.resources_sem->wait(new_reader_base_cost);
return f.then([this] {
mutation_reader reader = boost::get<mutation_source_and_params>(_reader_or_mutation_source)();
_reader_or_mutation_source = std::move(reader);
if (_config.active_reads) {
++(*_config.active_reads);
template<typename Function>
GCC6_CONCEPT(
requires std::is_move_constructible<Function>::value
&& requires(Function fn, flat_mutation_reader& reader) {
fn(reader);
}
)
decltype(auto) with_reader(Function fn) {
if (auto* state = boost::get<admitted_state>(&_state)) {
return fn(state->reader);
}
return make_ready_future<>();
return boost::get<pending_state>(_state).semaphore->wait_admission(new_reader_base_cost).then(
[this, fn = std::move(fn)] (lw_shared_ptr<reader_concurrency_semaphore::reader_permit> permit) mutable {
auto reader_factory = std::move(boost::get<pending_state>(_state).reader_factory);
_state = admitted_state{permit, reader_factory(reader_resource_tracker(permit))};
return fn(boost::get<admitted_state>(_state).reader);
});
}
public:
restricting_mutation_reader(const restricted_mutation_reader_config& config,
restricting_mutation_reader(reader_concurrency_semaphore& semaphore,
mutation_source ms,
schema_ptr s,
const dht::partition_range& range,
@@ -395,57 +896,61 @@ public:
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr)
: _config(config)
, _reader_or_mutation_source(
mutation_source_and_params{std::move(ms), std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr}) {
if (_config.resources_sem->waiters() >= _config.max_queue_length) {
_config.raise_queue_overloaded_exception();
}
: impl(s)
, _state(pending_state{&semaphore,
mutation_source_and_params{std::move(ms), std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr}}) {
}
~restricting_mutation_reader() {
if (boost::get<mutation_reader>(&_reader_or_mutation_source)) {
_config.resources_sem->signal(new_reader_base_cost);
if (_config.active_reads) {
--(*_config.active_reads);
}
}
}
future<streamed_mutation_opt> operator()() override {
// FIXME: we should defer freeing until the mutation is freed, perhaps,
// rather than just returned
if (auto* reader = boost::get<mutation_reader>(&_reader_or_mutation_source)) {
return (*reader)();
}
return create_reader().then([this] {
return boost::get<mutation_reader>(_reader_or_mutation_source)();
virtual future<> fill_buffer() override {
return with_reader([this] (flat_mutation_reader& reader) {
return reader.fill_buffer().then([this, &reader] {
_end_of_stream = reader.is_end_of_stream();
while (!reader.is_buffer_empty()) {
push_mutation_fragment(reader.pop_mutation_fragment());
}
});
});
}
virtual future<> fast_forward_to(const dht::partition_range& pr) override {
if (auto* reader = boost::get<mutation_reader>(&_reader_or_mutation_source)) {
return reader->fast_forward_to(pr);
virtual void next_partition() override {
clear_buffer_to_next_partition();
if (!is_buffer_empty()) {
return;
}
return create_reader().then([this, &pr] {
return boost::get<mutation_reader>(_reader_or_mutation_source).fast_forward_to(pr);
_end_of_stream = false;
if (auto* state = boost::get<admitted_state>(&_state)) {
return state->reader.next_partition();
}
}
virtual future<> fast_forward_to(const dht::partition_range& pr) override {
clear_buffer();
_end_of_stream = false;
return with_reader([&pr] (flat_mutation_reader& reader) {
return reader.fast_forward_to(pr);
});
}
virtual future<> fast_forward_to(position_range pr) override {
forward_buffer_to(pr.start());
_end_of_stream = false;
return with_reader([pr = std::move(pr)] (flat_mutation_reader& reader) mutable {
return reader.fast_forward_to(std::move(pr));
});
}
};
mutation_reader
make_restricted_reader(const restricted_mutation_reader_config& config,
mutation_source ms,
schema_ptr s,
const dht::partition_range& range,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) {
return make_mutation_reader<restricting_mutation_reader>(config, std::move(ms), std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr);
flat_mutation_reader
make_restricted_flat_reader(reader_concurrency_semaphore& semaphore,
mutation_source ms,
schema_ptr s,
const dht::partition_range& range,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) {
return make_flat_mutation_reader<restricting_mutation_reader>(semaphore, std::move(ms), std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr);
}
snapshot_source make_empty_snapshot_source() {
return snapshot_source([] {
return make_empty_mutation_source();
@@ -475,7 +980,7 @@ mutation_source make_combined_mutation_source(std::vector<mutation_source> adden
for (auto&& ms : addends) {
rd.emplace_back(ms(s, pr, slice, pc, tr, fwd));
}
return make_combined_reader(std::move(rd), mutation_reader::forwarding::yes);
return make_combined_reader(s, std::move(rd), fwd);
});
}
@@ -547,3 +1052,9 @@ mutation_reader mutation_reader_from_flat_mutation_reader(flat_mutation_reader&&
};
return make_mutation_reader<converting_reader>(std::move(mr));
}
future<streamed_mutation_opt> streamed_mutation_from_flat_mutation_reader(flat_mutation_reader&& r) {
return do_with(mutation_reader_from_flat_mutation_reader(std::move(r)), [] (auto&& rd) {
return rd();
});
}


@@ -30,6 +30,7 @@
#include "core/do_with.hh"
#include "tracing/trace_state.hh"
#include "flat_mutation_reader.hh"
#include "reader_concurrency_semaphore.hh"
// A mutation_reader is an object which allows iterating on mutations: invoke
// the function to get a future for the next mutation, with an unset optional
@@ -107,73 +108,48 @@ make_mutation_reader(Args&&... args) {
class reader_selector {
protected:
dht::token _selector_position;
schema_ptr _s;
dht::ring_position _selector_position;
public:
reader_selector(schema_ptr s, dht::ring_position rp) noexcept : _s(std::move(s)), _selector_position(std::move(rp)) {}
virtual ~reader_selector() = default;
// Call only if has_new_readers() returned true.
virtual std::vector<mutation_reader> create_new_readers(const dht::token* const t) = 0;
virtual std::vector<mutation_reader> fast_forward_to(const dht::partition_range& pr) = 0;
virtual std::vector<flat_mutation_reader> create_new_readers(const dht::token* const t) = 0;
virtual std::vector<flat_mutation_reader> fast_forward_to(const dht::partition_range& pr) = 0;
// Can be false-positive but never false-negative!
bool has_new_readers(const dht::token* const t) const noexcept {
return !_selector_position.is_maximum() && (!t || *t >= _selector_position);
dht::ring_position_comparator cmp(*_s);
return !_selector_position.is_max() && (!t || cmp(dht::ring_position_view(*t), _selector_position) >= 0);
}
};
// Combines multiple mutation_readers into one.
class combined_mutation_reader : public mutation_reader::impl {
std::unique_ptr<reader_selector> _selector;
std::list<mutation_reader> _all_readers;
struct mutation_and_reader {
streamed_mutation m;
mutation_reader* read;
bool operator<(const mutation_and_reader& other) const {
return read < other.read;
}
struct less_compare {
bool operator()(const mutation_and_reader& a, mutation_reader* b) const {
return a.read < b;
}
bool operator()(mutation_reader* a, const mutation_and_reader& b) const {
return a < b.read;
}
bool operator()(const mutation_and_reader& a, const mutation_and_reader& b) const {
return a < b;
}
};
};
std::vector<mutation_and_reader> _ptables;
// comparison function for std::make_heap()/std::push_heap()
static bool heap_compare(const mutation_and_reader& a, const mutation_and_reader& b) {
auto&& s = a.m.schema();
// order of comparison is inverted, because heaps produce greatest value first
return b.m.decorated_key().less_compare(*s, a.m.decorated_key());
}
std::vector<streamed_mutation> _current;
std::vector<mutation_reader*> _next;
mutation_reader::forwarding _fwd_mr;
private:
const dht::token* current_position() const;
void maybe_add_readers(const dht::token* const t);
void add_readers(std::vector<mutation_reader> new_readers);
future<> prepare_next();
// Produces next mutation or disengaged optional if there are no more.
future<streamed_mutation_opt> next();
public:
// The specified mutation_reader::forwarding tag must be the same for all included readers.
combined_mutation_reader(std::unique_ptr<reader_selector> selector, mutation_reader::forwarding fwd_mr);
virtual future<streamed_mutation_opt> operator()() override;
virtual future<> fast_forward_to(const dht::partition_range& pr) override;
};
// Creates a mutation reader which combines data returned by the supplied readers.
// Returns mutation of the same schema only when all readers return mutations
// of the same schema.
mutation_reader make_combined_reader(std::vector<mutation_reader>, mutation_reader::forwarding);
mutation_reader make_combined_reader(mutation_reader&& a, mutation_reader&& b, mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes);
mutation_reader make_combined_reader(schema_ptr schema,
std::vector<mutation_reader> readers,
streamed_mutation::forwarding fwd_sm = streamed_mutation::forwarding::no,
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes);
mutation_reader make_combined_reader(schema_ptr schema,
mutation_reader&& a,
mutation_reader&& b,
streamed_mutation::forwarding fwd_sm = streamed_mutation::forwarding::no,
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes);
flat_mutation_reader make_combined_reader(schema_ptr schema,
std::vector<flat_mutation_reader>,
streamed_mutation::forwarding fwd_sm = streamed_mutation::forwarding::no,
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes);
flat_mutation_reader make_combined_reader(schema_ptr schema,
std::unique_ptr<reader_selector>,
streamed_mutation::forwarding,
mutation_reader::forwarding);
flat_mutation_reader make_combined_reader(schema_ptr schema,
flat_mutation_reader&& a,
flat_mutation_reader&& b,
streamed_mutation::forwarding fwd_sm = streamed_mutation::forwarding::no,
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes);
// reads from the input readers, in order
mutation_reader make_reader_returning(mutation, streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no);
mutation_reader make_reader_returning(streamed_mutation);
@@ -184,45 +160,57 @@ mutation_reader make_reader_returning_many(std::vector<mutation>, const dht::par
mutation_reader make_reader_returning_many(std::vector<streamed_mutation>);
mutation_reader make_empty_reader();
/*
template<typename T>
concept bool StreamedMutationFilter() {
return requires(T t, const streamed_mutation& sm) {
{ t(sm) } -> bool;
};
}
*/
template <typename MutationFilter>
class filtering_reader : public mutation_reader::impl {
mutation_reader _rd;
MutationFilter _filter;
streamed_mutation_opt _current;
static_assert(std::is_same<bool, std::result_of_t<MutationFilter(const streamed_mutation&)>>::value, "bad MutationFilter signature");
public:
filtering_reader(mutation_reader rd, MutationFilter&& filter)
: _rd(std::move(rd)), _filter(std::forward<MutationFilter>(filter)) {
GCC6_CONCEPT(
requires requires(MutationFilter mf, const dht::decorated_key& dk) {
{ mf(dk) } -> bool;
}
virtual future<streamed_mutation_opt> operator()() override {
return repeat([this] {
return _rd().then([this] (streamed_mutation_opt&& mo) mutable {
if (!mo) {
_current = std::move(mo);
return stop_iteration::yes;
} else {
if (_filter(*mo)) {
_current = std::move(mo);
return stop_iteration::yes;
)
class filtering_reader : public flat_mutation_reader::impl {
flat_mutation_reader _rd;
MutationFilter _filter;
static_assert(std::is_same<bool, std::result_of_t<MutationFilter(const dht::decorated_key&)>>::value, "bad MutationFilter signature");
public:
filtering_reader(flat_mutation_reader rd, MutationFilter&& filter)
: impl(rd.schema())
, _rd(std::move(rd))
, _filter(std::forward<MutationFilter>(filter)) {
}
virtual future<> fill_buffer() override {
return do_until([this] { return is_buffer_full() || is_end_of_stream(); }, [this] {
return _rd.fill_buffer().then([this] {
while (!_rd.is_buffer_empty()) {
auto mf = _rd.pop_mutation_fragment();
if (mf.is_partition_start()) {
auto& dk = mf.as_partition_start().key();
if (!_filter(dk)) {
_rd.next_partition();
continue;
}
}
return stop_iteration::no;
push_mutation_fragment(std::move(mf));
}
_end_of_stream = _rd.is_end_of_stream();
});
}).then([this] {
return make_ready_future<streamed_mutation_opt>(std::move(_current));
});
};
}
virtual void next_partition() override {
clear_buffer_to_next_partition();
if (is_buffer_empty()) {
_end_of_stream = false;
_rd.next_partition();
}
}
virtual future<> fast_forward_to(const dht::partition_range& pr) override {
clear_buffer();
_end_of_stream = false;
return _rd.fast_forward_to(pr);
}
virtual future<> fast_forward_to(position_range pr) override {
forward_buffer_to(pr.start());
_end_of_stream = false;
return _rd.fast_forward_to(std::move(pr));
}
};
// Creates a mutation_reader wrapper which creates a new stream of mutations
@@ -231,8 +219,8 @@ public:
// accepts mutation const& and returns a bool. The mutation stays in the
// stream if and only if the filter returns true.
template <typename MutationFilter>
mutation_reader make_filtering_reader(mutation_reader rd, MutationFilter&& filter) {
return make_mutation_reader<filtering_reader<MutationFilter>>(std::move(rd), std::forward<MutationFilter>(filter));
flat_mutation_reader make_filtering_reader(flat_mutation_reader rd, MutationFilter&& filter) {
return make_flat_mutation_reader<filtering_reader<MutationFilter>>(std::move(rd), std::forward<MutationFilter>(filter));
}
// Calls the consumer for each element of the reader's stream until end of stream
@@ -288,7 +276,8 @@ class mutation_source {
io_priority,
tracing::trace_state_ptr,
streamed_mutation::forwarding,
mutation_reader::forwarding
mutation_reader::forwarding,
reader_resource_tracker
)>;
using flat_reader_factory_type = std::function<flat_mutation_reader(schema_ptr,
partition_range,
@@ -296,7 +285,8 @@ class mutation_source {
io_priority,
tracing::trace_state_ptr,
streamed_mutation::forwarding,
mutation_reader::forwarding)>;
mutation_reader::forwarding,
reader_resource_tracker)>;
class impl {
public:
virtual ~impl() { }
@@ -306,14 +296,16 @@ class mutation_source {
io_priority pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) = 0;
mutation_reader::forwarding fwd_mr,
reader_resource_tracker tracker) = 0;
virtual flat_mutation_reader make_flat_mutation_reader(schema_ptr s,
partition_range range,
const query::partition_slice& slice,
io_priority pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) = 0;
mutation_reader::forwarding fwd_mr,
reader_resource_tracker tracker) = 0;
};
class mutation_reader_mutation_source : public impl {
func_type _fn;
@@ -325,8 +317,9 @@ class mutation_source {
io_priority pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) override {
return _fn(std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr);
mutation_reader::forwarding fwd_mr,
reader_resource_tracker tracker) override {
return _fn(std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr, tracker);
}
virtual flat_mutation_reader make_flat_mutation_reader(schema_ptr s,
partition_range range,
@@ -334,9 +327,10 @@ class mutation_source {
io_priority pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) override {
mutation_reader::forwarding fwd_mr,
reader_resource_tracker tracker) override {
return flat_mutation_reader_from_mutation_reader(s,
_fn(s, range, slice, pc, std::move(trace_state), fwd, fwd_mr),
_fn(s, range, slice, pc, std::move(trace_state), fwd, fwd_mr, tracker),
fwd);
}
};
@@ -350,8 +344,9 @@ class mutation_source {
io_priority pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) override {
return mutation_reader_from_flat_mutation_reader(_fn(std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr));
mutation_reader::forwarding fwd_mr,
reader_resource_tracker tracker) override {
return mutation_reader_from_flat_mutation_reader(_fn(std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr, tracker));
}
virtual flat_mutation_reader make_flat_mutation_reader(schema_ptr s,
partition_range range,
@@ -359,8 +354,9 @@ class mutation_source {
io_priority pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) override {
return _fn(std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr);
mutation_reader::forwarding fwd_mr,
reader_resource_tracker tracker) override {
return _fn(std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr, tracker);
}
};
// We could have our own version of std::function<> that is nothrow
@@ -381,23 +377,78 @@ public:
: _impl(seastar::make_shared<flat_mutation_reader_mutation_source>(std::move(fn)))
, _presence_checker_factory(make_lw_shared(std::move(pcf)))
{ }
mutation_source(std::function<flat_mutation_reader(schema_ptr, partition_range, const query::partition_slice&, io_priority,
tracing::trace_state_ptr, streamed_mutation::forwarding, mutation_reader::forwarding)> fn,
std::function<partition_presence_checker()> pcf = [] { return make_default_partition_presence_checker(); })
: mutation_source([fn = std::move(fn)] (schema_ptr s,
partition_range range,
const query::partition_slice& slice,
io_priority pc,
tracing::trace_state_ptr tr,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr,
reader_resource_tracker) {
return fn(s, range, slice, pc, std::move(tr), fwd, fwd_mr);
}
, std::move(pcf)) {}
mutation_source(std::function<mutation_reader(schema_ptr, partition_range, const query::partition_slice&, io_priority,
tracing::trace_state_ptr, streamed_mutation::forwarding, mutation_reader::forwarding)> fn)
: mutation_source([fn = std::move(fn)] (schema_ptr s,
partition_range range,
const query::partition_slice& slice,
io_priority pc,
tracing::trace_state_ptr tr,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr,
reader_resource_tracker) {
return fn(s, range, slice, pc, std::move(tr), fwd, fwd_mr);
}) {}
// For sources which don't care about the mutation_reader::forwarding flag (always fast forwardable)
mutation_source(std::function<mutation_reader(schema_ptr s, partition_range range, const query::partition_slice& slice, io_priority pc, tracing::trace_state_ptr, streamed_mutation::forwarding)> fn)
: mutation_source([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice& slice, io_priority pc, tracing::trace_state_ptr tr, streamed_mutation::forwarding fwd, mutation_reader::forwarding) {
: mutation_source([fn = std::move(fn)] (schema_ptr s,
partition_range range,
const query::partition_slice& slice,
io_priority pc,
tracing::trace_state_ptr tr,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding,
reader_resource_tracker) {
return fn(s, range, slice, pc, std::move(tr), fwd);
}) {}
mutation_source(std::function<mutation_reader(schema_ptr, partition_range, const query::partition_slice&, io_priority)> fn)
: mutation_source([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice& slice, io_priority pc, tracing::trace_state_ptr, streamed_mutation::forwarding fwd, mutation_reader::forwarding) {
: mutation_source([fn = std::move(fn)] (schema_ptr s,
partition_range range,
const query::partition_slice& slice,
io_priority pc,
tracing::trace_state_ptr,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding,
reader_resource_tracker) {
assert(!fwd);
return fn(s, range, slice, pc);
}) {}
mutation_source(std::function<mutation_reader(schema_ptr, partition_range, const query::partition_slice&)> fn)
: mutation_source([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice& slice, io_priority, tracing::trace_state_ptr, streamed_mutation::forwarding fwd, mutation_reader::forwarding) {
: mutation_source([fn = std::move(fn)] (schema_ptr s,
partition_range range,
const query::partition_slice& slice,
io_priority,
tracing::trace_state_ptr,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding,
reader_resource_tracker) {
assert(!fwd);
return fn(s, range, slice);
}) {}
mutation_source(std::function<mutation_reader(schema_ptr, partition_range range)> fn)
: mutation_source([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice&, io_priority, tracing::trace_state_ptr, streamed_mutation::forwarding fwd, mutation_reader::forwarding) {
: mutation_source([fn = std::move(fn)] (schema_ptr s,
partition_range range,
const query::partition_slice&,
io_priority,
tracing::trace_state_ptr,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding,
reader_resource_tracker) {
assert(!fwd);
return fn(s, range);
}) {}
@@ -417,9 +468,10 @@ public:
io_priority pc = default_priority_class(),
tracing::trace_state_ptr trace_state = nullptr,
streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes,
reader_resource_tracker tracker = no_resource_tracking()) const
{
return _impl->make_mutation_reader(std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr);
return _impl->make_mutation_reader(std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr, tracker);
}
mutation_reader operator()(schema_ptr s, partition_range range = query::full_partition_range) const {
@@ -435,9 +487,10 @@ public:
io_priority pc = default_priority_class(),
tracing::trace_state_ptr trace_state = nullptr,
streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes,
reader_resource_tracker tracker = no_resource_tracking()) const
{
return _impl->make_flat_mutation_reader(std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr);
return _impl->make_flat_mutation_reader(std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr, tracker);
}
flat_mutation_reader
@@ -480,18 +533,6 @@ public:
mutation_source make_empty_mutation_source();
snapshot_source make_empty_snapshot_source();
struct restricted_mutation_reader_config {
semaphore* resources_sem = nullptr;
uint64_t* active_reads = nullptr;
std::chrono::nanoseconds timeout = {};
size_t max_queue_length = std::numeric_limits<size_t>::max();
std::function<void ()> raise_queue_overloaded_exception = default_raise_queue_overloaded_exception;
static void default_raise_queue_overloaded_exception() {
throw std::runtime_error("restricted mutation reader queue overload");
}
};
// Creates a restricted reader whose resource usage will be tracked
// during its lifetime. If there are not enough resources (due to
// existing readers) to create the new reader, its construction will
@@ -501,7 +542,7 @@ struct restricted_mutation_reader_config {
// a semaphore to track and limit the memory usage of readers. It also
// contains a timeout and a maximum queue size for inactive readers
// whose construction is blocked.
mutation_reader make_restricted_reader(const restricted_mutation_reader_config& config,
flat_mutation_reader make_restricted_flat_reader(reader_concurrency_semaphore& semaphore,
mutation_source ms,
schema_ptr s,
const dht::partition_range& range,
@@ -511,12 +552,12 @@ mutation_reader make_restricted_reader(const restricted_mutation_reader_config&
streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes);
inline mutation_reader make_restricted_reader(const restricted_mutation_reader_config& config,
mutation_source ms,
schema_ptr s,
const dht::partition_range& range = query::full_partition_range) {
inline flat_mutation_reader make_restricted_flat_reader(reader_concurrency_semaphore& semaphore,
mutation_source ms,
schema_ptr s,
const dht::partition_range& range = query::full_partition_range) {
auto& full_slice = s->full_slice();
return make_restricted_reader(config, std::move(ms), std::move(s), range, full_slice);
return make_restricted_flat_reader(semaphore, std::move(ms), std::move(s), range, full_slice);
}
template<>
@@ -547,87 +588,6 @@ future<stop_iteration> do_consume_streamed_mutation_flattened(streamed_mutation&
return make_ready_future<stop_iteration>(c.consume_end_of_partition());
}
template<typename Consumer>
GCC6_CONCEPT(
requires FlattenedConsumer<Consumer>()
)
auto consume_flattened(mutation_reader mr, Consumer&& c, bool reverse_mutations = false)
{
return do_with(std::move(mr), std::move(c), stdx::optional<streamed_mutation>(), [reverse_mutations] (auto& mr, auto& c, auto& sm) {
return repeat([&, reverse_mutations] {
return mr().then([&, reverse_mutations] (auto smopt) {
if (!smopt) {
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
if (!reverse_mutations) {
sm.emplace(std::move(*smopt));
} else {
sm.emplace(reverse_streamed_mutation(std::move(*smopt)));
}
c.consume_new_partition(sm->decorated_key());
if (sm->partition_tombstone()) {
c.consume(sm->partition_tombstone());
}
return do_consume_streamed_mutation_flattened(*sm, c);
});
}).then([&] {
return c.consume_end_of_stream();
});
});
}
/*
template<typename T>
concept bool StreamedMutationFilter() {
return requires(T obj, const streamed_mutation& sm) {
{ obj(sm) } -> bool;
};
}
*/
// This version of consume_flattened() must be run inside a thread and
// guarantees that all FlattenedConsumer functions will also be called in the same thread
// context.
template<typename FlattenedConsumer, typename StreamedMutationFilter>
auto consume_flattened_in_thread(mutation_reader& mr, FlattenedConsumer& c, StreamedMutationFilter&& filter)
{
while (true) {
auto smopt = mr().get0();
if (!smopt) {
break;
}
auto& sm = *smopt;
if (!filter(sm)) {
continue;
}
c.consume_new_partition(sm.decorated_key());
if (sm.partition_tombstone()) {
c.consume(sm.partition_tombstone());
}
do {
if (sm.is_buffer_empty()) {
if (sm.is_end_of_stream()) {
break;
}
sm.fill_buffer().get0();
} else {
if (sm.pop_mutation_fragment().consume_streamed_mutation(c) == stop_iteration::yes) {
break;
}
}
} while (true);
if (c.consume_end_of_partition() == stop_iteration::yes) {
break;
}
}
return c.consume_end_of_stream();
}
template<typename FlattenedConsumer>
auto consume_flattened_in_thread(mutation_reader& mr, FlattenedConsumer& c)
{
return consume_flattened_in_thread(mr, c, [] (auto&&) { return true; });
}
// Adapts a non-movable FlattenedConsumer to a movable one.
template<typename FlattenedConsumer>
class stable_flattened_mutations_consumer {
@@ -647,3 +607,5 @@ template<typename FlattenedConsumer, typename... Args>
stable_flattened_mutations_consumer<FlattenedConsumer> make_stable_flattened_mutations_consumer(Args&&... args) {
return { std::make_unique<FlattenedConsumer>(std::forward<Args>(args)...) };
}
future<streamed_mutation_opt> streamed_mutation_from_flat_mutation_reader(flat_mutation_reader&&);


@@ -320,6 +320,145 @@ class partition_snapshot_flat_reader : public flat_mutation_reader::impl, public
return _cmp(*b._position, *a._position);
}
};
// The part of the reader that accesses LSA memory directly and works
// with reclamation disabled. The state is either immutable (comparators,
// snapshot, references to region and alloc section) or dropped on any
// allocation section retry (_clustering_rows).
class lsa_partition_reader {
const schema& _schema;
rows_entry::compare _cmp;
position_in_partition::equal_compare _eq;
heap_compare _heap_cmp;
lw_shared_ptr<partition_snapshot> _snapshot;
logalloc::region& _region;
logalloc::allocating_section& _read_section;
partition_snapshot::change_mark _change_mark;
std::vector<rows_position> _clustering_rows;
private:
template<typename Function>
decltype(auto) in_alloc_section(Function&& fn) {
return _read_section.with_reclaiming_disabled(_region, [&] {
return with_linearized_managed_bytes([&] {
return fn();
});
});
}
void refresh_state(const query::clustering_range& ck_range,
const stdx::optional<position_in_partition>& last_row,
range_tombstone_stream& range_tombstones) {
_clustering_rows.clear();
if (!last_row) {
// New range. Collect all relevant range tombstones.
for (auto&& v : _snapshot->versions()) {
range_tombstones.apply(v.partition().row_tombstones(), ck_range);
}
}
for (auto&& v : _snapshot->versions()) {
auto cr_end = v.partition().upper_bound(_schema, ck_range);
auto cr = [&] () -> mutation_partition::rows_type::const_iterator {
if (last_row) {
return v.partition().clustered_rows().upper_bound(*last_row, _cmp);
} else {
return v.partition().lower_bound(_schema, ck_range);
}
}();
if (cr != cr_end) {
_clustering_rows.emplace_back(rows_position { cr, cr_end });
}
}
boost::range::make_heap(_clustering_rows, _heap_cmp);
}
// Valid if has_more_rows()
const rows_entry& pop_clustering_row() {
boost::range::pop_heap(_clustering_rows, _heap_cmp);
auto& current = _clustering_rows.back();
const rows_entry& e = *current._position;
current._position = std::next(current._position);
if (current._position == current._end) {
_clustering_rows.pop_back();
} else {
boost::range::push_heap(_clustering_rows, _heap_cmp);
}
return e;
}
// Valid if has_more_rows()
const rows_entry& peek_row() const {
return *_clustering_rows.front()._position;
}
bool has_more_rows() const {
return !_clustering_rows.empty();
}
public:
explicit lsa_partition_reader(const schema& s, lw_shared_ptr<partition_snapshot> snp,
logalloc::region& region, logalloc::allocating_section& read_section)
: _schema(s)
, _cmp(s)
, _eq(s)
, _heap_cmp(s)
, _snapshot(std::move(snp))
, _region(region)
, _read_section(read_section)
{ }
~lsa_partition_reader() {
maybe_merge_versions(_snapshot, _region, _read_section);
}
template<typename Function>
decltype(auto) with_reserve(Function&& fn) {
return _read_section.with_reserve(std::forward<Function>(fn));
}
tombstone partition_tombstone() {
logalloc::reclaim_lock guard(_region);
return _snapshot->partition_tombstone();
}
static_row get_static_row() {
return in_alloc_section([&] {
return static_row(_snapshot->static_row());
});
}
// Returns the next clustered row in the range.
// If ck_range is the same as the one used previously, last_row needs
// to be engaged and equal to the position of the row returned last time.
// If ck_range is different, or this is the first call to this
// function, last_row has to be disengaged. Additionally, when entering
// a new range, range_tombstones will be populated with all relevant
// tombstones.
mutation_fragment_opt next_row(const query::clustering_range& ck_range,
const stdx::optional<position_in_partition>& last_row,
range_tombstone_stream& range_tombstones) {
return in_alloc_section([&] () -> mutation_fragment_opt {
auto mark = _snapshot->get_change_mark();
if (!last_row || mark != _change_mark) {
refresh_state(ck_range, last_row, range_tombstones);
_change_mark = mark;
}
while (has_more_rows()) {
const rows_entry& e = pop_clustering_row();
if (e.dummy()) {
continue;
}
auto result = mutation_fragment(mutation_fragment::clustering_row_tag_t(), e);
while (has_more_rows() && _eq(peek_row().position(), result.as_clustering_row().position())) {
result.as_mutable_clustering_row().apply(_schema, pop_clustering_row());
}
return result;
}
return { };
});
}
};
private:
// Keeps a shared pointer to the container we read mutations from to make
// sure that its lifetime is appropriately extended.
@@ -328,116 +467,41 @@ private:
query::clustering_key_filter_ranges _ck_ranges;
query::clustering_row_ranges::const_iterator _current_ck_range;
query::clustering_row_ranges::const_iterator _ck_range_end;
bool _in_ck_range = false;
rows_entry::compare _cmp;
position_in_partition::equal_compare _eq;
heap_compare _heap_cmp;
lw_shared_ptr<partition_snapshot> _snapshot;
stdx::optional<position_in_partition> _last_entry;
std::vector<rows_position> _clustering_rows;
mutation_fragment_opt _next_row;
range_tombstone_stream _range_tombstones;
logalloc::region& _lsa_region;
logalloc::allocating_section& _read_section;
lsa_partition_reader _reader;
bool _no_more_rows_in_current_range = false;
MemoryAccounter& mem_accounter() {
return *this;
}
partition_snapshot::change_mark _change_mark;
private:
void refresh_iterators() {
_clustering_rows.clear();
if (!_in_ck_range) {
if (_current_ck_range == _ck_range_end) {
_end_of_stream = true;
return;
}
for (auto&& v : _snapshot->versions()) {
_range_tombstones.apply(v.partition().row_tombstones(), *_current_ck_range);
}
void push_static_row() {
auto sr = _reader.get_static_row();
if (!sr.empty()) {
emplace_mutation_fragment(mutation_fragment(std::move(sr)));
}
for (auto&& v : _snapshot->versions()) {
auto cr_end = v.partition().upper_bound(*_schema, *_current_ck_range);
auto cr = [&] () -> mutation_partition::rows_type::const_iterator {
if (_in_ck_range) {
return v.partition().clustered_rows().upper_bound(*_last_entry, _cmp);
} else {
return v.partition().lower_bound(*_schema, *_current_ck_range);
}
}();
if (cr != cr_end) {
_clustering_rows.emplace_back(rows_position { cr, cr_end });
}
}
_in_ck_range = true;
boost::range::make_heap(_clustering_rows, _heap_cmp);
}
// Valid if has_more_rows()
const rows_entry& pop_clustering_row() {
boost::range::pop_heap(_clustering_rows, _heap_cmp);
auto& current = _clustering_rows.back();
const rows_entry& e = *current._position;
current._position = std::next(current._position);
if (current._position == current._end) {
_clustering_rows.pop_back();
} else {
boost::range::push_heap(_clustering_rows, _heap_cmp);
}
return e;
}
// Valid if has_more_rows()
const rows_entry& peek_row() const {
return *_clustering_rows.front()._position;
}
bool has_more_rows() const {
return !_clustering_rows.empty();
}
mutation_fragment_opt read_static_row() {
_last_entry = position_in_partition(position_in_partition::static_row_tag_t());
mutation_fragment_opt sr;
for (auto&& v : _snapshot->versions()) {
if (!v.partition().static_row().empty()) {
if (!sr) {
sr = mutation_fragment(static_row(v.partition().static_row()));
} else {
sr->as_mutable_static_row().apply(*_schema, v.partition().static_row());
}
}
}
return sr;
}
mutation_fragment_opt read_next() {
while (has_more_rows()) {
auto mf = _range_tombstones.get_next(peek_row());
if (!_next_row && !_no_more_rows_in_current_range) {
_next_row = _reader.next_row(*_current_ck_range, _last_entry, _range_tombstones);
}
if (_next_row) {
auto pos_view = _next_row->as_clustering_row().position();
auto mf = _range_tombstones.get_next(pos_view);
if (mf) {
return mf;
}
const rows_entry& e = pop_clustering_row();
if (e.dummy()) {
continue;
}
clustering_row result = e;
while (has_more_rows() && _eq(peek_row().position(), result.position())) {
result.apply(*_schema, pop_clustering_row());
}
_last_entry = position_in_partition(result.position());
return mutation_fragment(std::move(result));
_last_entry = position_in_partition(pos_view);
return std::exchange(_next_row, {});
} else {
_no_more_rows_in_current_range = true;
return _range_tombstones.get_next(position_in_partition_view::for_range_end(*_current_ck_range));
}
return _range_tombstones.get_next();
}
void emplace_mutation_fragment(mutation_fragment&& mfopt) {
@@ -445,41 +509,25 @@ private:
push_mutation_fragment(std::move(mfopt));
}
void on_new_range() {
if (_current_ck_range == _ck_range_end) {
_end_of_stream = true;
push_mutation_fragment(partition_end());
}
_no_more_rows_in_current_range = false;
}
void do_fill_buffer() {
if (!_last_entry) {
auto mfopt = read_static_row();
if (mfopt) {
emplace_mutation_fragment(std::move(*mfopt));
}
}
auto mark = _snapshot->get_change_mark();
if (!_in_ck_range || mark != _change_mark) {
refresh_iterators();
_change_mark = mark;
}
while (!is_end_of_stream() && !is_buffer_full()) {
auto mfopt = read_next();
if (mfopt) {
emplace_mutation_fragment(std::move(*mfopt));
} else {
_in_ck_range = false;
_last_entry = stdx::nullopt;
_current_ck_range = std::next(_current_ck_range);
refresh_iterators();
on_new_range();
}
}
if (is_end_of_stream()) {
push_mutation_fragment(partition_end());
}
}
static tombstone tomb(partition_snapshot& snp) {
tombstone t;
for (auto& v : snp.versions()) {
t.apply(v.partition().partition_tombstone());
}
return t;
}
public:
template <typename... Args>
@@ -493,28 +541,22 @@ public:
, _ck_ranges(std::move(crr))
, _current_ck_range(_ck_ranges.begin())
, _ck_range_end(_ck_ranges.end())
, _cmp(*_schema)
, _eq(*_schema)
, _heap_cmp(*_schema)
, _snapshot(snp)
, _range_tombstones(*_schema)
, _lsa_region(region)
, _read_section(read_section) {
push_mutation_fragment(partition_start(std::move(dk), tomb(*snp)));
do_fill_buffer();
}
~partition_snapshot_flat_reader() {
maybe_merge_versions(_snapshot, _lsa_region, _read_section);
, _reader(*_schema, std::move(snp), region, read_section)
{
_reader.with_reserve([&] {
push_mutation_fragment(partition_start(std::move(dk), _reader.partition_tombstone()));
push_static_row();
on_new_range();
do_fill_buffer();
});
}
virtual future<> fill_buffer() override {
return _read_section(_lsa_region, [&] {
return with_linearized_managed_bytes([&] {
do_fill_buffer();
return make_ready_future<>();
});
_reader.with_reserve([&] {
do_fill_buffer();
});
return make_ready_future<>();
}
virtual void next_partition() override {
clear_buffer_to_next_partition();


@@ -158,14 +158,8 @@ void partition_snapshot::merge_partition_versions() {
auto current = first_used->next();
while (current && !current->is_referenced()) {
auto next = current->next();
try {
first_used->partition().apply(*_schema, std::move(current->partition()));
current_allocator().destroy(current);
} catch (...) {
// Set _version so that the merge can be retried.
_version = partition_version_ref(*current);
throw;
}
first_used->partition().apply(*_schema, std::move(current->partition()));
current_allocator().destroy(current);
current = next;
}
}
@@ -184,7 +178,22 @@ unsigned partition_snapshot::version_count()
partition_entry::partition_entry(mutation_partition mp)
{
auto new_version = current_allocator().construct<partition_version>(std::move(mp));
_version = partition_version_ref(*new_version);
_version = partition_version_ref(*new_version, partition_version::is_evictable::no);
}
partition_entry::partition_entry(partition_entry::evictable_tag, const schema& s, mutation_partition&& mp)
: partition_entry(std::move(mp))
{
_version->partition().ensure_last_dummy(s);
_version.make_evictable();
}
partition_entry partition_entry::make_evictable(const schema& s, mutation_partition&& mp) {
return {evictable_tag(), s, std::move(mp)};
}
partition_entry partition_entry::make_evictable(const schema& s, const mutation_partition& mp) {
return make_evictable(s, mutation_partition(mp));
}
partition_entry::~partition_entry() {
@@ -204,13 +213,23 @@ partition_entry::~partition_entry() {
void partition_entry::set_version(partition_version* new_version)
{
bool evictable = _version.evictable();
if (_snapshot) {
_snapshot->_version = std::move(_version);
_snapshot->_entry = nullptr;
}
_snapshot = nullptr;
_version = partition_version_ref(*new_version);
_version = partition_version_ref(*new_version, partition_version::is_evictable(evictable));
}
partition_version& partition_entry::add_version(const schema& s) {
auto new_version = current_allocator().construct<partition_version>(mutation_partition(s.shared_from_this()));
new_version->partition().set_static_row_continuous(_version->partition().static_row_continuous());
new_version->insert_before(*_version);
set_version(new_version);
return *new_version;
}
void partition_entry::apply(const schema& s, const mutation_partition& mp, const schema& mp_schema)
@@ -437,7 +456,7 @@ void partition_entry::with_detached_versions(Func&& func) {
snapshot->_entry = nullptr;
_snapshot = nullptr;
}
_version = { };
auto prev = std::exchange(_version, {});
auto revert = defer([&] {
if (snapshot) {
@@ -445,7 +464,7 @@ void partition_entry::with_detached_versions(Func&& func) {
snapshot->_entry = this;
_version = std::move(snapshot->_version);
} else {
_version = partition_version_ref(*current);
_version = std::move(prev);
}
});
@@ -555,27 +574,35 @@ lw_shared_ptr<partition_snapshot> partition_entry::read(logalloc::region& r,
std::vector<range_tombstone>
partition_snapshot::range_tombstones(const ::schema& s, position_in_partition_view start, position_in_partition_view end)
{
partition_version* v = &*version();
if (!v->next()) {
return boost::copy_range<std::vector<range_tombstone>>(
v->partition().row_tombstones().slice(s, start, end));
}
range_tombstone_list list(s);
for (auto&& v : versions()) {
for (auto&& rt : v.partition().row_tombstones().slice(s, start, end)) {
while (v) {
for (auto&& rt : v->partition().row_tombstones().slice(s, start, end)) {
list.apply(s, rt);
}
v = v->next();
}
return boost::copy_range<std::vector<range_tombstone>>(list);
return boost::copy_range<std::vector<range_tombstone>>(list.slice(s, start, end));
}
std::ostream& operator<<(std::ostream& out, partition_entry& e) {
std::ostream& operator<<(std::ostream& out, const partition_entry& e) {
out << "{";
bool first = true;
if (e._version) {
for (const partition_version& v : e.versions()) {
const partition_version* v = &*e._version;
while (v) {
if (!first) {
out << ", ";
}
if (v.is_referenced()) {
if (v->is_referenced()) {
out << "(*) ";
}
out << v.partition();
out << v->partition();
v = v->next();
first = false;
}
}
@@ -587,6 +614,7 @@ void partition_entry::evict() noexcept {
if (!_version) {
return;
}
// Must evict from all versions atomically to keep snapshots consistent.
for (auto&& v : versions()) {
v.partition().evict();
}


@@ -117,6 +117,8 @@ class partition_version : public anchorless_list_base_hook<partition_version> {
friend class partition_version_ref;
public:
using is_evictable = bool_class<class evictable_tag>;
explicit partition_version(schema_ptr s) noexcept
: _partition(std::move(s)) { }
explicit partition_version(mutation_partition mp) noexcept
@@ -139,11 +141,15 @@ using partition_version_range = anchorless_list_base_hook<partition_version>::ra
class partition_version_ref {
partition_version* _version = nullptr;
bool _unique_owner = false;
bool _evictable;
friend class partition_version;
public:
partition_version_ref() = default;
explicit partition_version_ref(partition_version& pv) noexcept : _version(&pv) {
explicit partition_version_ref(partition_version& pv, partition_version::is_evictable ev) noexcept
: _version(&pv)
, _evictable(ev)
{
assert(!_version->_backref);
_version->_backref = this;
}
@@ -152,7 +158,10 @@ public:
_version->_backref = nullptr;
}
}
partition_version_ref(partition_version_ref&& other) noexcept : _version(other._version) {
partition_version_ref(partition_version_ref&& other) noexcept
: _version(other._version)
, _evictable(other._evictable)
{
if (_version) {
_version->_backref = this;
}
@@ -172,6 +181,10 @@ public:
assert(_version);
return *_version;
}
const partition_version& operator*() const {
assert(_version);
return *_version;
}
partition_version* operator->() {
assert(_version);
return _version;
@@ -183,6 +196,8 @@ public:
bool is_unique_owner() const { return _unique_owner; }
void mark_as_unique_owner() { _unique_owner = true; }
void make_evictable() { _evictable = true; }
bool evictable() const { return _evictable; }
};
class partition_entry;
@@ -280,6 +295,11 @@ public:
// objects called versions. The logical mutation_partition state represented
// by that chain is equal to reducing the chain using mutation_partition::apply()
// from left (latest version) to right.
//
// We distinguish evictable and non-evictable partition entries. Entries which
// are non-evictable have all their elements non-evictable and fully continuous.
// Partition snapshots inherit the evictability of the entry; it remains
// invariant for the lifetime of a snapshot.
class partition_entry {
partition_snapshot* _snapshot = nullptr;
partition_version_ref _version;
@@ -296,11 +316,19 @@ private:
void apply_to_incomplete(const schema& s, partition_version* other);
public:
struct evictable_tag {};
class rows_iterator;
// Constructs a non-evictable entry holding empty partition
partition_entry() = default;
// Constructs a non-evictable entry
explicit partition_entry(mutation_partition mp);
// Constructs an evictable entry
partition_entry(evictable_tag, const schema& s, mutation_partition&& mp);
~partition_entry();
static partition_entry make_evictable(const schema& s, mutation_partition&& mp);
static partition_entry make_evictable(const schema& s, const mutation_partition& mp);
partition_entry(partition_entry&& pe) noexcept
: _snapshot(pe._snapshot), _version(std::move(pe._version))
{
@@ -331,14 +359,13 @@ public:
// Strong exception guarantees.
// Assumes this instance and mp are fully continuous.
// Use only on non-evictable entries.
void apply(const schema& s, const mutation_partition& mp, const schema& mp_schema);
// Strong exception guarantees.
// Assumes this instance and mpv are fully continuous.
void apply(const schema& s, mutation_partition_view mpv, const schema& mp_schema);
// Adds mutation_partition represented by "other" to the one represented
// by this entry.
// This entry must be evictable.
//
// The argument must be fully-continuous.
//
@@ -356,17 +383,15 @@ public:
// succeeds the result will be as if the first attempt didn't fail.
void apply_to_incomplete(const schema& s, partition_entry&& pe, const schema& pe_schema);
partition_version& add_version(const schema& s);
// Ensures that the latest version can be populated with data from given phase
// by inserting a new version if necessary.
// Doesn't affect value or continuity of the partition.
// Returns a reference to the new latest version.
partition_version& open_version(const schema& s, partition_snapshot::phase_type phase = partition_snapshot::max_phase) {
if (_snapshot && _snapshot->_phase != phase) {
auto new_version = current_allocator().construct<partition_version>(mutation_partition(s.shared_from_this()));
new_version->partition().set_static_row_continuous(_version->partition().static_row_continuous());
new_version->insert_before(*_version);
set_version(new_version);
return *new_version;
return add_version(s);
}
return *_version;
}
@@ -382,7 +407,7 @@ public:
lw_shared_ptr<partition_snapshot> read(logalloc::region& region, schema_ptr entry_schema,
partition_snapshot::phase_type phase = partition_snapshot::default_phase);
friend std::ostream& operator<<(std::ostream& out, partition_entry& e);
friend std::ostream& operator<<(std::ostream& out, const partition_entry& e);
};
inline partition_version_ref& partition_snapshot::version()


@@ -65,21 +65,30 @@ void range_tombstone_accumulator::update_current_tombstone() {
}
void range_tombstone_accumulator::drop_unneeded_tombstones(const clustering_key_prefix& ck, int w) {
auto cmp = [&] (bound_view bv, const clustering_key_prefix& ck, int w) {
return _reversed ? _cmp(ck, w, bv.prefix, weight(bv.kind)) : _cmp(bv.prefix, weight(bv.kind), ck, w);
auto cmp = [&] (const range_tombstone& rt, const clustering_key_prefix& ck, int w) {
if (_reversed) {
auto bv = rt.start_bound();
return _cmp(ck, w, bv.prefix, weight(bv.kind));
}
auto bv = rt.end_bound();
return _cmp(bv.prefix, weight(bv.kind), ck, w);
};
while (!_range_tombstones.empty() && cmp(_range_tombstones.begin()->end_bound(), ck, w)) {
while (!_range_tombstones.empty() && cmp(*_range_tombstones.begin(), ck, w)) {
_range_tombstones.pop_front();
}
update_current_tombstone();
}
void range_tombstone_accumulator::apply(range_tombstone rt) {
drop_unneeded_tombstones(rt.start, weight(rt.start_kind));
if (_reversed) {
drop_unneeded_tombstones(rt.end, weight(rt.end_kind));
} else {
drop_unneeded_tombstones(rt.start, weight(rt.start_kind));
}
_current_tombstone.apply(rt.tomb);
auto cmp = [&] (const range_tombstone& rt1, const range_tombstone& rt2) {
return _reversed ? _cmp(rt2.end_bound(), rt1.end_bound()) : _cmp(rt1.end_bound(), rt2.end_bound());
return _reversed ? _cmp(rt2.start_bound(), rt1.start_bound()) : _cmp(rt1.end_bound(), rt2.end_bound());
};
_range_tombstones.insert(boost::upper_bound(_range_tombstones, rt, cmp), std::move(rt));
}


@@ -176,15 +176,6 @@ public:
size_t memory_usage() const {
return sizeof(range_tombstone) + external_memory_usage();
}
// Flips start and end bound so that range tombstone can be used in reversed
// streams.
void flip() {
std::swap(start, end);
std::swap(start_kind, end_kind);
start_kind = flip_bound_kind(start_kind);
end_kind = flip_bound_kind(end_kind);
}
private:
void move_assign(range_tombstone&& rt) {
start = std::move(rt.start);
@@ -202,11 +193,12 @@ private:
}
};
// This is a helper intended for accumulating tombstones from a streamed
// mutation and determining the tombstone for a given clustering row.
// The accumulator expects the incoming range tombstones and clustered rows to
// follow the ordering used by the mutation readers.
//
// After apply(rt) or tombstone_for_row(ck) are called there are the following
// restrictions for subsequent calls:
// Unless the accumulator is in the reverse mode, after apply(rt) or
// tombstone_for_row(ck) are called, the following restrictions apply to
// subsequent calls:
// - apply(rt1) can be invoked only if rt.start_bound() < rt1.start_bound()
// and ck < rt1.start_bound()
// - tombstone_for_row(ck1) can be invoked only if rt.start_bound() < ck1
@@ -214,6 +206,15 @@ private:
//
// In other words position in partition of the mutation fragments passed to the
// accumulator must be increasing.
//
// If the accumulator was created with the reversed flag set, it expects the
// stream of range tombstones to come from a reversed partition and to follow
// the ordering such streams use. In particular, the restrictions from the
// non-reversed mode change to:
// - apply(rt1) can be invoked only if rt.end_bound() > rt1.end_bound() and
// ck > rt1.end_bound()
// - tombstone_for_row(ck1) can be invoked only if rt.end_bound() > ck1 and
// ck > ck1.
class range_tombstone_accumulator {
bound_view::compare _cmp;
tombstone _partition_tombstone;


@@ -124,6 +124,7 @@ void range_tombstone_list::insert_from(const schema& s,
if (less(end_bound, it->end_bound())) {
end = it->end;
end_kind = it->end_kind;
end_bound = bound_view(end, end_kind);
}
it = rev.erase(it);
} else if (c > 0) {


@@ -31,13 +31,13 @@
namespace cache {
/*
* Represent a reader to the underlying source.
* This reader automatically makes sure that it's up to date with all cache updates
*/
 * Represents a flat reader to the underlying source.
* This reader automatically makes sure that it's up to date with all cache updates
*/
class autoupdating_underlying_reader final {
row_cache& _cache;
read_context& _read_context;
stdx::optional<mutation_reader> _reader;
stdx::optional<flat_mutation_reader> _reader;
utils::phased_barrier::phase_type _reader_creation_phase;
dht::partition_range _range = { };
stdx::optional<dht::decorated_key> _last_key;
@@ -47,17 +47,7 @@ public:
: _cache(cache)
, _read_context(context)
{ }
// Reads next partition without changing mutation source snapshot.
future<streamed_mutation_opt> read_next_same_phase() {
_last_key = std::move(_new_last_key);
return (*_reader)().then([this] (auto&& smopt) {
if (smopt) {
_new_last_key = smopt->decorated_key();
}
return std::move(smopt);
});
}
future<streamed_mutation_opt> operator()() {
future<mutation_fragment_opt> move_to_next_partition() {
_last_key = std::move(_new_last_key);
auto start = population_range_start();
auto phase = _cache.phase_of(start);
@@ -66,7 +56,8 @@ public:
auto cmp = dht::ring_position_comparator(*_cache._schema);
auto&& new_range = _range.split_after(*_last_key, cmp);
if (!new_range) {
return make_ready_future<streamed_mutation_opt>(streamed_mutation_opt());
_reader = {};
return make_ready_future<mutation_fragment_opt>();
}
_range = std::move(*new_range);
_last_key = {};
@@ -79,11 +70,17 @@ public:
_reader = _cache.create_underlying_reader(_read_context, snap, _range);
_reader_creation_phase = phase;
}
return (*_reader)().then([this] (auto&& smopt) {
if (smopt) {
_new_last_key = smopt->decorated_key();
_reader->next_partition();
if (_reader->is_end_of_stream() && _reader->is_buffer_empty()) {
return make_ready_future<mutation_fragment_opt>();
}
return (*_reader)().then([this] (auto&& mfopt) {
if (mfopt) {
assert(mfopt->is_partition_start());
_new_last_key = mfopt->as_partition_start().key();
}
return std::move(smopt);
return std::move(mfopt);
});
}
future<> fast_forward_to(dht::partition_range&& range) {
@@ -114,6 +111,7 @@ public:
const dht::partition_range& range() const {
return _range;
}
flat_mutation_reader& underlying() { return *_reader; }
dht::ring_position_view population_range_start() const {
return _last_key ? dht::ring_position_view::for_after_key(*_last_key)
: dht::ring_position_view::for_range_start(_range);
@@ -130,19 +128,17 @@ class read_context final : public enable_lw_shared_from_this<read_context> {
streamed_mutation::forwarding _fwd;
mutation_reader::forwarding _fwd_mr;
bool _range_query;
// When reader enters a partition, it must be set up for reading that
// partition from the underlying mutation source (_underlying) in one of two ways:
//
// 1) either _underlying is already in that partition
//
// 2) _underlying is before the partition, then _underlying_snapshot and _key
// are set so that _underlying_flat can be fast forwarded to the right partition.
//
autoupdating_underlying_reader _underlying;
uint64_t _underlying_created = 0;
// When reader enters a partition, it must be set up for reading that
// partition from the underlying mutation source (_sm) in one of two ways:
//
// 1) either _underlying is already in that partition, then _sm is set to the
// stream obtained from it.
//
// 2) _underlying is before the partition, then _underlying_snapshot and _key
// are set so that _sm can be created on demand.
//
streamed_mutation_opt _sm;
mutation_source_opt _underlying_snapshot;
dht::partition_range _sm_range;
stdx::optional<dht::decorated_key> _key;
@@ -168,6 +164,9 @@ public:
, _underlying(_cache, *this)
{
++_cache._tracker._stats.reads;
if (range.is_singular() && range.start()->value().has_key()) {
_key = range.start()->value().as_decorated_key();
}
}
~read_context() {
++_cache._tracker._stats.reads_done;
@@ -190,52 +189,37 @@ public:
bool is_range_query() const { return _range_query; }
autoupdating_underlying_reader& underlying() { return _underlying; }
row_cache::phase_type phase() const { return _phase; }
const dht::decorated_key& key() const { return _key ? *_key : _sm->decorated_key(); }
const dht::decorated_key& key() const { return *_key; }
void on_underlying_created() { ++_underlying_created; }
private:
future<> create_sm();
future<> ensure_sm_created() {
if (_sm) {
return make_ready_future<>();
future<> ensure_underlying() {
if (_underlying_snapshot) {
return create_underlying(true);
}
return create_sm();
return make_ready_future<>();
}
public:
// Prepares the underlying streamed_mutation to represent dk in given snapshot.
// Partitions must be entered with strictly monotonic keys.
// The key must be after the current range of the underlying() reader.
// The phase argument must match the snapshot's phase.
future<> create_underlying(bool skip_first_fragment);
void enter_partition(const dht::decorated_key& dk, mutation_source& snapshot, row_cache::phase_type phase) {
_phase = phase;
_sm = {};
_underlying_snapshot = snapshot;
_key = dk;
}
// Prepares the underlying streamed_mutation to be sm.
// The phase argument must match the phase of the snapshot used to obtain sm.
void enter_partition(streamed_mutation&& sm, row_cache::phase_type phase) {
void enter_partition(const dht::decorated_key& dk, row_cache::phase_type phase) {
_phase = phase;
_sm = std::move(sm);
_underlying_snapshot = {};
_key = dk;
}
// Fast forwards the underlying streamed_mutation to given range.
future<> fast_forward_to(position_range range) {
return ensure_sm_created().then([this, range = std::move(range)] () mutable {
++_cache._tracker._stats.underlying_row_skips;
return _sm->fast_forward_to(std::move(range));
return ensure_underlying().then([this, range = std::move(range)] {
return _underlying.underlying().fast_forward_to(std::move(range));
});
}
// Returns the underlying streamed_mutation.
// The caller has to ensure that the streamed mutation was already created
// (e.g. the most recent call to enter_partition(const dht::decorated_key&, ...)
// was followed by a call to fast_forward_to()).
streamed_mutation& get_streamed_mutation() noexcept {
return *_sm;
}
// Gets the next fragment from the underlying streamed_mutation
// Gets the next fragment from the underlying reader
future<mutation_fragment_opt> get_next_fragment() {
return ensure_sm_created().then([this] {
return (*_sm)();
return ensure_underlying().then([this] {
return _underlying.underlying()();
});
}
};


@@ -0,0 +1,204 @@
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright (C) 2017 ScyllaDB
*/
#pragma once
#include <core/file.hh>
#include <core/semaphore.hh>
/// Specific semaphore for controlling reader concurrency
///
/// Before creating a reader one should obtain a permit by calling
/// `wait_admission()`. This permit can then be used for tracking the
/// reader's memory consumption via `reader_resource_tracker`.
/// The permit should be held for the lifetime of the reader
/// and/or any buffer it is tracking.
/// Reader concurrency is dual-limited by count and memory.
/// The semaphore can be configured with the desired limits on
/// construction. New readers are only admitted when both enough
/// count units and enough memory units are available. Readers are
/// admitted in FIFO order.
/// The maximum allowed number of waiting readers can be set with
/// the `max_queue_length` constructor parameter. When the number of
/// waiting readers would be equal to or greater than this number
/// (when calling `wait_admission()`) an exception is thrown.
/// The type of the exception, and optionally some additional code
/// to execute when this happens, can be customized via the
/// `raise_queue_overloaded_exception` constructor parameter. This
/// function is called every time the queue limit is surpassed and
/// is expected to return an `std::exception_ptr` that will be
/// injected into the future.
class reader_concurrency_semaphore {
public:
using timeout_clock = lowres_clock;
static const timeout_clock::duration no_timeout;
struct resources {
int count = 0;
ssize_t memory = 0;
resources() = default;
resources(int count, ssize_t memory)
: count(count)
, memory(memory) {
}
bool operator>=(const resources& other) const {
return count >= other.count && memory >= other.memory;
}
resources& operator-=(const resources& other) {
count -= other.count;
memory -= other.memory;
return *this;
}
resources& operator+=(const resources& other) {
count += other.count;
memory += other.memory;
return *this;
}
explicit operator bool() const {
return count >= 0 && memory >= 0;
}
};
class reader_permit {
reader_concurrency_semaphore& _semaphore;
const resources _base_cost;
public:
reader_permit(reader_concurrency_semaphore& semaphore, resources base_cost)
: _semaphore(semaphore)
, _base_cost(base_cost) {
}
~reader_permit() {
_semaphore.signal(_base_cost);
}
reader_permit(const reader_permit&) = delete;
reader_permit& operator=(const reader_permit&) = delete;
reader_permit(reader_permit&& other) = delete;
reader_permit& operator=(reader_permit&& other) = delete;
void consume_memory(size_t memory) {
_semaphore.consume_memory(memory);
}
void signal_memory(size_t memory) {
_semaphore.signal_memory(memory);
}
};
private:
static std::exception_ptr default_make_queue_overloaded_exception() {
return std::make_exception_ptr(std::runtime_error("restricted mutation reader queue overload"));
}
resources _resources;
struct entry {
promise<lw_shared_ptr<reader_permit>> pr;
resources res;
entry(promise<lw_shared_ptr<reader_permit>>&& pr, resources r) : pr(std::move(pr)), res(r) {}
};
struct expiry_handler {
void operator()(entry& e) noexcept {
e.pr.set_exception(semaphore_timed_out());
}
};
expiring_fifo<entry, expiry_handler, timeout_clock> _wait_list;
timeout_clock::duration _timeout;
size_t _max_queue_length = std::numeric_limits<size_t>::max();
std::function<std::exception_ptr()> _make_queue_overloaded_exception = default_make_queue_overloaded_exception;
bool has_available_units(const resources& r) const {
return bool(_resources) && _resources >= r;
}
bool may_proceed(const resources& r) const {
return has_available_units(r) && _wait_list.empty();
}
void consume_memory(size_t memory) {
_resources.memory -= memory;
}
void signal(const resources& r);
void signal_memory(size_t memory) {
signal(resources(0, static_cast<ssize_t>(memory)));
}
public:
reader_concurrency_semaphore(unsigned count,
size_t memory,
timeout_clock::duration timeout = no_timeout,
size_t max_queue_length = std::numeric_limits<size_t>::max(),
std::function<std::exception_ptr()> raise_queue_overloaded_exception = default_make_queue_overloaded_exception)
: _resources(count, memory)
, _timeout(timeout)
, _max_queue_length(max_queue_length)
, _make_queue_overloaded_exception(raise_queue_overloaded_exception) {
}
reader_concurrency_semaphore(const reader_concurrency_semaphore&) = delete;
reader_concurrency_semaphore& operator=(const reader_concurrency_semaphore&) = delete;
reader_concurrency_semaphore(reader_concurrency_semaphore&&) = delete;
reader_concurrency_semaphore& operator=(reader_concurrency_semaphore&&) = delete;
future<lw_shared_ptr<reader_permit>> wait_admission(size_t memory);
const resources available_resources() const {
return _resources;
}
size_t waiters() const {
return _wait_list.size();
}
};
class reader_resource_tracker {
lw_shared_ptr<reader_concurrency_semaphore::reader_permit> _permit;
public:
reader_resource_tracker() = default;
explicit reader_resource_tracker(lw_shared_ptr<reader_concurrency_semaphore::reader_permit> permit)
: _permit(std::move(permit)) {
}
bool operator==(const reader_resource_tracker& other) const {
return _permit == other._permit;
}
file track(file f) const;
lw_shared_ptr<reader_concurrency_semaphore::reader_permit> get_permit() const {
return _permit;
}
};
inline reader_resource_tracker no_resource_tracking() {
return {};
}


@@ -1,48 +0,0 @@
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright (C) 2017 ScyllaDB
*/
#pragma once
#include <core/file.hh>
#include <core/semaphore.hh>
class reader_resource_tracker {
seastar::semaphore* _sem = nullptr;
public:
reader_resource_tracker() = default;
explicit reader_resource_tracker(seastar::semaphore* sem)
: _sem(sem) {
}
bool operator==(const reader_resource_tracker& other) const {
return _sem == other._sem;
}
file track(file f) const;
semaphore* get_semaphore() const {
return _sem;
}
};
inline reader_resource_tracker no_resource_tracking() {
return reader_resource_tracker(nullptr);
}


@@ -32,10 +32,10 @@
#include <boost/version.hpp>
#include <sys/sdt.h>
#include "stdx.hh"
#include "cache_streamed_mutation.hh"
#include "read_context.hh"
#include "schema_upgrader.hh"
#include "dirty_memory_manager.hh"
#include "cache_flat_mutation_reader.hh"
namespace cache {
@@ -48,10 +48,10 @@ using namespace cache;
thread_local seastar::thread_scheduling_group row_cache::_update_thread_scheduling_group(1ms, 0.2);
mutation_reader
flat_mutation_reader
row_cache::create_underlying_reader(read_context& ctx, mutation_source& src, const dht::partition_range& pr) {
ctx.on_underlying_created();
return src(_schema, pr, ctx.slice(), ctx.pc(), ctx.trace_state(), streamed_mutation::forwarding::yes);
return src.make_flat_mutation_reader(_schema, pr, ctx.slice(), ctx.pc(), ctx.trace_state(), streamed_mutation::forwarding::yes);
}
cache_tracker& global_cache_tracker() {
@@ -311,7 +311,7 @@ public:
}
};
future<> read_context::create_sm() {
future<> read_context::create_underlying(bool skip_first_fragment) {
if (_range_query) {
// FIXME: Singular-range mutation readers don't support fast_forward_to(), so need to use a wide range
// here in case the same reader will need to be fast forwarded later.
@@ -319,53 +319,44 @@ future<> read_context::create_sm() {
} else {
_sm_range = dht::partition_range::make_singular({dht::ring_position(*_key)});
}
return _underlying.fast_forward_to(std::move(_sm_range), *_underlying_snapshot, _phase).then([this] {
return _underlying.read_next_same_phase().then([this] (auto&& smo) {
if (!smo) {
_sm = make_empty_streamed_mutation(_cache.schema(), *_key, streamed_mutation::forwarding::yes);
} else {
_sm = std::move(*smo);
}
});
return _underlying.fast_forward_to(std::move(_sm_range), *_underlying_snapshot, _phase).then([this, skip_first_fragment] {
_underlying_snapshot = {};
if (skip_first_fragment) {
return _underlying.underlying()().then([](auto &&mf) {});
} else {
return make_ready_future<>();
}
});
}
static streamed_mutation read_directly_from_underlying(streamed_mutation&& sm, read_context& reader) {
if (reader.schema()->version() != sm.schema()->version()) {
sm = transform(std::move(sm), schema_upgrader(reader.schema()));
static flat_mutation_reader read_directly_from_underlying(read_context& reader) {
flat_mutation_reader res = make_delegating_reader(reader.underlying().underlying());
if (reader.schema()->version() != reader.underlying().underlying().schema()->version()) {
res = transform(std::move(res), schema_upgrader(reader.schema()));
}
if (reader.fwd() == streamed_mutation::forwarding::no) {
sm = streamed_mutation_from_forwarding_streamed_mutation(std::move(sm));
res = make_nonforwardable(std::move(res), true);
}
return std::move(sm);
return std::move(res);
}
// Reader which populates the cache using data from the delegate.
class single_partition_populating_reader final : public mutation_reader::impl {
class single_partition_populating_reader final : public flat_mutation_reader::impl {
row_cache& _cache;
mutation_reader _delegate;
lw_shared_ptr<read_context> _read_context;
public:
single_partition_populating_reader(row_cache& cache,
lw_shared_ptr<read_context> context)
: _cache(cache)
, _read_context(std::move(context))
{ }
virtual future<streamed_mutation_opt> operator()() override {
if (!_read_context) {
return make_ready_future<streamed_mutation_opt>(streamed_mutation_opt());
}
flat_mutation_reader_opt _reader;
private:
future<> create_reader() {
auto src_and_phase = _cache.snapshot_of(_read_context->range().start()->value());
auto phase = src_and_phase.phase;
_delegate = _cache.create_underlying_reader(*_read_context, src_and_phase.snapshot, _read_context->range());
return _delegate().then([this, phase] (auto sm) mutable -> streamed_mutation_opt {
auto ctx = std::move(_read_context);
if (!sm) {
if (phase == _cache.phase_of(ctx->range().start()->value())) {
_cache._read_section(_cache._tracker.region(), [this, ctx = std::move(ctx)] {
with_allocator(_cache._tracker.allocator(), [this, &ctx] {
dht::decorated_key dk = ctx->range().start()->value().as_decorated_key();
_read_context->enter_partition(_read_context->range().start()->value().as_decorated_key(), src_and_phase.snapshot, phase);
return _read_context->create_underlying(false).then([this, phase] {
return _read_context->underlying().underlying()().then([this, phase] (auto&& mfopt) {
if (!mfopt) {
if (phase == _cache.phase_of(_read_context->range().start()->value())) {
_cache._read_section(_cache._tracker.region(), [this] {
with_allocator(_cache._tracker.allocator(), [this] {
dht::decorated_key dk = _read_context->range().start()->value().as_decorated_key();
_cache.do_find_or_create_entry(dk, nullptr, [&] (auto i) {
mutation_partition mp(_cache._schema);
cache_entry* entry = current_allocator().construct<cache_entry>(
@@ -381,19 +372,66 @@ public:
} else {
_cache._tracker.on_mispopulate();
}
return std::move(sm);
}
if (phase == _cache.phase_of(ctx->range().start()->value())) {
return _cache._read_section(_cache._tracker.region(), [&] {
cache_entry& e = _cache.find_or_create(sm->decorated_key(), sm->partition_tombstone(), phase);
return e.read(_cache, *ctx, std::move(*sm), phase);
_end_of_stream = true;
} else if (phase == _cache.phase_of(_read_context->range().start()->value())) {
_reader = _cache._read_section(_cache._tracker.region(), [&] {
cache_entry& e = _cache.find_or_create(mfopt->as_partition_start().key(), mfopt->as_partition_start().partition_tombstone(), phase);
return e.read(_cache, *_read_context, phase);
});
} else {
_cache._tracker.on_mispopulate();
return read_directly_from_underlying(std::move(*sm), *ctx);
_reader = read_directly_from_underlying(*_read_context);
push_mutation_fragment(std::move(*mfopt));
}
});
});
}
public:
single_partition_populating_reader(row_cache& cache,
lw_shared_ptr<read_context> context)
: impl(context->schema())
, _cache(cache)
, _read_context(std::move(context))
{ }
virtual future<> fill_buffer() override {
if (!_reader) {
return create_reader().then([this] {
if (_end_of_stream) {
return make_ready_future<>();
}
return fill_buffer();
});
}
return do_until([this] { return is_end_of_stream() || is_buffer_full(); }, [this] {
return fill_buffer_from(*_reader).then([this] (bool reader_finished) {
if (reader_finished) {
_end_of_stream = true;
}
});
});
}
virtual void next_partition() override {
if (_reader) {
clear_buffer();
_end_of_stream = true;
}
}
virtual future<> fast_forward_to(const dht::partition_range&) override {
clear_buffer();
_end_of_stream = true;
return make_ready_future<>();
}
virtual future<> fast_forward_to(position_range pr) override {
if (!_reader) {
_end_of_stream = true;
return make_ready_future<>();
}
assert(bool(_read_context->fwd()));
_end_of_stream = false;
forward_buffer_to(pr.start());
return _reader->fast_forward_to(std::move(pr));
}
};
void cache_tracker::clear_continuity(cache_entry& ce) {
@@ -469,25 +507,31 @@ public:
, _read_context(ctx)
{}
future<streamed_mutation_opt> operator()() {
return _reader().then([this] (streamed_mutation_opt smopt) mutable -> streamed_mutation_opt {
future<flat_mutation_reader_opt, mutation_fragment_opt> operator()() {
return _reader.move_to_next_partition().then([this] (auto&& mfopt) mutable {
{
if (!smopt) {
if (!mfopt) {
handle_end_of_stream();
return std::move(smopt);
return make_ready_future<flat_mutation_reader_opt, mutation_fragment_opt>(stdx::nullopt, stdx::nullopt);
}
_cache.on_partition_miss();
if (_reader.creation_phase() == _cache.phase_of(smopt->decorated_key())) {
const partition_start& ps = mfopt->as_partition_start();
const dht::decorated_key& key = ps.key();
if (_reader.creation_phase() == _cache.phase_of(key)) {
return _cache._read_section(_cache._tracker.region(), [&] {
cache_entry& e = _cache.find_or_create(smopt->decorated_key(), smopt->partition_tombstone(), _reader.creation_phase(),
can_set_continuity() ? &*_last_key : nullptr);
_last_key = row_cache::previous_entry_pointer(smopt->decorated_key());
return e.read(_cache, _read_context, std::move(*smopt), _reader.creation_phase());
cache_entry& e = _cache.find_or_create(key,
ps.partition_tombstone(),
_reader.creation_phase(),
can_set_continuity() ? &*_last_key : nullptr);
_last_key = row_cache::previous_entry_pointer(key);
return make_ready_future<flat_mutation_reader_opt, mutation_fragment_opt>(
e.read(_cache, _read_context, _reader.creation_phase()), stdx::nullopt);
});
} else {
_cache._tracker.on_mispopulate();
_last_key = row_cache::previous_entry_pointer(smopt->decorated_key());
return read_directly_from_underlying(std::move(*smopt), _read_context);
_last_key = row_cache::previous_entry_pointer(key);
return make_ready_future<flat_mutation_reader_opt, mutation_fragment_opt>(
read_directly_from_underlying(_read_context), std::move(mfopt));
}
}
});
@@ -507,7 +551,7 @@ public:
}
};
class scanning_and_populating_reader final : public mutation_reader::impl {
class scanning_and_populating_reader final : public flat_mutation_reader::impl {
const dht::partition_range* _pr;
row_cache& _cache;
lw_shared_ptr<read_context> _read_context;
@@ -517,8 +561,9 @@ class scanning_and_populating_reader final : public mutation_reader::impl {
bool _advance_primary = false;
stdx::optional<dht::partition_range::bound> _lower_bound;
dht::partition_range _secondary_range;
flat_mutation_reader_opt _reader;
private:
streamed_mutation read_from_entry(cache_entry& ce) {
flat_mutation_reader read_from_entry(cache_entry& ce) {
_cache.upgrade_entry(ce);
_cache._tracker.touch(ce);
_cache.on_partition_hit();
@@ -530,9 +575,9 @@ private:
: dht::ring_position_view::min();
}
streamed_mutation_opt do_read_from_primary() {
flat_mutation_reader_opt do_read_from_primary() {
return _cache._read_section(_cache._tracker.region(), [this] {
return with_linearized_managed_bytes([&] () -> streamed_mutation_opt {
return with_linearized_managed_bytes([&] () -> flat_mutation_reader_opt {
bool not_moved = true;
if (!_primary.valid()) {
not_moved = _primary.advance_to(as_ring_position_view(_lower_bound));
@@ -549,11 +594,11 @@ private:
return stdx::nullopt;
}
cache_entry& e = _primary.entry();
auto sm = read_from_entry(e);
auto fr = read_from_entry(e);
_lower_bound = dht::partition_range::bound{e.key(), false};
// Delay the call to next() so that we don't see stale continuity on next invocation.
_advance_primary = true;
return streamed_mutation_opt(std::move(sm));
return flat_mutation_reader_opt(std::move(fr));
} else {
if (_primary.in_range()) {
cache_entry& e = _primary.entry();
@@ -578,47 +623,88 @@ private:
});
}
future<streamed_mutation_opt> read_from_primary() {
auto smo = do_read_from_primary();
future<flat_mutation_reader_opt> read_from_primary() {
auto fro = do_read_from_primary();
if (!_secondary_in_progress) {
return make_ready_future<streamed_mutation_opt>(std::move(smo));
return make_ready_future<flat_mutation_reader_opt>(std::move(fro));
}
return _secondary_reader.fast_forward_to(std::move(_secondary_range)).then([this] {
return read_from_secondary();
});
}
future<streamed_mutation_opt> read_from_secondary() {
return _secondary_reader().then([this] (streamed_mutation_opt smopt) {
if (smopt) {
return make_ready_future<streamed_mutation_opt>(std::move(smopt));
future<flat_mutation_reader_opt> read_from_secondary() {
return _secondary_reader().then([this] (flat_mutation_reader_opt fropt, mutation_fragment_opt ps) {
if (fropt) {
if (ps) {
push_mutation_fragment(std::move(*ps));
}
return make_ready_future<flat_mutation_reader_opt>(std::move(fropt));
} else {
_secondary_in_progress = false;
return read_from_primary();
}
});
}
future<> read_next_partition() {
return (_secondary_in_progress ? read_from_secondary() : read_from_primary()).then([this] (auto&& fropt) {
if (bool(fropt)) {
_reader = std::move(fropt);
} else {
_end_of_stream = true;
}
});
}
void on_end_of_stream() {
if (_read_context->fwd() == streamed_mutation::forwarding::yes) {
_end_of_stream = true;
} else {
_reader = {};
}
}
public:
scanning_and_populating_reader(row_cache& cache,
const dht::partition_range& range,
lw_shared_ptr<read_context> context)
: _pr(&range)
: impl(context->schema())
, _pr(&range)
, _cache(cache)
, _read_context(std::move(context))
, _primary(cache, range)
, _secondary_reader(cache, *_read_context)
, _lower_bound(range.start())
{ }
future<streamed_mutation_opt> operator()() {
if (_secondary_in_progress) {
return read_from_secondary();
virtual future<> fill_buffer() override {
return do_until([this] { return is_end_of_stream() || is_buffer_full(); }, [this] {
if (!_reader) {
return read_next_partition();
} else {
return fill_buffer_from(*_reader).then([this] (bool reader_finished) {
if (reader_finished) {
on_end_of_stream();
}
});
}
});
}
virtual void next_partition() override {
if (_read_context->fwd() == streamed_mutation::forwarding::yes) {
if (_reader) {
clear_buffer();
_reader->next_partition();
_end_of_stream = false;
}
} else {
return read_from_primary();
clear_buffer_to_next_partition();
if (_reader && is_buffer_empty()) {
_reader->next_partition();
}
}
}
future<> fast_forward_to(const dht::partition_range& pr) {
virtual future<> fast_forward_to(const dht::partition_range& pr) override {
clear_buffer();
_reader = {};
_end_of_stream = false;
_secondary_in_progress = false;
_advance_primary = false;
_pr = &pr;
@@ -626,11 +712,21 @@ public:
_lower_bound = pr.start();
return make_ready_future<>();
}
virtual future<> fast_forward_to(position_range cr) override {
forward_buffer_to(cr.start());
if (_reader) {
_end_of_stream = false;
return _reader->fast_forward_to(std::move(cr));
} else {
_end_of_stream = true;
return make_ready_future<>();
}
}
};
mutation_reader
flat_mutation_reader
row_cache::make_scanning_reader(const dht::partition_range& range, lw_shared_ptr<read_context> context) {
return make_mutation_reader<scanning_and_populating_reader>(*this, range, std::move(context));
return make_flat_mutation_reader<scanning_and_populating_reader>(*this, range, std::move(context));
}
mutation_reader
@@ -642,33 +738,47 @@ row_cache::make_reader(schema_ptr s,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr)
{
auto ctx = make_lw_shared<read_context>(*this, std::move(s), range, slice, pc, trace_state, fwd, fwd_mr);
return mutation_reader_from_flat_mutation_reader(
make_flat_reader(std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr));
}
flat_mutation_reader
row_cache::make_flat_reader(schema_ptr s,
const dht::partition_range& range,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr)
{
auto ctx = make_lw_shared<read_context>(*this, s, range, slice, pc, trace_state, fwd, fwd_mr);
if (!ctx->is_range_query()) {
return _read_section(_tracker.region(), [&] {
return with_linearized_managed_bytes([&] {
cache_entry::compare cmp(_schema);
auto&& pos = ctx->range().start()->value();
auto i = _partitions.lower_bound(pos, cmp);
if (i != _partitions.end() && !cmp(pos, i->position())) {
cache_entry& e = *i;
_tracker.touch(e);
upgrade_entry(e);
on_partition_hit();
return make_reader_returning(e.read(*this, *ctx));
} else if (i->continuous()) {
return make_empty_reader();
} else {
on_partition_miss();
return make_mutation_reader<single_partition_populating_reader>(*this, std::move(ctx));
}
});
return with_linearized_managed_bytes([&] {
cache_entry::compare cmp(_schema);
auto&& pos = ctx->range().start()->value();
auto i = _partitions.lower_bound(pos, cmp);
if (i != _partitions.end() && !cmp(pos, i->position())) {
cache_entry& e = *i;
_tracker.touch(e);
upgrade_entry(e);
on_partition_hit();
return e.read(*this, *ctx);
} else if (i->continuous()) {
return make_empty_flat_reader(std::move(s));
} else {
on_partition_miss();
return make_flat_mutation_reader<single_partition_populating_reader>(*this, std::move(ctx));
}
});
});
}
return make_scanning_reader(range, std::move(ctx));
}
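The lookup above makes a three-way decision: a cache hit serves from the entry, a miss on a continuous successor entry proves the partition is absent, and anything else reads through and populates. A simplified sketch with sorted integer keys (the names and dict entries are hypothetical stand-ins; real entries hold partition data):

```python
import bisect

def cache_read(partitions, key):
    # partitions: sorted list of (key, entry), entry = {"continuous": bool}.
    # Find the first entry at or after the key (lower_bound).
    keys = [k for k, _ in partitions]
    i = bisect.bisect_left(keys, key)
    if i < len(partitions) and keys[i] == key:
        return "hit"          # entry found: serve from cache
    if i < len(partitions) and partitions[i][1]["continuous"]:
        # The successor's continuity flag covers the gap before it,
        # so the partition is known to be absent from the underlying source.
        return "absent"
    return "populate"         # read through and populate the cache

entries = [(5, {"continuous": False}), (10, {"continuous": True})]
assert cache_read(entries, 5) == "hit"
assert cache_read(entries, 7) == "absent"
assert cache_read(entries, 3) == "populate"
```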
row_cache::~row_cache() {
with_allocator(_tracker.allocator(), [this] {
_partitions.clear_and_dispose([this, deleter = current_deleter<cache_entry>()] (auto&& p) mutable {
@@ -924,11 +1034,14 @@ future<> row_cache::update(external_updater eu, memtable& m) {
_tracker.touch(entry);
_tracker.on_merge();
} else if (cache_i->continuous() || is_present(mem_e.key()) == partition_presence_checker_result::definitely_doesnt_exist) {
cache_entry* entry = current_allocator().construct<cache_entry>(
mem_e.schema(), std::move(mem_e.key()), std::move(mem_e.partition()));
// Partition is absent in underlying. First, insert a neutral partition entry.
cache_entry* entry = current_allocator().construct<cache_entry>(cache_entry::evictable_tag(),
_schema, dht::decorated_key(mem_e.key()),
partition_entry::make_evictable(*_schema, mutation_partition(_schema)));
entry->set_continuous(cache_i->continuous());
_tracker.insert(*entry);
_partitions.insert(cache_i, *entry);
entry->partition().apply_to_incomplete(*_schema, std::move(mem_e.partition()), *mem_e.schema());
}
});
}
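The absent-in-underlying branch above first inserts a neutral (empty) evictable entry and only then applies the memtable data to it. A toy model with a dict and integer "partitions" (underlying_has and continuous_after are hypothetical stand-ins for the partition presence checker and the successor entry's continuity flag):

```python
def merge_into_cache(cache, key, mem_partition, underlying_has, continuous_after):
    if key in cache:
        cache[key] += mem_partition      # merge into the existing entry
    elif continuous_after or not underlying_has(key):
        cache[key] = 0                   # insert a neutral entry first
        cache[key] += mem_partition      # then apply_to_incomplete
    # otherwise the cache stays as-is; reads for this key go to sstables
    return cache

assert merge_into_cache({}, "k", 5, lambda k: False, False) == {"k": 5}
assert merge_into_cache({"k": 2}, "k", 3, lambda k: True, False) == {"k": 5}
assert merge_into_cache({}, "k", 5, lambda k: True, False) == {}
```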
@@ -1063,35 +1176,29 @@ void row_cache::set_schema(schema_ptr new_schema) noexcept {
_schema = std::move(new_schema);
}
streamed_mutation cache_entry::read(row_cache& rc, read_context& reader) {
flat_mutation_reader cache_entry::read(row_cache& rc, read_context& reader) {
auto source_and_phase = rc.snapshot_of(_key);
reader.enter_partition(_key, source_and_phase.snapshot, source_and_phase.phase);
return do_read(rc, reader);
}
streamed_mutation cache_entry::read(row_cache& rc, read_context& reader,
streamed_mutation&& sm, row_cache::phase_type phase) {
reader.enter_partition(std::move(sm), phase);
try {
return do_read(rc, reader);
} catch (...) {
sm = std::move(reader.get_streamed_mutation());
throw;
}
flat_mutation_reader cache_entry::read(row_cache& rc, read_context& reader, row_cache::phase_type phase) {
reader.enter_partition(_key, phase);
return do_read(rc, reader);
}
// Assumes reader is in the corresponding partition
streamed_mutation cache_entry::do_read(row_cache& rc, read_context& reader) {
flat_mutation_reader cache_entry::do_read(row_cache& rc, read_context& reader) {
auto snp = _pe.read(rc._tracker.region(), _schema, reader.phase());
auto ckr = query::clustering_key_filter_ranges::get_ranges(*_schema, reader.slice(), _key.key());
auto sm = make_cache_streamed_mutation(_schema, _key, std::move(ckr), rc, reader.shared_from_this(), std::move(snp));
auto r = make_cache_flat_mutation_reader(_schema, _key, std::move(ckr), rc, reader.shared_from_this(), std::move(snp));
if (reader.schema()->version() != _schema->version()) {
sm = transform(std::move(sm), schema_upgrader(reader.schema()));
r = transform(std::move(r), schema_upgrader(reader.schema()));
}
if (reader.fwd() == streamed_mutation::forwarding::yes) {
sm = make_forwardable(std::move(sm));
r = make_forwardable(std::move(r));
}
return std::move(sm);
return std::move(r);
}
const schema_ptr& row_cache::schema() const {


@@ -36,6 +36,7 @@
#include "utils/estimated_histogram.hh"
#include "tracing/trace_state.hh"
#include <seastar/core/metrics_registration.hh>
#include "flat_mutation_reader.hh"
namespace bi = boost::intrusive;
@@ -46,6 +47,7 @@ namespace cache {
class autoupdating_underlying_reader;
class cache_streamed_mutation;
class cache_flat_mutation_reader;
class read_context;
class lsa_manager;
@@ -76,13 +78,14 @@ class cache_entry {
cache_link_type _cache_link;
friend class size_calculator;
streamed_mutation do_read(row_cache&, cache::read_context& reader);
flat_mutation_reader do_read(row_cache&, cache::read_context& reader);
public:
friend class row_cache;
friend class cache_tracker;
struct dummy_entry_tag{};
struct incomplete_tag{};
struct evictable_tag{};
cache_entry(dummy_entry_tag)
: _key{dht::token(), partition_key::make_empty()}
@@ -98,30 +101,21 @@ public:
cache_entry(schema_ptr s, const dht::decorated_key& key, const mutation_partition& p)
: _schema(std::move(s))
, _key(key)
, _pe(p)
{
_pe.version()->partition().ensure_last_dummy(*_schema);
}
, _pe(partition_entry::make_evictable(*_schema, mutation_partition(p)))
{ }
cache_entry(schema_ptr s, dht::decorated_key&& key, mutation_partition&& p) noexcept
: _schema(std::move(s))
, _key(std::move(key))
, _pe(std::move(p))
{
_pe.version()->partition().ensure_last_dummy(*_schema);
}
cache_entry(schema_ptr s, dht::decorated_key&& key, mutation_partition&& p)
: cache_entry(evictable_tag(), s, std::move(key),
partition_entry::make_evictable(*s, std::move(p)))
{ }
// It is assumed that pe is fully continuous
cache_entry(schema_ptr s, dht::decorated_key&& key, partition_entry&& pe) noexcept
// pe must be evictable.
cache_entry(evictable_tag, schema_ptr s, dht::decorated_key&& key, partition_entry&& pe) noexcept
: _schema(std::move(s))
, _key(std::move(key))
, _pe(std::move(pe))
{
// If we can assume that _pe is fully continuous, we don't need to check all versions
// to determine what the continuity is.
// This doesn't change value and doesn't invalidate iterators, so can be called even with a snapshot.
_pe.version()->partition().ensure_last_dummy(*_schema);
}
{ }
cache_entry(cache_entry&&) noexcept;
~cache_entry();
@@ -138,8 +132,8 @@ public:
partition_entry& partition() { return _pe; }
const schema_ptr& schema() const { return _schema; }
schema_ptr& schema() { return _schema; }
streamed_mutation read(row_cache&, cache::read_context& reader);
streamed_mutation read(row_cache&, cache::read_context& reader, streamed_mutation&& underlying, utils::phased_barrier::phase_type);
flat_mutation_reader read(row_cache&, cache::read_context&);
flat_mutation_reader read(row_cache&, cache::read_context&, utils::phased_barrier::phase_type);
bool continuous() const { return _flags._continuous; }
void set_continuous(bool value) { _flags._continuous = value; }
@@ -191,6 +185,7 @@ public:
friend class cache::read_context;
friend class cache::autoupdating_underlying_reader;
friend class cache::cache_streamed_mutation;
friend class cache::cache_flat_mutation_reader;
struct stats {
uint64_t partition_hits;
uint64_t partition_misses;
@@ -272,6 +267,7 @@ public:
friend class single_partition_populating_reader;
friend class cache_entry;
friend class cache::cache_streamed_mutation;
friend class cache::cache_flat_mutation_reader;
friend class cache::lsa_manager;
friend class cache::read_context;
friend class partition_range_cursor;
@@ -336,8 +332,8 @@ private:
logalloc::allocating_section _update_section;
logalloc::allocating_section _populate_section;
logalloc::allocating_section _read_section;
mutation_reader create_underlying_reader(cache::read_context&, mutation_source&, const dht::partition_range&);
mutation_reader make_scanning_reader(const dht::partition_range&, lw_shared_ptr<cache::read_context>);
flat_mutation_reader create_underlying_reader(cache::read_context&, mutation_source&, const dht::partition_range&);
flat_mutation_reader make_scanning_reader(const dht::partition_range&, lw_shared_ptr<cache::read_context>);
void on_partition_hit();
void on_partition_miss();
void on_row_hit();
@@ -452,6 +448,19 @@ public:
return make_reader(std::move(s), range, full_slice);
}
flat_mutation_reader make_flat_reader(schema_ptr,
const dht::partition_range&,
const query::partition_slice&,
const io_priority_class& = default_priority_class(),
tracing::trace_state_ptr trace_state = nullptr,
streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::no);
flat_mutation_reader make_flat_reader(schema_ptr s, const dht::partition_range& range = query::full_partition_range) {
auto& full_slice = s->full_slice();
return make_flat_reader(std::move(s), range, full_slice);
}
const stats& stats() const { return _stats; }
public:
// Populate cache from given mutation, which must be fully continuous.
@@ -523,3 +532,46 @@ public:
friend class cache_tracker;
friend class mark_end_as_continuous;
};
namespace cache {
class lsa_manager {
row_cache &_cache;
public:
lsa_manager(row_cache &cache) : _cache(cache) {}
template<typename Func>
decltype(auto) run_in_read_section(const Func &func) {
return _cache._read_section(_cache._tracker.region(), [&func]() {
return with_linearized_managed_bytes([&func]() {
return func();
});
});
}
template<typename Func>
decltype(auto) run_in_update_section(const Func &func) {
return _cache._update_section(_cache._tracker.region(), [&func]() {
return with_linearized_managed_bytes([&func]() {
return func();
});
});
}
template<typename Func>
void run_in_update_section_with_allocator(Func &&func) {
return _cache._update_section(_cache._tracker.region(), [this, &func]() {
return with_linearized_managed_bytes([this, &func]() {
return with_allocator(_cache._tracker.region().allocator(), [this, &func]() mutable {
return func();
});
});
});
}
logalloc::region &region() { return _cache._tracker.region(); }
logalloc::allocating_section &read_section() { return _cache._read_section; }
};
}


@@ -87,7 +87,7 @@ def get_repo_file(dir):
for name in files:
with open(name, 'r') as myfile:
for line in myfile:
match = re.search(".*http.?://.*/scylladb/([^/\s]+)/deb/([^/\s]+)\s.*", line)
match = re.search(".*http.?://.*/scylladb/([^/\s]+)/deb/([^/\s]+)[\s/].*", line)
if match:
return match.group(2), match.group(1)
match = re.search(".*http.?://.*/scylladb/([^/]+)/rpm/[^/]+/([^/\s]+)/.*", line)
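The widened trailing character class `[\s/]` matters when the distribution component in a deb repo line is followed by a slash rather than whitespace. A quick check (the repository URL below is illustrative):

```python
import re

# Old pattern required whitespace right after the <dist> component; the
# fixed pattern also accepts a '/', as in ".../deb/ubuntu/ stable main".
OLD = re.compile(r".*http.?://.*/scylladb/([^/\s]+)/deb/([^/\s]+)\s.*")
NEW = re.compile(r".*http.?://.*/scylladb/([^/\s]+)/deb/([^/\s]+)[\s/].*")

line = "deb http://downloads.scylladb.com/scylladb/scylla-2.1/deb/ubuntu/ stable main"
assert OLD.search(line) is None          # old pattern misses this line
m = NEW.search(line)
assert (m.group(1), m.group(2)) == ("scylla-2.1", "ubuntu")
```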

Submodule seastar updated: 78cd87fbbb...2a2c1d2708


@@ -89,6 +89,7 @@ static const sstring INDEXES_FEATURE = "INDEXES";
static const sstring DIGEST_MULTIPARTITION_READ_FEATURE = "DIGEST_MULTIPARTITION_READ";
static const sstring CORRECT_COUNTER_ORDER_FEATURE = "CORRECT_COUNTER_ORDER";
static const sstring SCHEMA_TABLES_V3 = "SCHEMA_TABLES_V3";
static const sstring CORRECT_NON_COMPOUND_RANGE_TOMBSTONES = "CORRECT_NON_COMPOUND_RANGE_TOMBSTONES";
distributed<storage_service> _the_storage_service;
@@ -133,7 +134,8 @@ sstring storage_service::get_config_supported_features() {
COUNTERS_FEATURE,
DIGEST_MULTIPARTITION_READ_FEATURE,
CORRECT_COUNTER_ORDER_FEATURE,
SCHEMA_TABLES_V3
SCHEMA_TABLES_V3,
CORRECT_NON_COMPOUND_RANGE_TOMBSTONES,
};
if (service::get_local_storage_service()._db.local().get_config().experimental()) {
features.push_back(MATERIALIZED_VIEWS_FEATURE);
@@ -344,6 +346,7 @@ void storage_service::register_features() {
_digest_multipartition_read_feature = gms::feature(DIGEST_MULTIPARTITION_READ_FEATURE);
_correct_counter_order_feature = gms::feature(CORRECT_COUNTER_ORDER_FEATURE);
_schema_tables_v3 = gms::feature(SCHEMA_TABLES_V3);
_correct_non_compound_range_tombstones = gms::feature(CORRECT_NON_COMPOUND_RANGE_TOMBSTONES);
if (_db.local().get_config().experimental()) {
_materialized_views_feature = gms::feature(MATERIALIZED_VIEWS_FEATURE);
@@ -946,7 +949,17 @@ void storage_service::handle_state_removing(inet_address endpoint, std::vector<s
slogger.warn("{}", err);
throw std::runtime_error(err);
}
restore_replica_count(endpoint, ep.value()).get();
// Kick off the streaming commands without waiting for
// restore_replica_count to complete, which can take a long time.
// When it completes, this node notifies the removal coordinator
// (at IP address notify_endpoint) that the restore process has
// finished here, and this node is then removed from
// _replicating_nodes on the coordinator.
auto notify_endpoint = ep.value();
restore_replica_count(endpoint, notify_endpoint).handle_exception([endpoint, notify_endpoint] (auto ep) {
slogger.info("Failed to restore_replica_count for node {}, notify_endpoint={} : {}", endpoint, notify_endpoint, ep);
});
}
} else { // now that the gossiper has told us about this nonexistent member, notify the gossiper to remove it
if (sstring(gms::versioned_value::REMOVED_TOKEN) == pieces[0]) {
@@ -998,6 +1011,7 @@ void storage_service::on_change(inet_address endpoint, application_state state,
boost::split(pieces, value.value, boost::is_any_of(sstring(versioned_value::DELIMITER_STR)));
if (pieces.empty()) {
slogger.warn("Failed to split status in on_change: endpoint={}, app_state={}, value={}", endpoint, state, value);
return;
}
sstring move_name = pieces[0];
if (move_name == sstring(versioned_value::STATUS_BOOTSTRAPPING)) {
@@ -1152,10 +1166,10 @@ void storage_service::set_tokens(std::unordered_set<token> tokens) {
slogger.debug("Setting tokens to {}", tokens);
db::system_keyspace::update_tokens(tokens).get();
auto local_tokens = get_local_tokens().get0();
set_gossip_tokens(local_tokens);
_token_metadata.update_normal_tokens(tokens, get_broadcast_address());
set_mode(mode::NORMAL, "node is now in normal status", true);
replicate_to_all_cores().get();
set_gossip_tokens(local_tokens);
set_mode(mode::NORMAL, "node is now in normal status", true);
}
void storage_service::set_gossip_tokens(const std::unordered_set<dht::token>& local_tokens) {


@@ -269,6 +269,7 @@ private:
gms::feature _digest_multipartition_read_feature;
gms::feature _correct_counter_order_feature;
gms::feature _schema_tables_v3;
gms::feature _correct_non_compound_range_tombstones;
public:
void enable_all_features() {
_range_tombstones_feature.enable();
@@ -279,6 +280,7 @@ public:
_digest_multipartition_read_feature.enable();
_correct_counter_order_feature.enable();
_schema_tables_v3.enable();
_correct_non_compound_range_tombstones.enable();
}
void finish_bootstrapping() {
@@ -2243,6 +2245,10 @@ public:
const gms::feature& cluster_supports_schema_tables_v3() const {
return _schema_tables_v3;
}
bool cluster_supports_reading_correctly_serialized_range_tombstones() const {
return bool(_correct_non_compound_range_tombstones);
}
};
inline future<> init_storage_service(distributed<database>& db, sharded<auth::service>& auth_service) {


@@ -205,7 +205,20 @@ public:
return attr;
}
private:
::mutation_reader setup() {
// Default range sstable reader that will only return mutations that belong to the current shard.
virtual flat_mutation_reader make_sstable_reader(lw_shared_ptr<sstables::sstable_set> ssts) const {
return ::make_local_shard_sstable_reader(_cf.schema(),
std::move(ssts),
query::full_partition_range,
_cf.schema()->full_slice(),
service::get_local_compaction_priority(),
no_resource_tracking(),
nullptr,
::streamed_mutation::forwarding::no,
::mutation_reader::forwarding::no);
}
flat_mutation_reader setup() {
auto ssts = make_lw_shared<sstables::sstable_set>(_cf.get_compaction_strategy().make_sstable_set(_cf.schema()));
auto schema = _cf.schema();
sstring formatted_msg = "[";
@@ -237,15 +250,7 @@ private:
_info->cf = schema->cf_name();
report_start(formatted_msg);
return ::make_range_sstable_reader(_cf.schema(),
ssts,
query::full_partition_range,
_cf.schema()->full_slice(),
service::get_local_compaction_priority(),
no_resource_tracking(),
nullptr,
::streamed_mutation::forwarding::no,
::mutation_reader::forwarding::no);
return make_sstable_reader(std::move(ssts));
}
compaction_info finish(std::chrono::time_point<db_clock> started_at, std::chrono::time_point<db_clock> ended_at) {
@@ -284,8 +289,8 @@ private:
};
}
virtual std::function<bool(const streamed_mutation& sm)> filter_func() const {
return [] (const streamed_mutation& sm) {
virtual std::function<bool(const dht::decorated_key&)> filter_func() const {
return [] (const dht::decorated_key&) {
return true;
};
}
@@ -368,9 +373,9 @@ public:
};
}
virtual std::function<bool(const streamed_mutation& sm)> filter_func() const override {
return [] (const streamed_mutation& sm) {
return dht::shard_of(sm.decorated_key().token()) == engine().cpu_id();
virtual std::function<bool(const dht::decorated_key&)> filter_func() const override {
return [] (const dht::decorated_key& dk) {
return dht::shard_of(dk.token()) == engine().cpu_id();
};
}
@@ -415,15 +420,15 @@ public:
clogger.info("Cleaned {}", formatted_msg);
}
std::function<bool(const streamed_mutation& sm)> filter_func() const override {
std::function<bool(const dht::decorated_key&)> filter_func() const override {
dht::token_range_vector owned_ranges = service::get_local_storage_service().get_local_ranges(_cf.schema()->ks_name());
return [this, owned_ranges = std::move(owned_ranges)] (const streamed_mutation& sm) {
if (dht::shard_of(sm.decorated_key().token()) != engine().cpu_id()) {
return [this, owned_ranges = std::move(owned_ranges)] (const dht::decorated_key& dk) {
if (dht::shard_of(dk.token()) != engine().cpu_id()) {
return false;
}
if (!belongs_to_current_node(sm.decorated_key().token(), owned_ranges)) {
if (!belongs_to_current_node(dk.token(), owned_ranges)) {
return false;
}
return true;
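The cleanup filter above now needs only the decorated key: keep a partition only if it hashes to the local shard and its token lies in a locally owned range. A toy model with integer tokens (shard_of and the (lo, hi) ranges are hypothetical stand-ins for dht::shard_of and dht::token_range):

```python
def make_cleanup_filter(this_shard, shard_of, owned_ranges):
    # Returns a predicate over a token, mirroring the key-based filter.
    def keep(token):
        if shard_of(token) != this_shard:
            return False                 # belongs to another shard
        return any(lo <= token <= hi for lo, hi in owned_ranges)
    return keep

keep = make_cleanup_filter(0, lambda t: t % 2, [(0, 100)])
assert keep(10)          # local shard, owned range
assert not keep(11)      # hashes to shard 1
assert not keep(200)     # local shard, but outside owned ranges
```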
@@ -446,6 +451,19 @@ public:
_info->type = compaction_type::Reshard;
}
// Use a reader that makes sure no non-local mutation will be filtered out.
flat_mutation_reader make_sstable_reader(lw_shared_ptr<sstables::sstable_set> ssts) const override {
return ::make_range_sstable_reader(_cf.schema(),
std::move(ssts),
query::full_partition_range,
_cf.schema()->full_slice(),
service::get_local_compaction_priority(),
no_resource_tracking(),
nullptr,
::streamed_mutation::forwarding::no,
::mutation_reader::forwarding::no);
}
void report_start(const sstring& formatted_msg) const override {
clogger.info("Resharding {}", formatted_msg);
}
@@ -498,7 +516,7 @@ future<compaction_info> compaction::run(std::unique_ptr<compaction> c) {
auto start_time = db_clock::now();
try {
consume_flattened_in_thread(reader, cfc, c->filter_func());
reader.consume_in_thread(std::move(cfc), c->filter_func());
} catch (...) {
delete_sstables_for_interrupted_compaction(c->_info->new_sstables, c->_info->ks, c->_info->cf);
c = nullptr; // make sure writers are stopped while running in thread context


@@ -67,7 +67,7 @@ extern logging::logger clogger;
class incremental_selector_impl {
public:
virtual ~incremental_selector_impl() {}
virtual std::tuple<dht::token_range, std::vector<shared_sstable>, dht::token> select(const dht::token& token) = 0;
virtual std::tuple<dht::token_range, std::vector<shared_sstable>, dht::ring_position> select(const dht::token& token) = 0;
};
class sstable_set_impl {
@@ -139,9 +139,9 @@ sstable_set::incremental_selector::incremental_selector(sstable_set::incremental
sstable_set::incremental_selector::selection
sstable_set::incremental_selector::select(const dht::token& t) const {
if (!_current_token_range || !_current_token_range->contains(t, dht::token_comparator())) {
std::tie(_current_token_range, _current_sstables, _current_next_token) = _impl->select(t);
std::tie(_current_token_range, _current_sstables, _current_next_position) = _impl->select(t);
}
return {_current_sstables, _current_next_token};
return {_current_sstables, _current_next_position};
}
sstable_set::incremental_selector
@@ -176,8 +176,8 @@ public:
incremental_selector(const std::vector<shared_sstable>& sstables)
: _sstables(sstables) {
}
virtual std::tuple<dht::token_range, std::vector<shared_sstable>, dht::token> select(const dht::token& token) override {
return std::make_tuple(dht::token_range::make_open_ended_both_sides(), _sstables, dht::maximum_token());
virtual std::tuple<dht::token_range, std::vector<shared_sstable>, dht::ring_position> select(const dht::token& token) override {
return std::make_tuple(dht::token_range::make_open_ended_both_sides(), _sstables, dht::ring_position::max());
}
};
@@ -293,34 +293,41 @@ public:
, _it(leveled_sstables.begin())
, _end(leveled_sstables.end()) {
}
virtual std::tuple<dht::token_range, std::vector<shared_sstable>, dht::token> select(const dht::token& token) override {
virtual std::tuple<dht::token_range, std::vector<shared_sstable>, dht::ring_position> select(const dht::token& token) override {
auto pr = dht::partition_range::make(dht::ring_position::starting_at(token), dht::ring_position::ending_at(token));
auto interval = make_interval(*_schema, std::move(pr));
auto ssts = _unleveled_sstables;
using namespace dht;
const auto next_token = [&] {
const auto next = std::next(_it);
return next == _end ? dht::maximum_token() : next->first.lower().token();
auto inclusiveness = [] (auto& interval) {
return boost::icl::is_left_closed(interval.bounds()) ? ring_position::token_bound::start : ring_position::token_bound::end;
};
const auto current_token = [&] {
return _it == _end ? dht::maximum_token() : _it->first.lower().token();
const auto next_pos = [&] {
const auto next = std::next(_it);
if (next == _end) {
return ring_position::max();
}
auto& interval = next->first;
return ring_position(interval.lower().token(), inclusiveness(interval));
};
const auto current_pos = [&] {
if (_it == _end) {
return ring_position::max();
}
auto& interval = _it->first;
return ring_position(interval.lower().token(), inclusiveness(interval));
};
while (_it != _end) {
if (boost::icl::contains(_it->first, interval)) {
ssts.insert(ssts.end(), _it->second.begin(), _it->second.end());
return std::make_tuple(to_token_range(_it->first), std::move(ssts), next_token());
return std::make_tuple(to_token_range(_it->first), std::move(ssts), next_pos());
}
// We don't want to skip the current interval if the token lies before it.
if (boost::icl::lower_less(interval, _it->first)) {
return std::make_tuple(dht::token_range::make({token, true}, {_it->first.lower().token(), false}),
std::move(ssts),
current_token());
current_pos());
}
_it++;
}
return std::make_tuple(dht::token_range::make_open_ended_both_sides(), std::move(ssts), dht::maximum_token());
return std::make_tuple(dht::token_range::make_open_ended_both_sides(), std::move(ssts), ring_position::max());
}
};
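The selection loop above now reports a dht::ring_position instead of a bare token as the point where the cached selection must be refreshed. A simplified numeric model (bound inclusiveness is dropped and float('inf') plays the role of ring_position::max(); all names are illustrative):

```python
def select(intervals, unleveled, token):
    # intervals: sorted, disjoint list of ((lo, hi), sstables)
    for i, ((lo, hi), ssts) in enumerate(intervals):
        if lo <= token <= hi:
            # Token falls inside this level's interval.
            nxt = intervals[i + 1][0][0] if i + 1 < len(intervals) else float("inf")
            return (lo, hi), unleveled + ssts, nxt
        if token < lo:
            # Token lies before the current interval: only unleveled
            # sstables apply until the interval begins.
            return (token, lo), unleveled, lo
    return (token, float("inf")), unleveled, float("inf")

ivals = [((10, 20), ["l1"]), ((30, 40), ["l2"])]
assert select(ivals, ["u"], 15) == ((10, 20), ["u", "l1"], 30)
assert select(ivals, ["u"], 25) == ((25, 30), ["u"], 30)
assert select(ivals, ["u"], 50) == ((50, float("inf")), ["u"], float("inf"))
```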


@@ -167,30 +167,31 @@ void compression::segmented_offsets::write(uint64_t bucket_index, uint64_t offse
     std::copy_n(reinterpret_cast<char*>(&value), sizeof(value), _storage[bucket_index].storage.get() + offset_byte);
 }
-void compression::segmented_offsets::update_position_trackers(std::size_t index) const {
+void compression::segmented_offsets::state::update_position_trackers(std::size_t index, uint16_t segment_size_bits,
+        uint32_t segments_per_bucket, uint8_t grouped_offsets) {
     if (_current_index != index - 1) {
         _current_index = index;
-        const uint64_t current_segment_index = _current_index / _grouped_offsets;
-        _current_bucket_segment_index = current_segment_index % _segments_per_bucket;
-        _current_segment_relative_index = _current_index % _grouped_offsets;
-        _current_bucket_index = current_segment_index / _segments_per_bucket;
-        _current_segment_offset_bits = (_current_bucket_segment_index % _segments_per_bucket) * _segment_size_bits;
+        const uint64_t current_segment_index = _current_index / grouped_offsets;
+        _current_bucket_segment_index = current_segment_index % segments_per_bucket;
+        _current_segment_relative_index = _current_index % grouped_offsets;
+        _current_bucket_index = current_segment_index / segments_per_bucket;
+        _current_segment_offset_bits = (_current_bucket_segment_index % segments_per_bucket) * segment_size_bits;
     } else {
         ++_current_index;
         ++_current_segment_relative_index;
         // Crossed segment boundary.
-        if (_current_segment_relative_index == _grouped_offsets) {
+        if (_current_segment_relative_index == grouped_offsets) {
             ++_current_bucket_segment_index;
             _current_segment_relative_index = 0;
             // Crossed bucket boundary.
-            if (_current_bucket_segment_index == _segments_per_bucket) {
+            if (_current_bucket_segment_index == segments_per_bucket) {
                 ++_current_bucket_index;
                 _current_bucket_segment_index = 0;
                 _current_segment_offset_bits = 0;
             } else {
-                _current_segment_offset_bits += _segment_size_bits;
+                _current_segment_offset_bits += segment_size_bits;
             }
         }
     }
@@ -216,41 +217,40 @@ void compression::segmented_offsets::init(uint32_t chunk_size) {
     _segments_per_bucket = params.first.segments_per_bucket;
 }
-uint64_t compression::segmented_offsets::at(std::size_t i) const {
+uint64_t compression::segmented_offsets::at(std::size_t i, compression::segmented_offsets::state& s) const {
     if (i >= _size) {
         throw std::out_of_range(sprint("{}: index {} is out of range", __FUNCTION__, i));
     }
-    update_position_trackers(i);
+    s.update_position_trackers(i, _segment_size_bits, _segments_per_bucket, _grouped_offsets);
-    const uint64_t bucket_base_offset = _storage[_current_bucket_index].base_offset;
-    const uint64_t segment_base_offset = bucket_base_offset + read(_current_bucket_index, _current_segment_offset_bits, _segment_base_offset_size_bits);
+    const uint64_t bucket_base_offset = _storage[s._current_bucket_index].base_offset;
+    const uint64_t segment_base_offset = bucket_base_offset + read(s._current_bucket_index, s._current_segment_offset_bits, _segment_base_offset_size_bits);
-    if (_current_segment_relative_index == 0) {
-        return segment_base_offset;
+    if (s._current_segment_relative_index == 0) {
+        return segment_base_offset;
     }
     return segment_base_offset
-            + read(_current_bucket_index,
-                    _current_segment_offset_bits + _segment_base_offset_size_bits + (_current_segment_relative_index - 1) * _segmented_offset_size_bits,
+            + read(s._current_bucket_index,
+                    s._current_segment_offset_bits + _segment_base_offset_size_bits + (s._current_segment_relative_index - 1) * _segmented_offset_size_bits,
                     _segmented_offset_size_bits);
 }
-void compression::segmented_offsets::push_back(uint64_t offset) {
-    update_position_trackers(_size);
+void compression::segmented_offsets::push_back(uint64_t offset, compression::segmented_offsets::state& s) {
+    s.update_position_trackers(_size, _segment_size_bits, _segments_per_bucket, _grouped_offsets);
-    if (_current_bucket_index == _storage.size()) {
+    if (s._current_bucket_index == _storage.size()) {
         _storage.push_back(bucket{_last_written_offset, std::unique_ptr<char[]>(new char[bucket_size])});
     }
-    const uint64_t bucket_base_offset = _storage[_current_bucket_index].base_offset;
+    const uint64_t bucket_base_offset = _storage[s._current_bucket_index].base_offset;
-    if (_current_segment_relative_index == 0) {
-        write(_current_bucket_index, _current_segment_offset_bits, _segment_base_offset_size_bits, offset - bucket_base_offset);
+    if (s._current_segment_relative_index == 0) {
+        write(s._current_bucket_index, s._current_segment_offset_bits, _segment_base_offset_size_bits, offset - bucket_base_offset);
     } else {
-        const uint64_t segment_base_offset = bucket_base_offset + read(_current_bucket_index, _current_segment_offset_bits, _segment_base_offset_size_bits);
-        write(_current_bucket_index,
-                _current_segment_offset_bits + _segment_base_offset_size_bits + (_current_segment_relative_index - 1) * _segmented_offset_size_bits,
+        const uint64_t segment_base_offset = bucket_base_offset + read(s._current_bucket_index, s._current_segment_offset_bits, _segment_base_offset_size_bits);
+        write(s._current_bucket_index,
+                s._current_segment_offset_bits + _segment_base_offset_size_bits + (s._current_segment_relative_index - 1) * _segmented_offset_size_bits,
                 _segmented_offset_size_bits,
                 offset - segment_base_offset);
     }
@@ -298,14 +298,14 @@ void compression::set_compressor(compressor c) {
 // the end-of-file position (one past the last byte) MUST not be used. If the
 // caller wants to read from the end of file, it should simply read nothing.
 compression::chunk_and_offset
-compression::locate(uint64_t position) const {
+compression::locate(uint64_t position, const compression::segmented_offsets::accessor& accessor) {
     auto ucl = uncompressed_chunk_length();
     auto chunk_index = position / ucl;
     decltype(ucl) chunk_offset = position % ucl;
-    auto chunk_start = offsets.at(chunk_index);
+    auto chunk_start = accessor.at(chunk_index);
     auto chunk_end = (chunk_index + 1 == offsets.size())
             ? _compressed_file_length
-            : offsets.at(chunk_index + 1);
+            : accessor.at(chunk_index + 1);
     return { chunk_start, chunk_end - chunk_start, chunk_offset };
 }
@@ -452,6 +452,7 @@ size_t compress_max_size_snappy(size_t input_len) {
 class compressed_file_data_source_impl : public data_source_impl {
     stdx::optional<input_stream<char>> _input_stream;
     sstables::compression* _compression_metadata;
+    sstables::compression::segmented_offsets::accessor _offsets;
     uint64_t _underlying_pos;
     uint64_t _pos;
     uint64_t _beg_pos;
@@ -460,6 +461,7 @@ public:
     compressed_file_data_source_impl(file f, sstables::compression* cm,
             uint64_t pos, size_t len, file_input_stream_options options)
             : _compression_metadata(cm)
+            , _offsets(_compression_metadata->offsets.get_accessor())
     {
         _beg_pos = pos;
         if (pos > _compression_metadata->uncompressed_file_length()) {
@@ -478,8 +480,8 @@ public:
         // _beg_pos and _end_pos specify positions in the compressed stream.
         // We need to translate them into a range of uncompressed chunks,
         // and open a file_input_stream to read that range.
-        auto start = _compression_metadata->locate(_beg_pos);
-        auto end = _compression_metadata->locate(_end_pos - 1);
+        auto start = _compression_metadata->locate(_beg_pos, _offsets);
+        auto end = _compression_metadata->locate(_end_pos - 1, _offsets);
         _input_stream = make_file_input_stream(std::move(f),
             start.chunk_start,
             end.chunk_start + end.chunk_len - start.chunk_start,
@@ -491,7 +493,7 @@ public:
         if (_pos >= _end_pos) {
             return make_ready_future<temporary_buffer<char>>();
         }
-        auto addr = _compression_metadata->locate(_pos);
+        auto addr = _compression_metadata->locate(_pos, _offsets);
         // Uncompress the next chunk. We need to skip part of the first
         // chunk, but then continue to read from beginning of chunks.
         if (_pos != _beg_pos && addr.offset != 0) {
@@ -539,7 +541,7 @@ public:
         if (_pos == _end_pos) {
             return make_ready_future<temporary_buffer<char>>();
         }
-        auto addr = _compression_metadata->locate(_pos);
+        auto addr = _compression_metadata->locate(_pos, _offsets);
         auto underlying_n = addr.chunk_start - _underlying_pos;
         _underlying_pos = addr.chunk_start;
         _beg_pos = _pos;
