Compare commits


133 Commits
next-2.3 ... mv

Author SHA1 Message Date
Duarte Nunes
0ec3ff0611 Merge 'Add ALLOW FILTERING metrics' from Piotr
"
This series addresses issue #3575 by adding 3 ALLOW FILTERING related
metrics to help profile queries:
 * number of read requests that required filtering
 * total number of rows read that required filtering
 * number of rows read that required filtering and matched

Tests: unit (release)
"

* 'allow_filtering_metrics_4' of https://github.com/psarna/scylla:
  cql3: publish ALLOW FILTERING metrics
  cql3: add updating ALLOW FILTERING metrics
  cql3: define ALLOW FILTERING metrics
2018-07-06 11:19:37 +01:00
Piotr Sarna
4a435e6f66 cql3: publish ALLOW FILTERING metrics
ALLOW FILTERING related metrics are registered and published.

Fixes #3575
2018-07-06 12:00:37 +02:00
Piotr Sarna
03f2f8633b cql3: add updating ALLOW FILTERING metrics
Metrics related to ALLOW FILTERING queries are now properly
updated on read requests.
2018-07-06 12:00:29 +02:00
Piotr Sarna
8cb242ab0b cql3: define ALLOW FILTERING metrics
The following metrics are defined for ALLOW FILTERING:
 * number of read requests that required filtering
 * total number of rows read that required filtering
 * number of rows read that required filtering and matched
2018-07-06 10:43:18 +02:00
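The relation between the three counters can be sketched as follows. This is an illustrative Python model with invented names, not Scylla's C++ seastar metrics code:

```python
# Sketch of the three ALLOW FILTERING counters described above.
# Class and attribute names are illustrative, not Scylla's metric ids.
class FilteringMetrics:
    def __init__(self):
        self.requests_filtered = 0   # read requests that required filtering
        self.rows_read = 0           # total rows read that required filtering
        self.rows_matched = 0        # rows read that also matched the filter

    def on_filtered_read(self, rows, predicate):
        """Account one filtered read request and return the matching rows."""
        self.requests_filtered += 1
        matched = [r for r in rows if predicate(r)]
        self.rows_read += len(rows)
        self.rows_matched += len(matched)
        return matched

m = FilteringMetrics()
m.on_filtered_read([1, 2, 3, 4], lambda r: r % 2 == 0)
```

The ratio of `rows_matched` to `rows_read` is what makes these metrics useful for profiling: a low ratio means the query reads far more than it returns.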
Raphael S. Carvalho
dfd1e1229e sstables/compaction_manager: fix typo in function name to reevaluate postponed compaction
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20180702185343.26682-1-raphaelsc@scylladb.com>
2018-07-05 18:54:14 +03:00
Takuya ASADA
4df982fe07 dist/common/scripts/scylla_sysconfig_setup: fix typo
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180705133313.16934-1-syuu@scylladb.com>
2018-07-05 16:38:14 +03:00
Avi Kivity
7a1bcd9ad3 Merge "Improve mutation printing in GDB" from Tomasz
"
This is a series of patches which make it possible for a human to examine
contents of cache or memtables from GDB.
"

* 'tgrabiec/gdb-cache-printers' of github.com:tgrabiec/scylla:
  gdb: Add pretty printer for managed_vector
  gdb: Add pretty printer for rows
  gdb: Add mutation_partition pretty printer
  gdb: Add pretty printer for partition_entry
  gdb: Add pretty printer for managed_bytes
  gdb: Add iteration wrapper for intrusive_set_external_comparator
  gdb: Add iteration wrapper for boost intrusive set
2018-07-05 14:08:58 +03:00
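The printers added in this series live in scylla-gdb.py and follow GDB's pretty-printer protocol: a `to_string()` summary plus a `children()` generator. A minimal sketch of that shape, using a plain dict as a stand-in for a `gdb.Value` so the example runs outside GDB:

```python
# Shape of a gdb pretty printer like those added in this series.
# The real printers receive a gdb.Value; a plain dict stands in for it
# here so the sketch can run outside gdb. Names are illustrative.
class ManagedVectorPrinter:
    """Illustrative printer: shows the size and yields elements as children."""
    def __init__(self, val):
        self.val = val

    def to_string(self):
        return 'managed_vector of size {}'.format(len(self.val['data']))

    def children(self):
        for i, elem in enumerate(self.val['data']):
            yield '[{}]'.format(i), elem

p = ManagedVectorPrinter({'data': [10, 20, 30]})
```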
Avi Kivity
f55a2fe3a7 main: improve reporting of dns resolution errors
A report that C-Ares returned some errors tells the user nothing.

Improve the error message by including the name of the configuration
variable and its value.
Message-Id: <20180705084959.10872-1-avi@scylladb.com>
2018-07-05 10:24:41 +01:00
Duarte Nunes
c126b00793 Merge 'ALLOW FILTERING support' from Piotr
"
The main idea of this series is to provide a filtering_visitor
as a specialised result_set_builder::visitor implementation
that keeps restriction info and applies it on query results.
Also, since allow_filtering checking is currently incorrect at the
select_statement level (e.g. #2025), this series tries to fix any
issues related to it.

Still in TODO:
 * handling CONTAINS relation in single column restriction filtering
 * handling multi-column restrictions - especially EQ, which can be
   split into multiple single-column restrictions
 * more tests - it's never enough; especially esoteric cases
   like filtering queries which also use secondary indexes,
   paging tests, etc.

Tests: unit (release)
"

* 'allow_filtering_6' of https://github.com/psarna/scylla:
  tests: add allow_filtering tests to cql_query_test
  cql3: enable ALLOW FILTERING
  service: add filtering_pager
  cql3: optimize filtering partition keys and static rows
  cql3: add filtering visitor
  cql3: move result_set_builder functions to header
  cql3: amend need_filtering()
  cql3: add single column primary key restrictions getters
  cql3: expose single column primary key restrictions
  cql3: add needs_filtering to primary key restrictions
  cql3: add simpler single_column_restriction::is_satisfied_by
2018-07-05 10:18:08 +01:00
Piotr Sarna
a7dd02309f tests: add allow_filtering tests to cql_query_test
Test cases for ALLOW FILTERING are added to cql_query_test suite.
2018-07-05 10:50:43 +02:00
Piotr Sarna
27bf20aa3f cql3: enable ALLOW FILTERING
Enables 'ALLOW FILTERING' queries by transferring control
to result_set_builder::filtering_visitor.
Both regular and primary key columns are allowed,
but some things are left unimplemented:
 - multi-column restrictions
 - CONTAINS queries

Fixes #2025
2018-07-05 10:50:43 +02:00
Piotr Sarna
7b018f6fd6 service: add filtering_pager
For paged results of an 'ALLOW FILTERING' query, a filtering pager
is provided. It's based on a filtering_visitor for result_builder.
2018-07-05 10:50:43 +02:00
Piotr Sarna
a08fba19e3 cql3: optimize filtering partition keys and static rows
If any restriction on the partition key or the static row fails, it
fails for every row that belongs to the partition. Hence, the full
check of the remaining rows is skipped.
2018-07-05 10:50:43 +02:00
Piotr Sarna
2a0b720102 cql3: add filtering visitor
In order to filter results of an 'ALLOW FILTERING' query,
a visitor that can take optional filter for result_builder
is provided. It defaults to nop_filter, which accepts
all rows.
2018-07-05 10:50:43 +02:00
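The visitor described above can be modeled roughly as below; the names are illustrative stand-ins for the C++ classes in cql3, not the actual API:

```python
# Sketch of the filtering-visitor idea: a result-set visitor that takes
# an optional filter and defaults to a nop filter accepting every row.
# All names are invented for this sketch.
def nop_filter(row):
    return True

class filtering_visitor:
    def __init__(self, row_filter=nop_filter):
        self.row_filter = row_filter
        self.rows = []

    def accept_row(self, row):
        # Only rows passing the filter make it into the result set.
        if self.row_filter(row):
            self.rows.append(row)

v = filtering_visitor(lambda row: row['age'] >= 18)
for r in [{'age': 15}, {'age': 21}, {'age': 30}]:
    v.accept_row(r)
```

With the default `nop_filter`, the visitor degenerates to the unfiltered case, which is why a single code path can serve both filtered and plain queries.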
Piotr Sarna
1cf5653f89 cql3: move result_set_builder functions to header
Moving function definitions to header is a preparation step
before turning result_set_builder into a template.
2018-07-05 10:50:43 +02:00
Piotr Sarna
4d3d32f465 cql3: amend need_filtering()
The previous implementation of need_filtering() was too eager to assume
that an index query should be used, whereas sometimes a query should
just be filtered.
2018-07-05 10:50:39 +02:00
Avi Kivity
dd083122f9 Update scylla-ami submodule
* dist/ami/files/scylla-ami 0fd9d23...67293ba (1):
  > scylla_install_ami: fix broken argument parser

Fixes #3578.
2018-07-05 09:48:06 +03:00
Avi Kivity
f4caa418ff Merge "Fix the "LCS data-loss bug"" from Botond
"
This series fixes the "LCS data-loss bug" where full scans (and
everything that uses them) would miss some small percentage (> 0.001%)
of the keys. This could easily lead to permanent data-loss as compaction
and decommission both use full scans.
aeffbb673 worked around this bug by disabling the incremental reader
selectors (the class identified as the source of the bug) altogether.
This series fixes the underlying issue and reverts aeffbb673.

The root cause of the bug is that the `incremental_reader_selector` uses
the current read position to poll for new readers using
`sstable_set::incremental_selector::select()`. This means that when the
currently open sstables contain no partitions that would intersect with
some of the yet unselected sstables, those sstables would be ignored.
Solve the problem by not calling `select()` with the current read
position and always pass the `next_position` returned in the previous
call. This means that the traversal of the sstable-set happens at a pace
defined by the sstable-set itself and this guarantees that no sstable
will be jumped over. When asked for new readers the
`incremental_reader_selector` will now iteratively call `select()` using
the `next_position` from the previous `select()` call until it either
receives some new, yet unselected sstables, or `next_position` surpasses
the read position (in which case `select()` will be tried again later).
The `sstable_set::incremental_selector` was not suitable in its present
state to support calling `select()` with the `next_position` from a
previous call as in some cases it could not make progress due to
inclusiveness-related ambiguities. So in preparation for the above fix
`sstable_set` was updated to work in terms of ring-position instead of
tokens. Ring-position can express positions in a much more fine-grained
way than tokens, including positions after/before tokens and keys. This
allows for a clear expression of `next_position` such that calling
`select()` with it guarantees forward progress in the token-space.

Tests: unit(release, debug)

Refs: #3513
"

* 'leveled-missing-keys/v4' of https://github.com/denesb/scylla:
  tests/mutation_reader_test: combined_mutation_reader_test: use SEASTAR_THREAD_TEST_CASE
  tests/mutation_reader_test: refactor combined_mutation_reader_test
  tests/mutation_reader_test: fix reader_selector related tests
  Revert "database: stop using incremental selectors"
  incremental_reader_selector: don't jump over sstables
  mutation_reader: reader_selector: use ring_position instead of token
  sstables_set::incremental_selector: use ring_position instead of token
  compatible_ring_position: refactor to compatible_ring_position_view
  dht::ring_position_view: use token_bound from ring_position
  i_partitioner: add free function ring-position tri comparator
  mutation_reader_merger::maybe_add_readers(): remove early return
  mutation_reader_merger: get rid of _key
2018-07-05 09:33:12 +03:00
Takuya ASADA
3bcc123000 dist/ami: hardcode target for scylla_current_repo since we don't have --target option anymore
build_ami.sh broke when we dropped Ubuntu support: the scylla_current_repo
command does not finish because it is invoked with too few arguments
('--target' with no distribution name, since $TARGET is always blank now).
The target needs to be hardcoded as centos.

Fixes #3577

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180705035251.29160-1-syuu@scylladb.com>
2018-07-05 09:31:43 +03:00
Paweł Dziepak
07a429e837 test.py: do not disable human-readable format with --jenkins flag
When test.py is run with --jenkins flag Boost UTF is asked to generate
an XML file with the test results. This automatically disables the
human-readable output printed to stdout. There is no real reason to do
so and it is actually less confusing when the Boost UTF messages are in
the test output together with Scylla logger messages.

Message-Id: <20180704172913.23462-1-pdziepak@scylladb.com>
2018-07-05 09:31:15 +03:00
Raphael S. Carvalho
7d6af5da3a sstables/compaction_manager: properly reevaluate postponed compactions for leveled strategy
The function to reevaluate postponed compactions was called too early for
strategies that don't allow parallel compaction (only leveled strategy
(LCS) at this moment). Such strategies must first have the ongoing
compaction deregistered before reevaluating the postponed ones. The
manager uses the task list of ongoing compactions to decide whether
there's an ongoing compaction for a given column family. So compaction
could stop making progress entirely, but *only if* we also stop flushing
new data.

So it could happen that a column family would be left with lots of pending
compactions, leading the user to think all compaction is done, but after a
reboot there will be lots of compaction activity.

We both improve the method to detect parallel compaction here and add a
call to reevaluate postponed compactions after a compaction is done.

Fixes #3534.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20180702185327.26615-1-raphaelsc@scylladb.com>
2018-07-04 16:30:21 +01:00
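The ordering fixed by this commit can be sketched as follows: the finished compaction must be deregistered before postponed ones are reevaluated, otherwise the column family still appears busy. A toy Python model with invented names:

```python
# Toy model of the compaction manager ordering fix. Names are invented;
# only the deregister-then-reevaluate ordering mirrors the commit.
class compaction_manager:
    def __init__(self):
        self.tasks = set()        # column families with an ongoing compaction
        self.postponed = set()    # column families waiting for a slot
        self.started = []         # log of compactions actually started

    def can_compact(self, cf):
        return cf not in self.tasks   # no parallel compaction per CF (LCS)

    def submit(self, cf):
        if self.can_compact(cf):
            self.tasks.add(cf)
            self.started.append(cf)
        else:
            self.postponed.add(cf)

    def on_compaction_done(self, cf):
        self.tasks.discard(cf)            # deregister first...
        for p in sorted(self.postponed):  # ...then reevaluate postponed
            self.postponed.discard(p)
            self.submit(p)

mgr = compaction_manager()
mgr.submit('cf1')
mgr.submit('cf1')              # postponed: cf1 is already compacting
mgr.on_compaction_done('cf1')  # postponed compaction now gets to run
```

Swapping the two steps in `on_compaction_done` reproduces the bug: the reevaluated CF would still be in `tasks` and get postponed forever.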
Botond Dénes
b32f94d31e tests/mutation_reader_test: combined_mutation_reader_test: use SEASTAR_THREAD_TEST_CASE 2018-07-04 17:42:37 +03:00
Botond Dénes
77ad085393 tests/mutation_reader_test: refactor combined_mutation_reader_test
Make combined_mutation_reader_test more interesting:
* Set the levels on the sstables
* Arrange the sstables so that they test for the "jump over sstables"
bug.
* Arrange the sstables so that they test for the "gap between sstables"
case.

While at it also make the code more compact.
2018-07-04 17:42:37 +03:00
Botond Dénes
4b57fc9aea tests/mutation_reader_test: fix reader_selector related tests
Don't assume the partition keys use lexical ordering. Add some
additional checks.
2018-07-04 17:42:37 +03:00
Botond Dénes
a9c465d7d2 Revert "database: stop using incremental selectors"
The data-loss bug is fixed, the incremental selector can be used again.

This reverts commit aeffbb6732.
2018-07-04 17:42:37 +03:00
Botond Dénes
c37aff419e incremental_reader_selector: don't jump over sstables
Passing the current read position to the
`incremental_selector::select()` can lead to "jumping" through sstables.
This can happen when none of the currently open sstables has a partition
that intersects with a yet-unselected sstable whose range nevertheless
intersects the read; in other words, there is a gap in the selected
sstables that this unselected one completely fits into. In this case the
unselected sstable will be completely omitted from the read.
The solution is to avoid calling `select()` with a position that is
larger than the `next_position` returned from the previous `select()`
call. Instead, call `select()` repeatedly with the `next_position` from
the previous call, until either at least one new sstable is selected or
the current read position is surpassed. In other words, the incremental
selector advances at a pace defined by itself, which guarantees that no
sstable will be jumped over.
2018-07-04 17:42:37 +03:00
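The fixed selection loop can be sketched like this; positions are plain integers and the selector is a stand-in, so this only illustrates the control flow, not the real `incremental_reader_selector`:

```python
# Sketch of the fixed loop: keep calling select() with the selector's own
# next_position until new sstables show up or the read position is
# surpassed. Positions are plain ints for illustration.
def select_next_readers(selector, read_position):
    while True:
        sstables, next_position = selector.select()
        if sstables:                       # got new, yet-unselected sstables
            return sstables
        if next_position > read_position:  # surpassed: try again later
            return []

class fake_selector:
    """Stand-in yielding pre-baked (sstables, next_position) steps."""
    def __init__(self, steps):
        self.steps = iter(steps)

    def select(self):
        return next(self.steps)

# A gap at position 5, then an sstable at position 8: the loop walks
# through the gap instead of jumping over the sstable.
sel = fake_selector([([], 5), (['sst-a'], 8)])
found = select_next_readers(sel, read_position=10)
```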
Botond Dénes
81a03db955 mutation_reader: reader_selector: use ring_position instead of token
sstable_set::incremental_selector was migrated to ring_position; follow
suit and migrate the reader_selector to use ring_position as well. Beyond
correctness, this also improves efficiency for dense tables by avoiding
prematurely selecting sstables that share a token but start at different
keys, although one could argue that this is a niche case.
2018-07-04 17:42:37 +03:00
Botond Dénes
a8e795a16e sstables_set::incremental_selector: use ring_position instead of token
Currently `sstable_set::incremental_selector` works in terms of tokens.
Sstables can be selected with tokens and internally the token-space is
partitioned (in `partitioned_sstable_set`, used for LCS) with tokens as
well. This is problematic for several reasons.
The sub-range sstables cover from the token-space is defined in terms of
decorated keys. It is even possible that multiple sstables cover
multiple non-overlapping sub-ranges of a single token. The current
system is unable to model this and will at best result in selecting
unnecessary sstables.
The usage of token for providing the next position where the
intersecting sstables change [1] causes further problems. Attempting to
walk over the token-space by repeatedly calling `select()` with the
`next_position` returned from the previous call will quite possibly lead
to an infinite loop as a token cannot express inclusiveness/exclusiveness
and thus the incremental selector will not be able to make progress when
the upper and lower bounds of two neighbouring intervals share the same
token with different inclusiveness e.g. [t1, t2](t2, t3].

To solve these problems update incremental_selector to work in terms of
ring position. This makes it possible to partition the token-space
among sstables at decorated-key granularity. It also makes it possible
for select() to return a next_position that is guaranteed to make
progress.

partitioned_sstable_set now builds the internal interval map using the
decorated key of the sstables, not just the tokens.
incremental_selector::select() now uses `dht::ring_position_view` as
both the selector and the next_position. ring_position_view can express
positions between keys so it can also include information about
inclusiveness/exclusiveness of the next interval guaranteeing forward
progress.

[1] `sstable_set::incremental_selector::selection::next_position`
2018-07-04 17:42:33 +03:00
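The extra precision of ring positions over tokens can be modeled with a (token, weight, key) triple, where the weight orders the "before/after all keys of a token" bounds around the key positions themselves. A simplified sketch, not the actual `dht::` comparator:

```python
# Simplified model of ring-position ordering. A position is
# (token, weight, key): weight -1 sorts before all keys owning the
# token, 0 is a concrete key, +1 sorts after all keys owning the token.
BEFORE_ALL_KEYS, AT_KEY, AFTER_ALL_KEYS = -1, 0, 1

def ring_position_tri_cmp(a, b):
    """Return <0, 0 or >0, comparing two (token, weight, key) triples."""
    atok, aw, akey = a
    btok, bw, bkey = b
    if atok != btok:
        return -1 if atok < btok else 1
    if aw != bw:                      # bounds order around the keys
        return -1 if aw < bw else 1
    if aw == AT_KEY and akey != bkey:
        return -1 if akey < bkey else 1
    return 0
```

A bare token cannot distinguish the inclusive upper bound of [t1, t2] from the exclusive lower bound of (t2, t3]; with weights, (t2, +1) sorts strictly after every key position (t2, 0, k), so `select()` can always make forward progress.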
Duarte Nunes
33d7de0805 Merge 'Expose sharding information to connections' from Avi
"
In the same way that drivers can route requests to a coordinator that
is also a replica of the data used by the request, we can allow
drivers to route requests directly to the shard. This patchset
adds and documents a way for drivers to know which shard a connection
is connected to, and how to perform this routing.
"

* tag 'shard-info-alt/v1' of https://github.com/avikivity/scylla:
  doc: documented protocol extension for exposing sharding
  transport: expose more information about sharding via the OPTIONS/SUPPORTED messages
  dht: add i_partitioner::sharding_ignore_msb()
2018-07-04 13:01:21 +01:00
Botond Dénes
8084ce3a8e query_pager: use query::is_single_partition() to check for singular range
Use query::is_single_partition() to check whether the queried ranges are
singular or not. The current method of using
`dht::partition_range::is_singular()` is incorrect, as it is possible to
build a singular range that doesn't represent a single partition.
`query::is_single_partition()` correctly checks for this so use it
instead.

Found during code-review.

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <f671f107e8069910a2f84b14c8d22638333d571c.1530675889.git.bdenes@scylladb.com>
2018-07-04 10:04:50 +01:00
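The distinction can be sketched like this: a singular range whose bound is a bare token is not a single partition, since many keys can share a token. These are hypothetical stand-ins for `dht::partition_range` and `query::is_single_partition()`:

```python
# Stand-ins illustrating singular-range vs single-partition. Names are
# invented; only the distinction mirrors the commit message.
class bound:
    def __init__(self, token, key=None):
        self.token, self.key = token, key

class partition_range:
    def __init__(self, start, end=None):
        self.start = start
        self.end = end if end is not None else start

    def is_singular(self):
        return self.start is self.end    # start and end are the same bound

def is_single_partition(r):
    # A singular range is one partition only if its bound names a full key.
    return r.is_singular() and r.start.key is not None

by_key = partition_range(bound(token=42, key='pk1'))
by_token = partition_range(bound(token=42))   # singular, but many keys
```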
Takuya ASADA
3cb7ddaf68 dist/debian/build_deb.sh: make build_deb.sh more simplified
Use is_debian()/is_ubuntu() to detect the target distribution; also
install pystache by path, since the package name differs between Fedora
and CentOS.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180703193224.4773-1-syuu@scylladb.com>
2018-07-04 11:12:26 +03:00
Takuya ASADA
ed1d0b6839 dist/ami/files/.bash_profile: drop Ubuntu support
Drop Ubuntu support on login prompt, too.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180703192813.4589-1-syuu@scylladb.com>
2018-07-04 11:12:26 +03:00
Piotr Sarna
f42eaff75e cql3: add single column primary key restrictions getters
Getters for single column partition/clustering key restrictions
are added to statement_restrictions.
2018-07-04 09:48:32 +02:00
Piotr Sarna
a99acbc376 cql3: expose single column primary key restrictions
Underlying single_column_restrictions are exposed
for single_column_primary_key_restrictions via a const method.
2018-07-04 09:48:32 +02:00
Piotr Sarna
f7a2f15935 cql3: add needs_filtering to primary key restrictions
Primary key restrictions sometimes require filtering. These functions
return true if ALLOW FILTERING needs to be enabled in order to satisfy
these restrictions.
2018-07-04 09:48:32 +02:00
Piotr Sarna
6aec9e711f cql3: add simpler single_column_restriction::is_satisfied_by
Currently restriction::is_satisfied_by() accepts only keys and rows
as arguments. In this commit, a version that only takes bytes of data
is provided.
This simpler version applies to single_column_restriction only,
because it compares raw bytes underneath anyway. For other restriction
types, simplified is_satisfied_by is not defined.
2018-07-04 09:48:32 +02:00
Botond Dénes
bf2645c616 compatible_ring_position: refactor to compatible_ring_position_view
compatible_ring_position's sole purpose is to allow creating
boost::icl::interval_map with dht::ring_position as the key and list of
sstables as the value. This purpose is served equally well if
compatible_ring_position wraps a `dht::ring_position_view` instead of a
`dht::ring_position` with the added benefit of not having to copy the
possibly heavy `dht::decorated_key` around. It also makes it possible
to do lookups with `dht::ring_position_view` which is much more
versatile and allows avoiding copies just to make lookups.
The only downside is that `dht::ring_position_view` requires the
lifetime of the "viewed" object to be taken care of. This is not a
concern, however: as long as an interval is present in the map, the
represented sstable is guaranteed to be alive too, as the interval map
participates in the ownership of the stored sstables.

Rename compatible_ring_position to compatible_ring_position_view to
reflect the changes.
While at it upgrade the std::experimental::optional to std::optional.
2018-07-04 08:19:39 +03:00
Botond Dénes
48b07ba5d3 dht::ring_position_view: use token_bound from ring_position
Currently dht::ring_position_view's dht::token constructor takes the
token bound in the form of a raw `uint8_t`. This allows for passing a
weight of "0", which is illegal: a single token does not represent a
single ring position but an interval, as an arbitrary number of keys can
have the same token. dht::ring_position uses an enum in its dht::token
constructor. Import that same enum into the dht::ring_position_view
scope and take a `token_bound` instead of `uint8_t`.
This is especially important as in later patches the internal weight of
the ring_position_view will be exposed and illegal values can cause all
sorts of problems.
2018-07-04 08:19:34 +03:00
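The tightening can be illustrated by accepting only the two legal bounds through an enum rather than a raw integer weight; a hypothetical sketch, not the dht:: code:

```python
# Sketch: accept only the two legal token bounds via an enum instead of
# a raw integer weight. All names are invented for this sketch.
import enum

class token_bound(enum.IntEnum):
    START = -1   # before all keys owning the token
    END = 1      # after all keys owning the token

def make_token_position(token, bound):
    # A raw int (including the illegal weight 0) is rejected outright.
    if not isinstance(bound, token_bound):
        raise TypeError('a raw weight is not a valid token bound')
    return (token, int(bound))

pos = make_token_position(7, token_bound.START)
```

The point of the patch is exactly this: once the internal weight is exposed, an illegal "0" for a token-only position can no longer be constructed by accident.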
Alexys Jacob
8c03c1e2ce Support Gentoo Linux on node_health_check script.
Gentoo Linux was not supported by the node_health_check script
which resulted in the following error message displayed:

"This s a Non-Supported OS, Please Review the Support Matrix"

This patch adds support for Gentoo Linux while adding a TODO note
to add support for authenticated clusters which the script does
not support yet.

Signed-off-by: Alexys Jacob <ultrabug@gentoo.org>
Message-Id: <20180703124458.3788-1-ultrabug@gentoo.org>
2018-07-03 20:18:13 +03:00
Tomasz Grabiec
2ffb621271 Merge "Fix atomic_cell_or_collection::external_memory_usage()" from Paweł
After the transition to the new in-memory representation in
aab6b0ee27 'Merge "Introduce new in-memory
representation for cells" from Paweł'
atomic_cell_or_collection::external_memory_usage() stopped accounting
for the externally stored data. Since it wasn't covered by the unit
tests, the bug remained unnoticed until now.

This series fixes the memory usage calculation and adds proper unit
tests.

* https://github.com/pdziepak/scylla.git fix-external-memory-usage/v1:
  tests/mutation: properly mark atomic_cells that are collection members
  imr::utils::object: expose size overhead
  data::cell: expose size overhead of external chunks
  atomic_cell: add external chunks and overheads to
    external_memory_usage()
  tests/mutation: test external_memory_usage()
2018-07-03 14:58:10 +02:00
Botond Dénes
c236a96d7d tests/cql_query_test: add unit test for querying empty ranges
A bug was found recently (#3564) in the paging logic, where the code
assumed the queried ranges list is non-empty. This assumption is
incorrect, as there can be valid (if rare) queries that result in the
ranges list being empty. Add a unit test that executes such a query with
paging enabled to detect any future bugs related to assumptions about
the ranges list being non-empty.

Refs: #3564
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <f5ba308c4014c24bb392060a7e72e7521ff021fa.1530618836.git.bdenes@scylladb.com>
2018-07-03 13:43:17 +01:00
Botond Dénes
59a30f0684 query_pager: be prepared to _ranges being empty
do_fetch_page() checks in the beginning whether there is a saved query
state already, meaning this is not the first page. If there is not it
checks whether the query is for a singulular partitions or a range scan
to decide whether to enable the stateful queries or not. This check
assumed that there is at least one range in _ranges which will not hold
under some circumstances. Add a check for _ranges being empty.

Fixes: #3564
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <cbe64473f8013967a93ef7b2104c7ca0507afac9.1530610709.git.bdenes@scylladb.com>
2018-07-03 11:05:01 +01:00
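The added guard can be sketched as follows; the function and mode names are invented, and only the decision order mirrors the description above:

```python
# Sketch of the guard added in do_fetch_page(): check for an empty
# ranges list before classifying the query. Names are invented.
def choose_query_mode(saved_state, ranges):
    if saved_state is not None:
        return 'continue-saved-query'      # not the first page
    if not ranges:                         # the previously missing check
        return 'empty-result'
    if len(ranges) == 1 and ranges[0].get('single_partition'):
        return 'singular-query'
    return 'range-scan'

mode = choose_query_mode(None, [])
```

Without the `not ranges` check, the singular/range-scan classification would index into an empty list, which is exactly the failure described in #3564.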
Avi Kivity
eafd16266d tests: reduce multishard_mutation_test runtime in debug mode
Debug mode is so slow that generating 1000 mutations is too much for it.
High memory use can also confuse the sanitizers that track each allocation.

Reduce mutation count from 1000 to 10 in debug mode.
2018-07-03 12:01:44 +03:00
Avi Kivity
a36b1f1967 Merge "more scylla_setup fixes" from Takuya
"
Added NIC / disk existence checks and a --force-raid mode to
scylla_raid_setup.
"

* 'scylla_setup_fix4' of https://github.com/syuu1228/scylla:
  dist/common/scripts/scylla_raid_setup: verify specified disks are unused
  dist/common/scripts/scylla_raid_setup: add --force-raid to construct raid even only one disk is specified
  dist/common/scripts/scylla_setup: don't accept disk path if it's not block device
  dist/common/scripts/scylla_raid_setup: verify specified disk paths are block device
  dist/common/scripts/scylla_sysconfig_setup: verify NIC existance
2018-07-03 11:03:08 +03:00
Takuya ASADA
d0f39ea31d dist/common/scripts/scylla_raid_setup: verify specified disks are unused
Currently only scylla_setup's interactive mode verifies that the selected
disks are unused; in non-interactive mode we get mdadm/mkfs.xfs errors
and a Python backtrace when the disks are busy.

So we should also verify that disks are unused in scylla_raid_setup, and
print a simpler error message.
2018-07-03 14:50:34 +09:00
Takuya ASADA
3289642223 dist/common/scripts/scylla_raid_setup: add --force-raid to construct raid even only one disk is specified
A user may want to start a RAID volume with only one disk, so add an
option to force constructing the RAID even when only one disk is
specified.
2018-07-03 14:50:34 +09:00
Takuya ASADA
e0c16c4585 dist/common/scripts/scylla_setup: don't accept disk path if it's not block device
Ignore the input when the specified path is not a block device.
2018-07-03 14:50:34 +09:00
Takuya ASADA
24ca2d85c6 dist/common/scripts/scylla_raid_setup: verify specified disk paths are block device
Verify that disk paths are block devices; exit with an error if not.
2018-07-03 14:50:34 +09:00
Takuya ASADA
99b5cf1f92 dist/common/scripts/scylla_sysconfig_setup: verify NIC existance
Verify NIC existence before writing the sysconfig file, to prevent
errors while running scylla.

See #2442
2018-07-03 14:50:34 +09:00
Takuya ASADA
084c824d12 scripts: merge scylla_install_pkg to scylla-ami
scylla_install_pkg was initially written for the one-liner installer, but
now it is only used for creating the AMI, and it is just a few lines of
code, so it should be merged into the scylla_install_ami script.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180612150106.26573-2-syuu@scylladb.com>
2018-07-02 13:20:09 +03:00
Takuya ASADA
fafcacc31c dist/ami: drop Ubuntu AMI support
Drop the Ubuntu AMI since it hasn't been maintained for a long time, and
we have no plan to officially provide it.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180612150106.26573-1-syuu@scylladb.com>
2018-07-02 13:20:08 +03:00
Avi Kivity
677991f353 Update scylla-ami submodule
* dist/ami/files/scylla-ami 36e8511...0fd9d23 (2):
  > scylla_install_ami: merge scylla_install_pkg
  > scylla_install_ami: drop Ubuntu AMI
2018-07-02 13:19:34 +03:00
Botond Dénes
01bd34d117 i_partitioner: add free function ring-position tri comparator
Having to create an object just to compare two ring positions (or views)
is annoying and unnecessary. Provide a free function version as well.
2018-07-02 11:41:09 +03:00
Botond Dénes
78ecf2740a mutation_reader_merger::maybe_add_readers(): remove early return
It's unnecessary (doesn't prevent anything). The code without it
expresses intent better (and is shorter by two lines).
2018-07-02 11:41:09 +03:00
Botond Dénes
d26b35b058 mutation_reader_merger: get rid of _key
`_key` is only used in a single place and this does not warrant storing
it in a member. Also get rid of current_position() which was used to
query `_key`.
2018-07-02 11:40:43 +03:00
Avi Kivity
0b148d0070 Merge "scylla_setup fixes" from Takuya
"
I found problems in the previously submitted patchsets 'scylla_setup
fixes' and 'more fixes for scylla_setup', so I fixed them and merged
them into one patchset.

Also added a few more patches.
"

* 'scylla_setup_fix3' of https://github.com/syuu1228/scylla:
  dist/common/scripts/scylla_setup: allow input multiple disk paths on RAID disk prompt
  dist/common/scripts/scylla_raid_setup: skip constructing RAID0 when only one disk specified
  dist/common/scripts/scylla_raid_setup: fix module import
  dist/common/scripts/scylla_setup: check disk is used in MDRAID
  dist/common/scripts/scylla_setup: move unmasking scylla-fstrim.timer on scylla_fstrim_setup
  dist/common/scripts/scylla_setup: use print() instead of logging.error()
  dist/common/scripts/scylla_setup: implement do_verify_package() for Gentoo Linux
  dist/common/scripts/scylla_coredump_setup: run os.remove() when deleting directory is symlink
  dist/common/scripts/scylla_setup: don't include the disk on unused list when it contains partitions
  dist/common/scripts/scylla_setup: skip running rest of the check when the disk detected as used
  dist/common/scripts/scylla_setup: add a disk to selected list correctly
  dist/common/scripts/scylla_setup: fix wrong indent
  dist/common/scripts: sync instance type list for detect NIC type to latest one
  dist/common/scripts: verify systemd unit existance using 'systemctl cat'
2018-07-02 10:21:49 +03:00
Avi Kivity
a45c3aa8c7 Merge "Fix handling of stale write replies in storage_proxy" from Gleb
"
If a coordinator sends a write request with ID=X and restarts, it may get
a reply to that request after it has restarted and sent another request
with the same ID (but to different replicas). This condition will trigger
an assert in the coordinator. Drop the assertion in favor of a warning and
initialize the handler id in a way that makes this situation less likely.

Fixes: #3153
"

* 'gleb/write-handler-id' of github.com:scylladb/seastar-dev:
  storage_proxy: initialize write response id counter from wall clock value
  storage_proxy: drop virtual from signal(gms::inet_address)
  storage_proxy: do not assert on getting an unexpected write reply
2018-07-01 17:59:54 +03:00
Gleb Natapov
19e7493d5b storage_proxy: initialize write response id counter from wall clock value
Initializing the write response id to the same value on each reboot may
cause a stale id to be taken for an active one, if the node restarts
after sending only a couple of write requests and before receiving the
replies. On the next reboot it will start assigning ids from the same
value, and receiving old replies will confuse it. Mitigate this by
assigning the initial id from the wall clock value in milliseconds. This
will not solve the problem completely, but will mitigate it.
2018-07-01 17:24:40 +03:00
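The mitigation amounts to seeding the id counter from the wall clock instead of a fixed value; an illustrative sketch, not the storage_proxy implementation:

```python
# Sketch: seed the write-response id counter from the wall clock in
# milliseconds so ids do not restart from the same value after a reboot.
# Names are invented for this sketch.
import itertools
import time

def make_response_id_counter(now_ms=None):
    if now_ms is None:
        now_ms = int(time.time() * 1000)   # wall clock, milliseconds
    return itertools.count(start=now_ms)

# Fixed seed here only to make the sketch deterministic.
ids = make_response_id_counter(now_ms=1_000)
first, second = next(ids), next(ids)
```

As the commit notes, this only mitigates the problem: a reboot faster than one millisecond of assigned ids could still collide, but the window shrinks from "always" to "practically never".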
Nadav Har'El
3194ce16b3 repair: fix combination of "-pr" and "-local" repair options
When nodetool repair is used with the combination of the "-pr" (primary
range) and "-local" (only repair with nodes in the same DC) options,
Scylla needs to define the "primary ranges" differently: Rather than
assign one node in the entire cluster to be the primary owner of every
token, we need one node in each data-center - so that a "-local"
repair will cover all the tokens.

Fixes #3557.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20180701132445.21685-1-nyh@scylladb.com>
2018-07-01 16:39:33 +03:00
Gleb Natapov
569437aaa5 storage_proxy: drop virtual from signal(gms::inet_address)
The function is not overridden, so should not be virtual.
2018-07-01 16:35:59 +03:00
Gleb Natapov
5ee09e5f3b storage_proxy: do not assert on getting an unexpected write reply
In theory we should not get a write reply from a node we did not send a
write to, but in practice a stale reply can be received if the node
reboots between sending the write and getting the reply. Do not assert;
log a warning instead and ignore the reply.

Fixes: #3153
2018-07-01 16:35:09 +03:00
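The behaviour change can be sketched as: an unexpected reply is logged and dropped instead of tripping an assert. The names below are illustrative, not the storage_proxy code:

```python
# Sketch of tolerating stale write replies: log and ignore instead of
# asserting. Names are invented for this sketch.
import logging

log = logging.getLogger('proxy')

def got_response(handler_targets, replying_node):
    if replying_node not in handler_targets:
        # Previously an assert; now a warning, and the reply is dropped.
        log.warning('ignoring stale write reply from %s', replying_node)
        return False
    handler_targets.discard(replying_node)  # reply accounted for
    return True

targets = {'10.0.0.1', '10.0.0.2'}
ok = got_response(targets, '10.0.0.3')      # node we never wrote to
```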
Tomasz Grabiec
b464b66e90 row_cache: Fix memtable reads concurrent with cache update missing writes
Introduced in 5b59df3761.

It is incorrect to erase entries from the memtable being moved to
cache if partition update can be preempted because a later memtable
read may create a snapshot in the memtable before memtable writes for
that partition are made visible through cache. As a result the read
may miss some of the writes which were in the memtable. The code was
checking for presence of snapshots when entering the partition, but
this condition may change if update is preempted. The fix is to not
allow erasing if update is preemptible.

This also caused SIGSEGVs because we were assuming that no such
snapshots will be created and hence were not invalidating iterators on
removal of the entries, which results in undefined behavior when such
snapshots are actually created.

Fixes SIGSEGV in dtest: limits_test.py:TestLimits.max_cells_test

Fixes #3532

Message-Id: <1530129009-13716-1-git-send-email-tgrabiec@scylladb.com>
2018-07-01 15:36:05 +03:00
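The rule from the fix can be condensed into a toy model (hypothetical structure; partitions are plain dicts here, nothing like the real mvcc code): the memtable entry may be dropped only when the merge into cache cannot be preempted.

```python
def move_partition_to_cache(cache, memtable, key, preemptible):
    """Merge one partition from the memtable into the cache.

    Erasing the memtable entry is only safe when the update is
    non-preemptible: under preemption, a concurrent memtable read could
    create a snapshot before these writes are visible through the cache,
    and an erased entry would make that read miss them."""
    cache.setdefault(key, {}).update(memtable[key])
    if not preemptible:
        del memtable[key]
    return key in memtable   # True if the entry was kept
```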
Avi Kivity
f3da043230 Merge "Make in-memory partition version merging preemptable" from Tomasz
"
Partition snapshots go away when the last read using the snapshot is done.
Currently we will synchronously attempt to merge partition versions on this event.
If partitions are large, that may stall the reactor for a significant amount of time,
depending on the size of newer versions. Cache update on memtable flush can
create especially large versions.

The solution implemented in this series is to allow merging to be preemptable,
and continue in the background. Background merging is done by the mutation_cleaner
associated with the container (memtable, cache). There is a single merging process
per mutation_cleaner. The merging worker runs in a separate scheduling group,
introduced here, called "mem_compaction".

When the last user of a snapshot goes away the snapshot is slid to the
oldest unreferenced version first, so that the version is no longer reachable
from partition_entry::read(). The cleaner will then keep merging preceding
(newer) versions into it, until it merges a version which is referenced. The
merging is preemptable. If the initial merging is preempted, the snapshot is
enqueued into the cleaner, the worker is woken up, and merging continues
asynchronously.

When memtable is merged with cache, its cleaner is merged with cache cleaner,
so any outstanding background merges will be continued by the cache cleaner
without disruption.

This reduces scheduling latency spikes in tests/perf_row_cache_update
for the case of large partition with many rows. For -c1 -m1G I saw
them dropping from >23ms to 1-2ms. System-level benchmark using scylla-bench
shows a similar improvement.
"

* tag 'tgrabiec/merge-snapshots-gradually-v4' of github.com:tgrabiec/scylla:
  tests: perf_row_cache_update: Test with an active reader surviving memtable flush
  memtable, cache: Run mutation_cleaner worker in its own scheduling group
  mutation_cleaner: Make merge() redirect old instance to the new one
  mvcc: Use RAII to ensure that partition versions are merged
  mvcc: Merge partition version versions gradually in the background
  mutation_partition: Make merging preemtable
  tests: mvcc: Use the standard maybe_merge_versions() to merge snapshots
2018-07-01 15:32:51 +03:00
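The scheme described in the merge message can be sketched as a tiny model (invented names and shapes; real versions hold partition data, modelled here as row counts folded oldest-first): merging runs under a work budget and, when preempted, is handed off to a background queue, and merge() lets a memtable's cleaner hand its outstanding work to the cache's cleaner.

```python
from collections import deque


class MutationCleaner:
    def __init__(self, budget):
        self.budget = budget     # units of work before preemption
        self._queue = deque()    # snapshots whose merge was preempted

    def merge_versions(self, versions):
        """Fold newer versions into the oldest. Returns True if finished
        synchronously, False if preempted and queued for background work."""
        work = 0
        while len(versions) > 1:
            if work >= self.budget:          # preemption point
                self._queue.append(versions)
                return False
            versions[0] += versions.pop(1)   # merge next-newer version in
            work += 1
        return True

    def drain(self):
        """Background worker: finish all preempted merges."""
        while self._queue:
            versions = self._queue.popleft()
            while len(versions) > 1:
                versions[0] += versions.pop(1)

    def merge(self, other):
        """Adopt another cleaner's outstanding work (memtable -> cache)."""
        self._queue.extend(other._queue)
        other._queue.clear()
```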
Avi Kivity
8eba27829a doc: documented protocol extension for exposing sharding
Document a protocol extension that exposes the sharding algorithm
to drivers, and recommend how to use it to achieve connection-per-core.
2018-07-01 15:26:30 +03:00
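With the extension, a driver computes the owning shard from a token using the advertised shard count and ignore_msb parameters. A sketch of that computation follows (my reading of the algorithm; the document added by this commit is the authoritative definition):

```python
def shard_of(token, nr_shards, ignore_msb):
    """Map a signed 64-bit Murmur3 token to a shard.

    The token is biased into unsigned space, the top `ignore_msb` bits are
    shifted out, and the result is scaled into [0, nr_shards) with a
    64x64 -> high-64 multiplication."""
    biased = (token + 2**63) % 2**64         # signed -> unsigned
    biased = (biased << ignore_msb) % 2**64  # discard the ignored high bits
    return (biased * nr_shards) >> 64
```

A driver can then open one connection per shard and route each request on the connection whose shard matches the request's partition token.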
Avi Kivity
28d064e7c0 transport: expose more information about sharding via the OPTIONS/SUPPORTED messages
Provide all information needed for a connection pool to set up a connection
per shard.
2018-07-01 15:26:28 +03:00
Botond Dénes
5fd9c3b9d4 tests/mutation_reader_test: require min shard-count for multishard tests
Tests exercising different aspects of `foreign_reader` and
`multishard_combining_reader` are designed to run with a certain minimum
shard count. Running them with any shard count below this minimum makes
them useless at best and can even make them fail.
Refuse to run these tests when the shard count is below the required
minimum, to avoid an accidental and unnecessary investigation into a
false-positive test failure.

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <d24159415b6a9d74eafb8355b6e3fba98c1ff7ff.1530274392.git.bdenes@scylladb.com>
2018-07-01 12:44:41 +03:00
Avi Kivity
f73340e6f8 Merge "Index reader and associated types clean-up." from Vladimir
"
This patchset paves way to support for reading SSTables 3.x index files.
It aims at streamlining and tidying up the existing index_reader and
helpers and brings no functional or high-level changes.

In v3:
  - do not capture 'found' and just return 'true' in the continuation
    inside advance_and_check_if_present()
  - split code that makes the use of advance_upper_past() internal-only
    into two commits for better readability

GitHub URL: https://github.com/argenet/scylla/tree/projects/sstables-30/index_reader_cleanup/v3

Tests: unit {release}

Performance tests (perf_fast_forward) did not reveal any noticeable
changes. The complete output is below.

========================================
Original code (before the patchset)
========================================
running: large-partition-skips
Testing scanning large partition with skips.
Reads whole range interleaving reads with skips according to read-skip pattern:
read    skip      time (s)     frags     frag/s    aio      (KiB) blocked dropped  idx hit idx miss  idx blk    c hit   c miss    c blk    cpu
1       0         0.336514   1000000    2971642   1000     126956      35       0        0        0        0        0        0        0  99.5%
1       1         1.411239    500000     354299    993     127056       2       0        0        1        1        0        0        0  99.9%
1       8         0.464468    111112     239224    993     127056       2       0        0        1        1        0        0        0  99.8%
1       16        0.330490     58824     177990    993     127056      12       0        0        1        1        0        0        0  99.7%
1       32        0.257010     30304     117910    993     127056      15       0        0        1        1        0        0        0  99.7%
1       64        0.213650     15385      72010    997     127072     268       0        0        3        3        0        0        0  99.5%
1       256       0.159498      3892      24402    993     127056     245       0        0        1        1        0        0        0  95.5%
1       1024      0.088678       976      11006    993     127056     347       0        0        1        1        0        0        0  63.4%
1       4096      0.082627       245       2965    649      22452     389     252        0        1        1        0        0        0  20.0%
64      1         0.411080    984616    2395191   1059     127056      57       1        0        1        1        0        0        0  99.1%
64      8         0.390130    888896    2278461    993     127056       2       0        0        1        1        0        0        0  99.8%
64      16        0.369033    800000    2167828    993     127056       3       0        0        1        1        0        0        0  99.8%
64      32        0.338126    666688    1971714    993     127056      10       0        0        1        1        0        0        0  99.7%
64      64        0.297335    500032    1681711    997     127072      18       0        0        3        3        0        0        0  99.7%
64      256       0.199420    200000    1002910    993     127056     211       0        0        1        1        0        0        0  99.5%
64      1024      0.113953     58880     516704    993     127056     284       0        0        1        1        0        0        0  64.1%
64      4096      0.094596     15424     163051    687      23684     415     248        0        1        1        0        0        0  23.7%

running: large-partition-slicing
Testing slicing of large partition:
offset  read      time (s)     frags     frag/s    aio      (KiB) blocked dropped  idx hit idx miss  idx blk    c hit   c miss    c blk    cpu
0       1         0.000586         1       1706      3        164       2       1        0        1        1        0        0        0   9.0%
0       32        0.000587        32      54539      3        164       2       1        0        1        1        0        0        0   9.9%
0       256       0.000688       256     372343      4        196       2       1        0        1        1        0        0        0  20.7%
0       4096      0.004320      4096     948185     19        676      10       1        0        1        1        0        0        0  36.7%
500000  1         0.000882         1       1134      5        228       3       2        0        1        1        0        0        0  14.3%
500000  32        0.000881        32      36321      5        228       3       2        0        1        1        0        0        0  14.3%
500000  256       0.000961       256     266386      6        260       3       2        0        1        1        0        0        0  21.9%
500000  4096      0.003127      4096    1309805     21        740      14       2        0        1        1        0        0        0  54.0%

running: large-partition-slicing-clustering-keys
Testing slicing of large partition using clustering keys:
offset  read      time (s)     frags     frag/s    aio      (KiB) blocked dropped  idx hit idx miss  idx blk    c hit   c miss    c blk    cpu
0       1         0.000639         1       1564      3        164       2       0        0        1        1        0        0        0  13.9%
0       32        0.000626        32      51154      3        164       2       0        0        1        1        0        0        0  15.3%
0       256       0.000716       256     357560      4        168       2       0        0        1        1        0        0        0  23.1%
0       4096      0.003681      4096    1112743     16        680       8       1        0        1        1        0        0        0  38.5%
500000  1         0.000966         1       1035      4        424       3       2        0        1        1        0        0        0  12.4%
500000  32        0.000911        32      35121      5        296       3       1        0        1        1        0        0        0  13.1%
500000  256       0.000978       256     261645      5        296       3       1        0        1        1        0        0        0  19.1%
500000  4096      0.003155      4096    1298139     11        744       6       1        0        1        1        0        0        0  44.5%

running: large-partition-slicing-single-key-reader
Testing slicing of large partition, single-partition reader:
offset  read      time (s)     frags     frag/s    aio      (KiB) blocked dropped  idx hit idx miss  idx blk    c hit   c miss    c blk    cpu
0       1         0.000756         1       1323      4        484       2       0        0        1        1        0        0        0  11.3%
0       32        0.000625        32      51174      3        164       2       0        0        1        1        0        0        0  15.5%
0       256       0.000705       256     363337      4        196       2       0        0        1        1        0        0        0  24.3%
0       4096      0.003603      4096    1136829     16        900       8       1        0        1        1        0        0        0  44.4%
500000  1         0.000880         1       1136      5        228       3       3        0        1        1        0        0        0  12.6%
500000  32        0.000882        32      36268      5        228       3       1        0        1        1        0        0        0  14.0%
500000  256       0.000965       256     265178      6        260       3       1        0        1        1        0        0        0  20.8%
500000  4096      0.003098      4096    1322024     21        740      14       2        0        1        1        0        0        0  54.6%

running: large-partition-select-few-rows
Testing selecting few rows from a large partition:
stride  rows      time (s)     frags     frag/s    aio      (KiB) blocked dropped  idx hit idx miss  idx blk    c hit   c miss    c blk    cpu
1000000 1         0.000631         1       1585      3        164       2       2        0        1        1        0        0        0  15.2%
500000  2         0.000873         2       2291      5        228       3       2        0        1        1        0        0        0  13.2%
250000  4         0.001404         4       2850      9        356       5       4        0        1        1        0        0        0  11.9%
125000  8         0.002878         8       2779     21        740      13       8        0        1        1        0        0        0  15.5%
62500   16        0.005184        16       3087     41       1380      25      16        0        1        1        0        0        0  19.3%
2       500000    0.948899    500000     526926   1040     127056      39       0        0        1        1        0        0        0  99.9%

running: large-partition-forwarding
Testing forwarding with clustering restriction in a large partition:
pk-scan   time (s)     frags     frag/s    aio      (KiB) blocked dropped  idx hit idx miss  idx blk    c hit   c miss    c blk    cpu
yes       0.001813         2       1103     11       1380       3       8        0        1        1        0        0        0  18.5%
no        0.000922         2       2170      5        228       3       1        0        1        1        0        0        0  14.1%

running: small-partition-skips
Testing scanning small partitions with skips.
Reads whole range interleaving reads with skips according to read-skip pattern:
   read    skip      time (s)     frags     frag/s    aio      (KiB) blocked dropped  idx hit idx miss  idx blk    c hit   c miss    c blk    cpu
-> 1       0         1.023396   1000000     977139   1104     139668      12       0        0        2        2        0        0        0  99.7%
-> 1       1         2.176794    500000     229696   6200     177660    5109       0        0     5108     7679        0        0        0  69.9%
-> 1       8         1.130179    111112      98314   6200     177660    5109       0        0     5108     9647        0        0        0  41.5%
-> 1       16        0.972022     58824      60517   6200     177660    5109       0        0     5108     9913        0        0        0  32.0%
-> 1       32        0.880783     30304      34406   6201     177664    5110       0        0     5108    10057        0        0        0  25.2%
-> 1       64        0.829019     15385      18558   6199     177656    5108       0        0     5107    10135        0        0        0  20.4%
-> 1       256       2.248487      3892       1731   5028     168948    3937       0        0     3936     7801        0        0        0   4.6%
-> 1       1024      0.342806       976       2847   2076     146948     985     105        0      984     1955        0        0        0   9.3%
-> 1       4096      0.088605       245       2765    739      18152     492     246        0      247      490        0        0        0  11.1%
-> 64      1         1.796715    984616     548009   6274     177660    5120       0        0     5108     5187        0        0        0  63.1%
-> 64      8         1.688994    888896     526287   6200     177660    5109       0        0     5108     5674        0        0        0  61.2%
-> 64      16        1.593196    800000     502135   6200     177660    5109       0        0     5108     6143        0        0        0  58.7%
-> 64      32        1.438651    666688     463412   6200     177660    5109       0        0     5108     6807        0        0        0  56.5%
-> 64      64        1.290205    500032     387560   6200     177660    5109       0        0     5108     7660        0        0        0  49.2%
-> 64      256       2.136466    200000      93613   5252     170616    4161       0        0     4160     6267        0        0        0  13.8%
-> 64      1024      0.388871     58880     151413   2317     148784    1226     107        0     1225     1844        0        0        0  23.4%
-> 64      4096      0.107253     15424     143809    807      19100     562     244        0      321      482        0        0        0  24.2%

running: small-partition-slicing
Testing slicing small partitions:
offset  read      time (s)     frags     frag/s    aio      (KiB) blocked dropped  idx hit idx miss  idx blk    c hit   c miss    c blk    cpu
0       1         0.002773         1        361      3         68       2       0        0        1        1        0        0        0  10.5%
0       32        0.002905        32      11015      3         68       2       0        0        1        1        0        0        0  11.6%
0       256       0.003170       256      80764      4        104       2       0        0        1        1        0        0        0  17.8%
0       4096      0.008125      4096     504095     20        616      11       1        0        1        1        0        0        0  54.1%
500000  1         0.002914         1        343      3         72       2       0        0        1        2        0        0        0  10.7%
500000  32        0.002967        32      10786      3         72       2       0        0        1        2        0        0        0  12.6%
500000  256       0.003338       256      76685      5        112       3       0        0        2        2        0        0        0  17.4%
500000  4096      0.008495      4096     482141     21        624      12       1        0        2        2        0        0        0  52.3%

========================================
With the patchset
========================================

running: large-partition-skips
Testing scanning large partition with skips.
Reads whole range interleaving reads with skips according to read-skip pattern:
read    skip      time (s)     frags     frag/s    aio      (KiB) blocked dropped  idx hit idx miss  idx blk    c hit   c miss    c blk    cpu
1       0         0.340110   1000000    2940229   1000     126956      42       0        0        0        0        0        0        0  97.5%
1       1         1.401352    500000     356798    993     127056       2       0        0        1        1        0        0        0  99.9%
1       8         0.463124    111112     239918    993     127056       2       0        0        1        1        0        0        0  99.8%
1       16        0.330050     58824     178228    993     127056      11       0        0        1        1        0        0        0  99.7%
1       32        0.255981     30304     118384    993     127056       8       0        0        1        1        0        0        0  99.7%
1       64        0.215160     15385      71505    997     127072     263       0        0        3        3        0        0        0  99.4%
1       256       0.159702      3892      24370    993     127056     239       0        0        1        1        0        0        0  95.6%
1       1024      0.094403       976      10339    993     127056     298       0        0        1        1        0        0        0  58.9%
1       4096      0.082501       245       2970    649      22452     391     252        0        1        1        0        0        0  20.1%
64      1         0.415227    984616    2371272   1059     127056      52       1        0        1        1        0        0        0  99.3%
64      8         0.391556    888896    2270166    993     127056       2       0        0        1        1        0        0        0  99.8%
64      16        0.372075    800000    2150102    993     127056       4       0        0        1        1        0        0        0  99.7%
64      32        0.337454    666688    1975641    993     127056      15       0        0        1        1        0        0        0  99.7%
64      64        0.296345    500032    1687333    997     127072      21       0        0        3        3        0        0        0  99.7%
64      256       0.199221    200000    1003911    993     127056     204       0        0        1        1        0        0        0  99.4%
64      1024      0.118224     58880     498037    993     127056     275       0        0        1        1        0        0        0  61.8%
64      4096      0.095098     15424     162191    687      23684     417     248        0        1        1        0        0        0  23.7%

running: large-partition-slicing
Testing slicing of large partition:
offset  read      time (s)     frags     frag/s    aio      (KiB) blocked dropped  idx hit idx miss  idx blk    c hit   c miss    c blk    cpu
0       1         0.000585         1       1709      3        164       2       1        0        1        1        0        0        0  10.7%
0       32        0.000589        32      54353      3        164       2       1        0        1        1        0        0        0  10.0%
0       256       0.000688       256     372293      4        196       2       1        0        1        1        0        0        0  20.7%
0       4096      0.004336      4096     944562     19        676      10       1        0        1        1        0        0        0  36.9%
500000  1         0.000877         1       1140      5        228       3       2        0        1        1        0        0        0  13.6%
500000  32        0.000883        32      36222      5        228       3       2        0        1        1        0        0        0  14.4%
500000  256       0.000963       256     265804      6        260       3       2        0        1        1        0        0        0  22.0%
500000  4096      0.003008      4096    1361779     21        740      17       2        0        1        1        0        0        0  56.7%

running: large-partition-slicing-clustering-keys
Testing slicing of large partition using clustering keys:
offset  read      time (s)     frags     frag/s    aio      (KiB) blocked dropped  idx hit idx miss  idx blk    c hit   c miss    c blk    cpu
0       1         0.000623         1       1604      3        164       2       0        0        1        1        0        0        0  13.9%
0       32        0.000624        32      51261      3        164       2       0        0        1        1        0        0        0  14.7%
0       256       0.000714       256     358484      4        168       2       0        0        1        1        0        0        0  22.6%
0       4096      0.003687      4096    1110990     16        680       8       1        0        1        1        0        0        0  38.6%
500000  1         0.000973         1       1028      4        424       3       2        0        1        1        0        0        0  12.1%
500000  32        0.000914        32      35022      5        296       3       1        0        1        1        0        0        0  12.8%
500000  256       0.000986       256     259646      5        296       3       1        0        1        1        0        0        0  19.7%
500000  4096      0.003155      4096    1298122     11        744       6       1        0        1        1        0        0        0  44.5%

running: large-partition-slicing-single-key-reader
Testing slicing of large partition, single-partition reader:
offset  read      time (s)     frags     frag/s    aio      (KiB) blocked dropped  idx hit idx miss  idx blk    c hit   c miss    c blk    cpu
0       1         0.000766         1       1305      4        484       2       0        0        1        1        0        0        0  12.2%
0       32        0.000626        32      51111      3        164       2       0        0        1        1        0        0        0  15.2%
0       256       0.000710       256     360563      4        196       2       0        0        1        1        0        0        0  25.2%
0       4096      0.003963      4096    1033440     16        900       8       1        0        1        1        0        0        0  40.2%
500000  1         0.000877         1       1141      5        228       3       1        0        1        1        0        0        0  12.7%
500000  32        0.000882        32      36272      5        228       3       1        0        1        1        0        0        0  14.2%
500000  256       0.000959       256     266937      6        260       3       1        0        1        1        0        0        0  21.1%
500000  4096      0.003103      4096    1319992     21        740      14       2        0        1        1        0        0        0  53.9%

running: large-partition-select-few-rows
Testing selecting few rows from a large partition:
stride  rows      time (s)     frags     frag/s    aio      (KiB) blocked dropped  idx hit idx miss  idx blk    c hit   c miss    c blk    cpu
1000000 1         0.000631         1       1586      3        164       2       2        0        1        1        0        0        0  13.8%
500000  2         0.000872         2       2295      5        228       3       2        0        1        1        0        0        0  13.4%
250000  4         0.001483         4       2698      9        356       5       4        0        1        1        0        0        0  11.2%
125000  8         0.002894         8       2764     21        740      13       8        0        1        1        0        0        0  15.6%
62500   16        0.005182        16       3087     41       1380      25      16        0        1        1        0        0        0  19.5%
2       500000    0.942943    500000     530255   1040     127056      38       0        0        1        1        0        0        0  99.9%

running: large-partition-forwarding
Testing forwarding with clustering restriction in a large partition:
pk-scan   time (s)     frags     frag/s    aio      (KiB) blocked dropped  idx hit idx miss  idx blk    c hit   c miss    c blk    cpu
yes       0.001807         2       1107     11       1380       3       8        0        1        1        0        0        0  18.9%
no        0.000924         2       2165      5        228       3       1        0        1        1        0        0        0  14.1%

running: small-partition-skips
Testing scanning small partitions with skips.
Reads whole range interleaving reads with skips according to read-skip pattern:
   read    skip      time (s)     frags     frag/s    aio      (KiB) blocked dropped  idx hit idx miss  idx blk    c hit   c miss    c blk    cpu
-> 1       0         1.009953   1000000     990145   1104     139668      11       0        0        2        2        0        0        0  99.7%
-> 1       1         2.213846    500000     225851   6200     177660    5109       0        0     5108     7679        0        0        0  70.3%
-> 1       8         1.150029    111112      96617   6200     177660    5109       0        0     5108     9647        0        0        0  42.3%
-> 1       16        0.989438     58824      59452   6200     177660    5109       0        0     5108     9913        0        0        0  33.2%
-> 1       32        0.891590     30304      33989   6201     177664    5110       0        0     5108    10057        0        0        0  26.4%
-> 1       64        0.840952     15385      18295   6199     177656    5108       0        0     5107    10135        0        0        0  21.6%
-> 1       256       2.247875      3892       1731   5028     168948    3937       0        0     3936     7801        0        0        0   5.0%
-> 1       1024      0.345917       976       2821   2076     146948     985     105        0      984     1955        0        0        0  10.0%
-> 1       4096      0.088806       245       2759    739      18152     492     246        0      247      490        0        0        0  11.6%
-> 64      1         1.821995    984616     540406   6274     177660    5119       0        0     5108     5187        0        0        0  63.9%
-> 64      8         1.715052    888896     518291   6200     177660    5109       0        0     5108     5674        0        0        0  61.9%
-> 64      16        1.620385    800000     493710   6200     177660    5109       0        0     5108     6143        0        0        0  59.4%
-> 64      32        1.464497    666688     455233   6200     177660    5109       0        0     5108     6807        0        0        0  56.9%
-> 64      64        1.311386    500032     381300   6200     177660    5109       0        0     5108     7660        0        0        0  50.0%
-> 64      256       2.153954    200000      92853   5252     170616    4161       0        0     4160     6267        0        0        0  14.3%
-> 64      1024      0.350275     58880     168097   2317     148784    1226     107        0     1225     1844        0        0        0  27.5%
-> 64      4096      0.107498     15424     143482    807      19100     562     244        0      321      482        0        0        0  24.5%

running: small-partition-slicing
Testing slicing small partitions:
offset  read      time (s)     frags     frag/s    aio      (KiB) blocked dropped  idx hit idx miss  idx blk    c hit   c miss    c blk    cpu
0       1         0.002872         1        348      3         68       2       0        0        1        1        0        0        0  10.2%
0       32        0.002833        32      11297      3         68       2       0        0        1        1        0        0        0  12.1%
0       256       0.003145       256      81404      4        104       2       0        0        1        1        0        0        0  17.9%
0       4096      0.008110      4096     505079     20        616      12       1        0        1        1        0        0        0  54.4%
500000  1         0.002934         1        341      3         72       2       1        0        1        2        0        0        0  10.6%
500000  32        0.002871        32      11145      3         72       2       0        0        1        2        0        0        0  12.0%
500000  256       0.003216       256      79598      5        112       3       0        0        2        2        0        0        0  18.3%
500000  4096      0.008557      4096     478692     21        624      12       1        0        2        2        0        0        0  51.9%
"

* 'projects/sstables-30/index_reader_cleanup/v3' of https://github.com/argenet/scylla:
  sstables: Remove "lower_" from index_reader public methods.
  sstables: Make index_reader::advance_upper_past() method private.
  sstables: Stop using index_reader::advance_upper_past() outside the class.
  sstables: Move promoted_index_block from types.hh to index_entry.hh.
  sstables: Factor out promoted index into a separate class.
  sstables: Use std::optional instead of std::experimental optional in index_reader.
2018-07-01 12:30:29 +03:00
Botond Dénes
da53ea7a13 tests.py: add --jobs command line parameter
Allows setting the number of jobs to use when running the tests.

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <d58d6393c6271bffc37ab3b5edc37b00ef485d9c.1529433590.git.bdenes@scylladb.com>
2018-07-01 12:26:41 +03:00
Avi Kivity
db2c029f7a dht: add i_partitioner::sharding_ignore_msb()
While the sharding algorithm is exposed (as cpu_sharding_algorithm_name()),
the ignore_msb parameter is not. Add a function that exposes it.
2018-07-01 12:17:35 +03:00
Vladimir Krivopalov
b24eb5c11d sstables: Remove "lower_" from index_reader public methods.
The index_reader class public interface has been amended so that users
only deal with the lower bound cursor; the upper bound cursor is handled
internally. Since the class users can only explicitly operate on the lower
bound cursor (take the data file position, advance to the next partition,
etc.), it no longer makes sense for a method name to specify that it
operates on the lower bound cursor.

Signed-off-by: Vladimir Krivopalov <vladimir@scylladb.com>
2018-06-29 11:48:33 -07:00
Vladimir Krivopalov
30109a693b sstables: Make index_reader::advance_upper_past() method private.
No changes made to the code except that it is moved around.

Signed-off-by: Vladimir Krivopalov <vladimir@scylladb.com>
2018-06-29 11:47:48 -07:00
Vladimir Krivopalov
80d1d5017f sstables: Stop using index_reader::advance_upper_past() outside the class.
The only case when it needs to be called is when an index_reader is
advanced to a specific partition as part of sstable_reader
initialisation.

Instead, we're passing an optional upper_bound parameter that is used to
call advance_upper_past() internally if the partition is found.

Signed-off-by: Vladimir Krivopalov <vladimir@scylladb.com>
2018-06-29 11:47:20 -07:00
Duarte Nunes
0db5419ec5 Merge 'Avoid copies when unfreezing frozen_mutation' from Paweł
"
When a frozen mutation is deserialised, the current implementation copies
its value three times: from the IDL buffer to a bytes object, from the
bytes object to an atomic_cell, and then the atomic_cell is copied again.
Moreover, the value gets linearised, which may cause a large allocation.

All of that is very wasteful. This patch devirtualises and reworks IDL
reading code so that when used with partition_builder the cell value is
copied only once and without linearisation: from the IDL buffer to the
final atomic_cell.

perf_simple_query -c4, medians of 30 results:
        ./perf_before  ./perf_after  diff
 read       310576.54     316273.90  1.8%
 write      359913.15     375579.44  4.4%

microbenchmark, perf_idl:

BEFORE
test                                      iterations      median         mad         min         max
frozen_mutation.freeze_one_small_row         2142435   462.431ns     0.125ns   462.306ns   467.659ns
frozen_mutation.unfreeze_one_small_row       1640949   601.422ns     0.082ns   601.340ns   605.279ns
frozen_mutation.apply_one_small_row          1538969   645.993ns     0.405ns   645.588ns   656.510ns

AFTER
test                                      iterations      median         mad         min         max
frozen_mutation.freeze_one_small_row         2139548   455.525ns     0.631ns   454.894ns   456.707ns
frozen_mutation.unfreeze_one_small_row       1760139   566.157ns     0.003ns   566.153ns   584.339ns
frozen_mutation.apply_one_small_row          1582050   610.951ns     0.060ns   610.891ns   613.044ns

Tests: unit(release)
"

* tag 'avoid-copy-unfreeze/v2' of https://github.com/pdziepak/scylla:
  mutation_partition_view: use column_mapping_entry::is_atomic()
  schema: column_mapping_entry: cache abstract_type::is_atomic()
  schema: column_mapping_entry: reduce logic duplication
  mutation_partition_view: do not linearise or copy cell value
  atomic_cell: allow passing value via ser::buffer_view
  mutation_partition_view: pass cell by value to visitor
  mutation_partition_view: devirtualise accept()
  storage_proxy: use mutation_partition_view::{first, last}_row_key()
  mutation_partition_view: add last_row_key() and first_row_key() getters
2018-06-28 22:55:20 +01:00
Paweł Dziepak
c45e291084 mutation_partition_view: use column_mapping_entry::is_atomic() 2018-06-28 22:16:42 +01:00
Paweł Dziepak
6c54a97320 schema: column_mapping_entry: cache abstract_type::is_atomic()
IDL deserialisation code calls is_atomic() for each cell. An additional
indirection and a virtual call can be avoided by caching that value in
column_mapping_entry. There is already a very similar optimisation done
for column_definitions.
2018-06-28 22:16:42 +01:00
Paweł Dziepak
2bfdc2d781 schema: column_mapping_entry: reduce logic duplication
User-defined constructors make it more likely that a careless developer
will forget to update one of them when adding a new member to a
structure. The risk of that happening can be reduced by cutting code
duplication with delegating constructors.
2018-06-28 22:16:42 +01:00
Paweł Dziepak
199f9196e9 mutation_partition_view: do not linearise or copy cell value 2018-06-28 22:11:19 +01:00
Paweł Dziepak
92700c6758 atomic_cell: allow passing value via ser::buffer_view 2018-06-28 22:11:19 +01:00
Paweł Dziepak
bf330a99f0 mutation_partition_view: pass cell by value to visitor
mutation_partition_view needs to create an atomic_cell from
IDL-serialised data. That cell is then passed to the visitor. However,
because the generic mutation_partition_visitor interface was used, the
cell was passed by constant reference, which forced the visitor to
needlessly copy it.

This patch takes advantage of the fact that mutation_partition_view is
now devirtualised and adjusts the interfaces of its visitors so that the
cell can be passed without copying.
2018-06-28 22:11:19 +01:00
Paweł Dziepak
569176aad1 mutation_partition_view: devirtualise accept()
There are only two types of visitors used and only one of them appears
in the hot path. They can be devirtualised without too much effort,
which also enables future custom interface specialisations specific to
mutation_partition_view and its users, not necessarily within the scope
of the more general mutation_partition_visitor.
2018-06-28 22:11:19 +01:00
Paweł Dziepak
6bd71015e7 storage_proxy: use mutation_partition_view::{first, last}_row_key() 2018-06-28 22:11:19 +01:00
Paweł Dziepak
2259eee97c mutation_partition_view: add last_row_key() and first_row_key() getters
Some users (e.g. reconciliation code) need only to know the clustering
key of the first or the last row in the partition. This was done with a
full visitor visiting every single cell of the partition, which is very
wasteful. This patch adds direct getters for the needed information.
2018-06-28 22:11:19 +01:00
Vladimir Krivopalov
a497edcbda sstables: Move promoted_index_block from types.hh to index_entry.hh.
It is only used internally by index_reader and never exposed, so it
should not be listed among commonly used types.

Signed-off-by: Vladimir Krivopalov <vladimir@scylladb.com>
2018-06-28 12:28:59 -07:00
Vladimir Krivopalov
81fba73e9d sstables: Factor out promoted index into a separate class.
An index entry may or may not have a promoted index. All the optional
fields are better scoped under a single class, avoiding lots of separate
optional fields and giving a better representation.

Signed-off-by: Vladimir Krivopalov <vladimir@scylladb.com>
2018-06-28 12:28:59 -07:00
Asias He
bb4d361cf6 storage_service: Limit the number of times the REPLICATION_FINISHED verb can be retried
In the removenode operation, if the messaging service is stopped, e.g.
due to disk I/O error isolation, the node can keep retrying the
REPLICATION_FINISHED verb indefinitely.

A Scylla log full of such messages was observed:

[shard 0] storage_service - Fail to send REPLICATION_FINISHED to $IP:0:
seastar::rpc::closed_error (connection is closed)

To fix, limit the number of retries.

Tests: update_cluster_layout_tests.py

Fixes #3542

Message-Id: <638d392d6b39cc2dd2b175d7f000e7fb1d474f87.1529927816.git.asias@scylladb.com>
2018-06-28 19:54:01 +01:00
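The bounded-retry pattern the commit above describes can be sketched as follows (a hypothetical Python sketch for illustration; the actual fix lives in Scylla's C++ storage_service, and the retry cap shown here is an assumed value):

```python
def send_with_retry(send, max_retries=3):
    """Attempt `send` up to `max_retries` times instead of retrying forever."""
    last_error = None
    for _ in range(max_retries):
        try:
            return send()
        except ConnectionError as e:
            # e.g. seastar::rpc::closed_error ("connection is closed") in the real code
            last_error = e
    raise RuntimeError("giving up after %d retries" % max_retries) from last_error
```

Without the cap, a permanently closed connection turns the loop above into an infinite retry storm, which is exactly the behaviour the commit removes.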
Paweł Dziepak
e9dffc753c tests/mutation: test external_memory_usage() 2018-06-28 19:20:23 +01:00
Paweł Dziepak
8153df7684 atomic_cell: add external chunks and overheads to external_memory_usage() 2018-06-28 19:20:23 +01:00
Paweł Dziepak
2dc78a6ca2 data::cell: expose size overhead of external chunks 2018-06-28 18:01:17 +01:00
Paweł Dziepak
6adc78d690 imr::utils::object: expose size overhead 2018-06-28 18:01:17 +01:00
Paweł Dziepak
e69f2c361c tests/mutation: properly mark atomic_cells that are collection members 2018-06-28 18:00:39 +01:00
Takuya ASADA
972ce88601 dist/common/scripts/scylla_setup: allow input multiple disk paths on RAID disk prompt
Allow "/dev/sda1,/dev/sdb1" style input on RAID disk prompt.
2018-06-29 01:37:19 +09:00
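The accepted input format can be illustrated with a small parsing sketch (the helper name is hypothetical; the real logic lives in scylla_setup):

```python
def parse_raid_disks(answer):
    """Split a RAID-prompt answer like "/dev/sda1,/dev/sdb1" into paths."""
    return [p.strip() for p in answer.split(",") if p.strip()]
```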
Takuya ASADA
a83c66b402 dist/common/scripts/scylla_raid_setup: skip constructing RAID0 when only one disk specified
When only one disk is specified, create XFS directly on it instead of
creating a RAID0 volume on the disk.
2018-06-29 01:37:19 +09:00
Takuya ASADA
99fb754221 dist/common/scripts/scylla_raid_setup: fix module import
The sys module was missing; import it.

Fixes #3548
2018-06-29 01:37:19 +09:00
Takuya ASADA
f2132c61bd dist/common/scripts/scylla_setup: check disk is used in MDRAID
Check whether a disk is used in MDRAID by looking at /proc/mdstat.
2018-06-29 01:37:19 +09:00
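A minimal sketch of such a check (the function name is hypothetical, and it assumes member devices appear as tokens like 'sdb1[0]' on the md array lines of /proc/mdstat):

```python
def disk_in_mdraid(device, mdstat_text):
    """Return True if `device` (e.g. 'sdb1') is an MDRAID member,
    judging from the text of /proc/mdstat."""
    for line in mdstat_text.splitlines():
        # Array lines look like: "md0 : active raid0 sdb1[0] sdc1[1]"
        if line.startswith("md") and (device + "[") in line:
            return True
    return False
```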
Takuya ASADA
daccc10a06 dist/common/scripts/scylla_setup: move unmasking of scylla-fstrim.timer to scylla_fstrim_setup
Currently, enabling scylla-fstrim.timer is part of 'enable-service', so it
is enabled even when --no-fstrim-setup is specified (or 'No' is entered at the interactive setup prompt).

To honour --no-fstrim-setup, we need to enable scylla-fstrim.timer in
scylla_fstrim_setup instead of in the enable-service part of scylla_setup.

Fixes #3248
2018-06-29 01:37:19 +09:00
Takuya ASADA
fa6db21fea dist/common/scripts/scylla_setup: use print() instead of logging.error()
Align with the other scripts and use print().
2018-06-29 01:37:19 +09:00
Takuya ASADA
2401115e14 dist/common/scripts/scylla_setup: implement do_verify_package() for Gentoo Linux
Implement Gentoo Linux support in scylla_setup.
2018-06-29 01:37:19 +09:00
Takuya ASADA
9d537cb449 dist/common/scripts/scylla_coredump_setup: run os.remove() when the directory being deleted is a symlink
Since shutil.rmtree() raises an exception when run on a symlink, we need
to check whether the path is a symlink and call os.remove() if it is.

Fixes #3544
2018-06-29 01:37:19 +09:00
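The resulting pattern looks roughly like this (a sketch of the approach, not the script's exact code):

```python
import os
import shutil

def remove_dir_or_link(path):
    """Delete `path`: unlink it if it is a symlink, otherwise remove the tree.

    shutil.rmtree() raises an exception when given a symlink to a directory,
    so that case has to be handled with os.remove().
    """
    if os.path.islink(path):
        os.remove(path)
    else:
        shutil.rmtree(path)
```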
Takuya ASADA
5b4da4d4bd dist/common/scripts/scylla_setup: don't include a disk in the unused list when it contains partitions
In the current implementation we check whether each partition is mounted,
but a disk that contains partitions is still marked as unused.
To avoid the problem, we should skip any disk that contains partitions.

Fixes #3545
2018-06-29 01:37:19 +09:00
Takuya ASADA
83bc72b0ab dist/common/scripts/scylla_setup: skip the rest of the checks when the disk is detected as used
There is no need to run further checks once we have already detected the disk as used.
2018-06-29 01:37:19 +09:00
Takuya ASADA
1650d37dae dist/common/scripts/scylla_setup: add a disk to the selected list correctly
When a disk path is typed at the RAID setup prompt, the script mistakenly
splits the input into individual characters,
like ['/', 'd', 'e', 'v', '/', 's', 'd', 'b'].

To fix the issue we need to use selected.append() instead of
selected +=.

See #3545
2018-06-29 01:37:19 +09:00
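The bug is ordinary Python list semantics: `+=` on a list extends it element by element, and a string's elements are its characters. A minimal reproduction:

```python
path = "/dev/sdb"

# Buggy: list += string extends the list with the string's characters.
selected = []
selected += path
assert selected == ['/', 'd', 'e', 'v', '/', 's', 'd', 'b']

# Fixed: append() adds the whole path as a single element.
selected = []
selected.append(path)
assert selected == ["/dev/sdb"]
```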
Takuya ASADA
4b5826ff5a dist/common/scripts/scylla_setup: fix wrong indent
list_block_devices() should return 'devices' whether re.match()
matched or not.
2018-06-29 01:37:19 +09:00
Takuya ASADA
f828c5c4f3 dist/common/scripts: sync the instance type list used for NIC type detection with the latest one
The current instance type list is outdated; sync it with the latest table from:
http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/enhanced-networking.html#enabling_enhanced_networking

Fixes #3536
2018-06-29 01:37:19 +09:00
Takuya ASADA
6cffb164d6 dist/common/scripts: verify systemd unit existence using 'systemctl cat'
Verify unit existence by running 'systemctl cat {}' silently, raising an
exception if the unit doesn't exist.
2018-06-29 01:37:19 +09:00
Vladimir Krivopalov
82f76b0947 Use std::reference_wrapper instead of a plain reference in bound_view.
The presence of a plain reference member prevents the bound_view class
from being copy-assignable. The trick employed to work around that was to use
'placement new' for copy-assigning bound_view objects, but this approach
is ill-formed and causes undefined behaviour for classes that have const
and/or reference members.

The solution is to use a std::reference_wrapper instead.

Signed-off-by: Vladimir Krivopalov <vladimir@scylladb.com>
Message-Id: <a0c951649c7aef2f66612fc006c44f8a33713931.1530113273.git.vladimir@scylladb.com>
2018-06-28 11:24:06 +01:00
Avi Kivity
c87a961667 Merge "Add multishard_writer support" from Asias
"
We need a multishard_writer which gets mutation fragments from a producer
(e.g., from the network using the rpc streaming) and consumes the mutation
fragments with a consumer (e.g., write to sstable).

The multishard_writer will take care of mutation fragments that do not
belong to the current shard.

This multishard_writer will be used in the new scylla streaming.
"

* 'asias/multishard_writer_v10.1' of github.com:scylladb/seastar-dev:
  tests: Add multishard_writer_test to test.py
  tests: Add test for multishard_writer
  multishard_writer: Introduce multishard_writer
  tests: Allow random_mutation_generator to generate mutations belonging to a remote shard
2018-06-28 12:36:55 +03:00
Asias He
fd8b7efb99 tests: Add multishard_writer_test to test.py
For multishard_writer class testing.
2018-06-28 17:20:29 +08:00
Asias He
4050a4b24e tests: Add test for multishard_writer 2018-06-28 17:20:29 +08:00
Asias He
f4b406cce1 multishard_writer: Introduce multishard_writer
The multishard_writer class gets mutation_fragments generated from
flat_mutation_reader and consumes the mutation_fragments with
multishard_writer::_consumer. If a mutation_fragment does not belong to the
shard the multishard_writer is on, it forwards the mutation_fragment to the
correct shard. The future returned by multishard_writer() becomes ready
when all the mutation_fragments are consumed.

Tests: tests/multishard_writer_test.cc
Tests: dtest update_cluster_layout_tests.py

Fixes #3497
2018-06-28 17:20:28 +08:00
Asias He
8eccff1723 tests: Allow random_mutation_generator to generate mutations belonging to a remote shard
- make_local_keys returns keys for the current shard
- make_keys returns keys for the current or a remote shard
2018-06-28 17:20:28 +08:00
Asias He
27cb41ddeb range_streamer: Use float for the time taken to stream
This is useful when the total time to stream is small, e.g. 2.0 seconds
vs. 2.9 seconds. Showing the time as an integer number of seconds is not
accurate in such cases.

Message-Id: <d801b57279981c72acb907ad4b0190ba4d938a3d.1530175052.git.asias@scylladb.com>
2018-06-28 11:39:14 +03:00
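The difference is easy to see with plain formatting (illustrative only; the actual change is in range_streamer's C++ logging, and the 2.0/2.9 figures come from the commit message above):

```python
elapsed_a, elapsed_b = 2.0, 2.9  # two stream durations, in seconds

# Integer formatting truncates both durations to the same value:
assert "%d" % elapsed_a == "2"
assert "%d" % elapsed_b == "2"

# Float formatting keeps the distinction:
assert "%.1f" % elapsed_a == "2.0"
assert "%.1f" % elapsed_b == "2.9"
```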
Vladimir Krivopalov
fc629b9ca6 sstables: Use std::optional instead of std::experimental optional in index_reader.
Signed-off-by: Vladimir Krivopalov <vladimir@scylladb.com>
2018-06-27 16:47:53 -07:00
Tomasz Grabiec
0a1aec2bd6 tests: perf_row_cache_update: Test with an active reader surviving memtable flush
Exposes latency issues caused by mutation_cleaner lifetime problems,
fixed by earlier commits.
2018-06-27 21:51:04 +02:00
Tomasz Grabiec
074be4d4e8 memtable, cache: Run mutation_cleaner worker in its own scheduling group
The worker is responsible for merging MVCC snapshots, which is similar
to merging sstables, but in memory. The new scheduling group will
therefore be called "memory compaction".

We should run it in a separate scheduling group instead of
main/memtables, so that it doesn't disrupt writes and other system
activities. It's also nice for monitoring how much CPU time we spend
on this.
2018-06-27 21:51:04 +02:00
Tomasz Grabiec
6c6ffaee71 mutation_cleaner: Make merge() redirect old instance to the new one
If a memtable snapshot goes away after the memtable has started merging
into cache, it would enqueue the snapshots for cleaning on the memtable's
cleaner, which would have to clean without deferring when the memtable
is destroyed. That may stall the reactor. To avoid this, make merge()
cause the old instance of the cleaner to redirect to the new instance
(owned by the cache), like we do for regions. This way the snapshots
mentioned earlier can be cleaned gracefully after the memtable is
destroyed.
2018-06-27 21:51:04 +02:00
Tomasz Grabiec
450985dfee mvcc: Use RAII to ensure that partition versions are merged
Before this patch, maybe_merge_versions() had to be called manually
before a partition snapshot went away. That is error-prone and makes
client code more complicated. Delegate that task to a new
partition_snapshot_ptr object, through which all snapshots are
published now.
2018-06-27 21:51:04 +02:00
Avi Kivity
e1efda8b0c Merge "Disable sstable filtering based on min/max clustering key components" from Tomasz
"
With DateTiered and TimeWindow, there is a read optimization enabled
which excludes sstables based on overlap with recorded min/max values
of clustering key components. The problem is that it doesn't take into
account partition tombstones and static rows, which should still be
returned by the reader even if there is no overlap in the query's
clustering range. A read which returns no clustering rows can
mispopulate the cache, which will appear as a partition deletion or
writes to the static row being lost, until node restart or eviction of
the partition entry.

There is also a bad interaction between cache population on read and
that optimization. When the clustering range of the query doesn't
overlap with any sstable, the reader will return no partition markers
for the read, which leads the cache populator to assume there is no
partition in the sstables, and it will cache an empty partition. This will
cause later reads of that partition to miss prior writes to that
partition until it is evicted from cache or node is restarted.

Disable until a more elaborate fix is implemented.

Fixes #3552
Fixes #3553
"

* tag 'tgrabiec/disable-min-max-sstable-filtering-v1' of github.com:tgrabiec/scylla:
  tests: Add test for slicing a mutation source with date tiered compaction strategy
  tests: Check that database conforms to mutation source
  database: Disable sstable filtering based on min/max clustering key components
2018-06-27 14:28:27 +03:00
Calle Wilund
054514a47a sstables::compress: Ensure unqualified compressor name if possible
Fixes #3546

Both older origin and scylla write "known" compressor names (i.e. those
in the origin namespace) unqualified (e.g. LZ4Compressor).

This behaviour was not preserved in the virtualization change, but it
probably should be.

Message-Id: <20180627110930.1619-1-calle@scylladb.com>
2018-06-27 14:16:50 +03:00
Tomasz Grabiec
d1e8c32b2e gdb: Add pretty printer for managed_vector 2018-06-27 13:07:28 +02:00
Tomasz Grabiec
b0e8547569 gdb: Add pretty printer for rows 2018-06-27 13:07:28 +02:00
Tomasz Grabiec
da19508317 gdb: Add mutation_partition pretty printer 2018-06-27 13:07:28 +02:00
Tomasz Grabiec
d485e1c1d8 gdb: Add pretty printer for partition_entry 2018-06-27 13:07:28 +02:00
Tomasz Grabiec
b51c70ef69 gdb: Add pretty printer for managed_bytes 2018-06-27 13:07:28 +02:00
Tomasz Grabiec
d76cfa77b1 gdb: Add iteration wrapper for intrusive_set_external_comparator 2018-06-27 13:07:24 +02:00
Tomasz Grabiec
aa0b41f0b2 gdb: Add iteration wrapper for boost intrusive set 2018-06-27 13:04:47 +02:00
Tomasz Grabiec
c26a304fbb mvcc: Merge partition versions gradually in the background
When snapshots go away, typically when the last reader is destroyed,
we used to merge adjacent versions atomically. This could induce
reactor stalls if partitions were large. This is especially true for
versions created on cache update from memtables.

The solution is to allow this process to be preempted and move to the
background. mutation_cleaner keeps a linked list of such unmerged
snapshots and has a worker fiber which merges them incrementally and
asynchronously with regard to reads.

This reduces scheduling latency spikes in tests/perf_row_cache_update
for the case of a large partition with many rows. For -c1 -m1G I saw
them dropping from 23ms to 2ms.
2018-06-27 12:48:30 +02:00
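The scheme can be caricatured in a few lines (a loose Python model, not Scylla's implementation: here `yield` stands in for a preemption point where the worker fiber lets other tasks run between batches):

```python
def merge_preemptibly(newer, older, batch=2):
    """Merge `newer` rows into `older`, yielding after every `batch` rows
    so the merge proceeds incrementally instead of in one long stall."""
    for i, key in enumerate(list(newer), 1):
        older[key] = newer.pop(key)  # the newer version's row wins
        if i % batch == 0:
            yield  # preemption point
```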
Tomasz Grabiec
4d3cc2867a mutation_partition: Make merging preemptible 2018-06-27 12:48:30 +02:00
Tomasz Grabiec
4995a8c568 tests: mvcc: Use the standard maybe_merge_versions() to merge snapshots
Preparation for switching to background merging.
2018-06-27 12:48:30 +02:00
Piotr Sarna
03753cc431 database: make drop_column_family wait on reads in progress
drop_column_family now waits for both writes and reads in progress.
It solves possible liveness issues with the row cache, where a
column_family could be dropped prematurely, before a read request had finished.

The phaser operation is passed inside the database::query() call.
There are other places where reading logic is applied (e.g. view
replicas), but these are guarded with different synchronization
mechanisms, while _pending_reads_phaser applies to regular reads only.

Fixes #3357

Reported-by: Duarte Nunes <duarte@scylladb.com>
Signed-off-by: Piotr Sarna <sarna@scylladb.com>
Message-Id: <d58a5ee10596d0d62c765ee2114ac171b6f087d2.1529928323.git.sarna@scylladb.com>
2018-06-27 10:02:56 +01:00
Piotr Sarna
e1a867cbe3 database: add phaser for reads
Currently drop_column_family waits on write_in_progress phaser,
but there's no such mechanism for reads. This commit adds
a corresponding reads phaser.

Refs #3357

Reported-by: Duarte Nunes <duarte@scylladb.com>
Signed-off-by: Piotr Sarna <sarna@scylladb.com>
Message-Id: <70b5fdd44efbc24df61585baef024b809cabe527.1529928323.git.sarna@scylladb.com>
2018-06-27 10:02:56 +01:00
Tomasz Grabiec
b4879206fb tests: Add test for slicing a mutation source with date tiered compaction strategy
Reproducer for https://github.com/scylladb/scylla/issues/3552
2018-06-26 18:54:44 +02:00
Tomasz Grabiec
826a237c2e tests: Check that database conforms to mutation source 2018-06-26 18:54:44 +02:00
Tomasz Grabiec
19b76bf75b database: Disable sstable filtering based on min/max clustering key components
With DateTiered and TimeWindow, there is a read optimization enabled
which excludes sstables based on overlap with recorded min/max values
of clustering key components. The problem is that it doesn't take into
account partition tombstones and static rows, which should still be
returned by the reader even if there is no overlap in the query's
clustering range. A read which returns no clustering rows can
mispopulate the cache, which will appear as a partition deletion or
writes to the static row being lost, until node restart or eviction of
the partition entry.

There is also a bad interaction between cache population on read and
that optimization. When the clustering range of the query doesn't
overlap with any sstable, the reader will return no partition markers
for the read, which leads the cache populator to assume there is no
partition in the sstables, and it will cache an empty partition. This will
cause later reads of that partition to miss prior writes to that
partition until it is evicted from cache or node is restarted.

Disable until a more elaborate fix is implemented.

Fixes #3552
Fixes #3553
2018-06-26 18:54:44 +02:00
170 changed files with 3805 additions and 3371 deletions

.gitmodules vendored
View File

@@ -1,6 +1,6 @@
[submodule "seastar"]
path = seastar
url = ../scylla-seastar
url = ../seastar
ignore = dirty
[submodule "swagger-ui"]
path = swagger-ui

View File

@@ -1,6 +1,6 @@
#!/bin/sh
VERSION=2.3.6
VERSION=666.development
if test -f version
then

View File

@@ -2228,11 +2228,11 @@
"description":"The column family"
},
"total":{
"type":"long",
"type":"int",
"description":"The total snapshot size"
},
"live":{
"type":"long",
"type":"int",
"description":"The live snapshot size"
}
}

View File

@@ -47,6 +47,14 @@ atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_typ
);
}
atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_type timestamp, ser::buffer_view<bytes_ostream::fragment_iterator> value, atomic_cell::collection_member cm) {
auto& imr_data = type.imr_state();
return atomic_cell(
imr_data.type_info(),
imr_object_type::make(data::cell::make_live(imr_data.type_info(), timestamp, value, bool(cm)), &imr_data.lsa_migrator())
);
}
atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_type timestamp, bytes_view value,
gc_clock::time_point expiry, gc_clock::duration ttl, atomic_cell::collection_member cm) {
auto& imr_data = type.imr_state();
@@ -56,6 +64,15 @@ atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_typ
);
}
atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_type timestamp, ser::buffer_view<bytes_ostream::fragment_iterator> value,
gc_clock::time_point expiry, gc_clock::duration ttl, atomic_cell::collection_member cm) {
auto& imr_data = type.imr_state();
return atomic_cell(
imr_data.type_info(),
imr_object_type::make(data::cell::make_live(imr_data.type_info(), timestamp, value, expiry, ttl, bool(cm)), &imr_data.lsa_migrator())
);
}
atomic_cell atomic_cell::make_live_counter_update(api::timestamp_type timestamp, int64_t value) {
auto& imr_data = no_type_imr_descriptor();
return atomic_cell(

View File

@@ -34,6 +34,8 @@
#include "data/schema_info.hh"
#include "imr/utils.hh"
#include "serializer.hh"
class abstract_type;
class collection_type_impl;
@@ -186,6 +188,8 @@ public:
static atomic_cell make_dead(api::timestamp_type timestamp, gc_clock::time_point deletion_time);
static atomic_cell make_live(const abstract_type& type, api::timestamp_type timestamp, bytes_view value,
collection_member = collection_member::no);
static atomic_cell make_live(const abstract_type& type, api::timestamp_type timestamp, ser::buffer_view<bytes_ostream::fragment_iterator> value,
collection_member = collection_member::no);
static atomic_cell make_live(const abstract_type& type, api::timestamp_type timestamp, const bytes& value,
collection_member cm = collection_member::no) {
return make_live(type, timestamp, bytes_view(value), cm);
@@ -193,6 +197,8 @@ public:
static atomic_cell make_live_counter_update(api::timestamp_type timestamp, int64_t value);
static atomic_cell make_live(const abstract_type&, api::timestamp_type timestamp, bytes_view value,
gc_clock::time_point expiry, gc_clock::duration ttl, collection_member = collection_member::no);
static atomic_cell make_live(const abstract_type&, api::timestamp_type timestamp, ser::buffer_view<bytes_ostream::fragment_iterator> value,
gc_clock::time_point expiry, gc_clock::duration ttl, collection_member = collection_member::no);
static atomic_cell make_live(const abstract_type& type, api::timestamp_type timestamp, const bytes& value,
gc_clock::time_point expiry, gc_clock::duration ttl, collection_member cm = collection_member::no)
{

View File

@@ -28,7 +28,6 @@
#include "database.hh"
#include "schema_builder.hh"
#include "service/migration_manager.hh"
#include "timeout_config.hh"
namespace auth {
@@ -95,10 +94,4 @@ future<> wait_for_schema_agreement(::service::migration_manager& mm, const datab
});
}
const timeout_config& internal_distributed_timeout_config() noexcept {
static const auto t = 5s;
static const timeout_config tc{t, t, t, t, t, t, t};
return tc;
}
}

View File

@@ -38,7 +38,6 @@
using namespace std::chrono_literals;
class database;
class timeout_config;
namespace service {
class migration_manager;
@@ -83,9 +82,4 @@ future<> create_metadata_table_if_missing(
future<> wait_for_schema_agreement(::service::migration_manager&, const database&);
///
/// Time-outs for internal, non-local CQL queries.
///
const timeout_config& internal_distributed_timeout_config() noexcept;
}

View File

@@ -228,7 +228,7 @@ default_authorizer::modify(
return _qp.process(
query,
db::consistency_level::ONE,
internal_distributed_timeout_config(),
infinite_timeout_config,
{permissions::to_strings(set), sstring(role_name), resource.name()}).discard_result();
});
}
@@ -254,7 +254,7 @@ future<std::vector<permission_details>> default_authorizer::list_all() const {
return _qp.process(
query,
db::consistency_level::ONE,
internal_distributed_timeout_config(),
infinite_timeout_config,
{},
true).then([](::shared_ptr<cql3::untyped_result_set> results) {
std::vector<permission_details> all_details;
@@ -282,7 +282,7 @@ future<> default_authorizer::revoke_all(stdx::string_view role_name) const {
return _qp.process(
query,
db::consistency_level::ONE,
internal_distributed_timeout_config(),
infinite_timeout_config,
{sstring(role_name)}).discard_result().handle_exception([role_name](auto ep) {
try {
std::rethrow_exception(ep);

View File

@@ -149,9 +149,7 @@ static sstring gensalt() {
// blowfish 2011 fix, blowfish, sha512, sha256, md5
for (sstring pfx : { "$2y$", "$2a$", "$6$", "$5$", "$1$" }) {
salt = pfx + input;
const char* e = crypt_r("fisk", salt.c_str(), &tlcrypt);
if (e && (e[0] != '*')) {
if (crypt_r("fisk", salt.c_str(), &tlcrypt)) {
prefix = pfx;
return salt;
}
@@ -186,7 +184,7 @@ future<> password_authenticator::migrate_legacy_metadata() const {
return _qp.process(
query,
db::consistency_level::QUORUM,
internal_distributed_timeout_config()).then([this](::shared_ptr<cql3::untyped_result_set> results) {
infinite_timeout_config).then([this](::shared_ptr<cql3::untyped_result_set> results) {
return do_for_each(*results, [this](const cql3::untyped_result_set_row& row) {
auto username = row.get_as<sstring>("username");
auto salted_hash = row.get_as<sstring>(SALTED_HASH);
@@ -194,7 +192,7 @@ future<> password_authenticator::migrate_legacy_metadata() const {
return _qp.process(
update_row_query,
consistency_for_user(username),
internal_distributed_timeout_config(),
infinite_timeout_config,
{std::move(salted_hash), username}).discard_result();
}).finally([results] {});
}).then([] {
@@ -211,7 +209,7 @@ future<> password_authenticator::create_default_if_missing() const {
return _qp.process(
update_row_query,
db::consistency_level::QUORUM,
internal_distributed_timeout_config(),
infinite_timeout_config,
{hashpw(DEFAULT_USER_PASSWORD), DEFAULT_USER_NAME}).then([](auto&&) {
plogger.info("Created default superuser authentication record.");
});
@@ -311,17 +309,13 @@ future<authenticated_user> password_authenticator::authenticate(
return _qp.process(
query,
consistency_for_user(username),
internal_distributed_timeout_config(),
infinite_timeout_config,
{username},
true);
}).then_wrapped([=](future<::shared_ptr<cql3::untyped_result_set>> f) {
try {
auto res = f.get0();
auto salted_hash = std::experimental::optional<sstring>();
if (!res->empty()) {
salted_hash = res->one().get_opt<sstring>(SALTED_HASH);
}
if (!salted_hash || !checkpw(password, *salted_hash)) {
if (res->empty() || !checkpw(password, res->one().get_as<sstring>(SALTED_HASH))) {
throw exceptions::authentication_exception("Username and/or password are incorrect");
}
return make_ready_future<authenticated_user>(username);
@@ -343,7 +337,7 @@ future<> password_authenticator::create(stdx::string_view role_name, const authe
return _qp.process(
update_row_query,
consistency_for_user(role_name),
internal_distributed_timeout_config(),
infinite_timeout_config,
{hashpw(*options.password), sstring(role_name)}).discard_result();
}
@@ -361,7 +355,7 @@ future<> password_authenticator::alter(stdx::string_view role_name, const authen
return _qp.process(
query,
consistency_for_user(role_name),
internal_distributed_timeout_config(),
infinite_timeout_config,
{hashpw(*options.password), sstring(role_name)}).discard_result();
}
@@ -372,10 +366,7 @@ future<> password_authenticator::drop(stdx::string_view name) const {
meta::roles_table::qualified_name(),
meta::roles_table::role_col_name);
return _qp.process(
query, consistency_for_user(name),
internal_distributed_timeout_config(),
{sstring(name)}).discard_result();
return _qp.process(query, consistency_for_user(name), infinite_timeout_config, {sstring(name)}).discard_result();
}
future<custom_options> password_authenticator::query_custom_options(stdx::string_view role_name) const {

View File

@@ -79,7 +79,7 @@ future<bool> default_role_row_satisfies(
return qp.process(
query,
db::consistency_level::QUORUM,
internal_distributed_timeout_config(),
infinite_timeout_config,
{meta::DEFAULT_SUPERUSER_NAME},
true).then([&p](::shared_ptr<cql3::untyped_result_set> results) {
if (results->empty()) {
@@ -104,7 +104,7 @@ future<bool> any_nondefault_role_row_satisfies(
return qp.process(
query,
db::consistency_level::QUORUM,
internal_distributed_timeout_config()).then([&p](::shared_ptr<cql3::untyped_result_set> results) {
infinite_timeout_config).then([&p](::shared_ptr<cql3::untyped_result_set> results) {
if (results->empty()) {
return false;
}

View File

@@ -196,10 +196,6 @@ future<> service::start() {
}
future<> service::stop() {
// Only one of the shards has the listener registered, but let's try to
// unregister on each one just to make sure.
_migration_manager.unregister_listener(_migration_listener.get());
return _permissions_cache->stop().then([this] {
return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop());
});

View File

@@ -89,7 +89,7 @@ static future<stdx::optional<record>> find_record(cql3::query_processor& qp, std
return qp.process(
query,
consistency_for_role(role_name),
internal_distributed_timeout_config(),
infinite_timeout_config,
{sstring(role_name)},
true).then([](::shared_ptr<cql3::untyped_result_set> results) {
if (results->empty()) {
@@ -174,7 +174,7 @@ future<> standard_role_manager::create_default_role_if_missing() const {
return _qp.process(
query,
db::consistency_level::QUORUM,
internal_distributed_timeout_config(),
infinite_timeout_config,
{meta::DEFAULT_SUPERUSER_NAME}).then([](auto&&) {
log.info("Created default superuser role '{}'.", meta::DEFAULT_SUPERUSER_NAME);
return make_ready_future<>();
@@ -201,7 +201,7 @@ future<> standard_role_manager::migrate_legacy_metadata() const {
return _qp.process(
query,
db::consistency_level::QUORUM,
internal_distributed_timeout_config()).then([this](::shared_ptr<cql3::untyped_result_set> results) {
infinite_timeout_config).then([this](::shared_ptr<cql3::untyped_result_set> results) {
return do_for_each(*results, [this](const cql3::untyped_result_set_row& row) {
role_config config;
config.is_superuser = row.get_as<bool>("super");
@@ -263,7 +263,7 @@ future<> standard_role_manager::create_or_replace(stdx::string_view role_name, c
return _qp.process(
query,
consistency_for_role(role_name),
internal_distributed_timeout_config(),
infinite_timeout_config,
{sstring(role_name), c.is_superuser, c.can_login},
true).discard_result();
}
@@ -307,7 +307,7 @@ standard_role_manager::alter(stdx::string_view role_name, const role_config_upda
build_column_assignments(u),
meta::roles_table::role_col_name),
consistency_for_role(role_name),
internal_distributed_timeout_config(),
infinite_timeout_config,
{sstring(role_name)}).discard_result();
});
}
@@ -327,7 +327,7 @@ future<> standard_role_manager::drop(stdx::string_view role_name) const {
return _qp.process(
query,
consistency_for_role(role_name),
internal_distributed_timeout_config(),
infinite_timeout_config,
{sstring(role_name)}).then([this, role_name](::shared_ptr<cql3::untyped_result_set> members) {
return parallel_for_each(
members->begin(),
@@ -367,7 +367,7 @@ future<> standard_role_manager::drop(stdx::string_view role_name) const {
return _qp.process(
query,
consistency_for_role(role_name),
internal_distributed_timeout_config(),
infinite_timeout_config,
{sstring(role_name)}).discard_result();
};
@@ -394,7 +394,7 @@ standard_role_manager::modify_membership(
return _qp.process(
query,
consistency_for_role(grantee_name),
internal_distributed_timeout_config(),
infinite_timeout_config,
{role_set{sstring(role_name)}, sstring(grantee_name)}).discard_result();
};
@@ -406,7 +406,7 @@ standard_role_manager::modify_membership(
"INSERT INTO %s (role, member) VALUES (?, ?)",
meta::role_members_table::qualified_name()),
consistency_for_role(role_name),
internal_distributed_timeout_config(),
infinite_timeout_config,
{sstring(role_name), sstring(grantee_name)}).discard_result();
case membership_change::remove:
@@ -415,7 +415,7 @@ standard_role_manager::modify_membership(
"DELETE FROM %s WHERE role = ? AND member = ?",
meta::role_members_table::qualified_name()),
consistency_for_role(role_name),
internal_distributed_timeout_config(),
infinite_timeout_config,
{sstring(role_name), sstring(grantee_name)}).discard_result();
}
@@ -516,10 +516,7 @@ future<role_set> standard_role_manager::query_all() const {
// To avoid many copies of a view.
static const auto role_col_name_string = sstring(meta::roles_table::role_col_name);
return _qp.process(
query,
db::consistency_level::QUORUM,
internal_distributed_timeout_config()).then([](::shared_ptr<cql3::untyped_result_set> results) {
return _qp.process(query, db::consistency_level::QUORUM, infinite_timeout_config).then([](::shared_ptr<cql3::untyped_result_set> results) {
role_set roles;
std::transform(


@@ -60,12 +60,11 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
// - _next_row_in_range = _next.position() < _upper_bound
// - _last_row points at a direct predecessor of the next row which is going to be read.
// Used for populating continuity.
// - _population_range_starts_before_all_rows is set accordingly
reading_from_underlying,
end_of_stream
};
lw_shared_ptr<partition_snapshot> _snp;
partition_snapshot_ptr _snp;
position_in_partition::tri_compare _position_cmp;
query::clustering_key_filter_ranges _ck_ranges;
@@ -87,13 +86,6 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
partition_snapshot_row_cursor _next_row;
bool _next_row_in_range = false;
// True iff current population interval, since the previous clustering row, starts before all clustered rows.
// We cannot just look at _lower_bound, because emission of range tombstones changes _lower_bound and
// because we mark clustering intervals as continuous when consuming a clustering_row, it would prevent
// us from marking the interval as continuous.
// Valid when _state == reading_from_underlying.
bool _population_range_starts_before_all_rows;
// Whether _lower_bound was changed within the current fill_buffer().
// If it was not, then we cannot break out of it (e.g. on preemption) because
// forward progress is not guaranteed in case iterators are getting constantly invalidated.
@@ -137,7 +129,7 @@ public:
dht::decorated_key dk,
query::clustering_key_filter_ranges&& crr,
lw_shared_ptr<read_context> ctx,
lw_shared_ptr<partition_snapshot> snp,
partition_snapshot_ptr snp,
row_cache& cache)
: flat_mutation_reader::impl(std::move(s))
, _snp(std::move(snp))
@@ -157,9 +149,6 @@ public:
cache_flat_mutation_reader(const cache_flat_mutation_reader&) = delete;
cache_flat_mutation_reader(cache_flat_mutation_reader&&) = delete;
virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override;
virtual ~cache_flat_mutation_reader() {
maybe_merge_versions(_snp, _lsa_manager.region(), _lsa_manager.read_section());
}
virtual void next_partition() override {
clear_buffer_to_next_partition();
if (is_buffer_empty()) {
@@ -239,7 +228,6 @@ inline
future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_point timeout) {
if (_state == state::move_to_underlying) {
_state = state::reading_from_underlying;
_population_range_starts_before_all_rows = _lower_bound.is_before_all_clustered_rows(*_schema);
auto end = _next_row_in_range ? position_in_partition(_next_row.position())
: position_in_partition(_upper_bound);
return _read_context->fast_forward_to(position_range{_lower_bound, std::move(end)}, timeout).then([this, timeout] {
@@ -364,12 +352,12 @@ future<> cache_flat_mutation_reader::read_from_underlying(db::timeout_clock::tim
}
});
return make_ready_future<>();
}, timeout);
});
}
inline
bool cache_flat_mutation_reader::ensure_population_lower_bound() {
if (_population_range_starts_before_all_rows) {
if (!_ck_ranges_curr->start()) {
return true;
}
if (!_last_row.refresh(*_snp)) {
@@ -424,7 +412,6 @@ inline
void cache_flat_mutation_reader::maybe_add_to_cache(const clustering_row& cr) {
if (!can_populate()) {
_last_row = nullptr;
_population_range_starts_before_all_rows = false;
_read_context->cache().on_mispopulate();
return;
}
@@ -458,7 +445,6 @@ void cache_flat_mutation_reader::maybe_add_to_cache(const clustering_row& cr) {
with_allocator(standard_allocator(), [&] {
_last_row = partition_snapshot_row_weakref(*_snp, it, true);
});
_population_range_starts_before_all_rows = false;
});
}
@@ -678,7 +664,7 @@ inline flat_mutation_reader make_cache_flat_mutation_reader(schema_ptr s,
query::clustering_key_filter_ranges crr,
row_cache& cache,
lw_shared_ptr<cache::read_context> ctx,
lw_shared_ptr<partition_snapshot> snp)
partition_snapshot_ptr snp)
{
return make_flat_mutation_reader<cache::cache_flat_mutation_reader>(
std::move(s), std::move(dk), std::move(crr), std::move(ctx), std::move(snp), cache);


@@ -22,6 +22,7 @@
#pragma once
#include <functional>
#include "keys.hh"
#include "schema.hh"
#include "range.hh"
@@ -43,22 +44,20 @@ bound_kind invert_kind(bound_kind k);
int32_t weight(bound_kind k);
class bound_view {
const static thread_local clustering_key _empty_prefix;
std::reference_wrapper<const clustering_key_prefix> _prefix;
bound_kind _kind;
public:
const static thread_local clustering_key empty_prefix;
const clustering_key_prefix& prefix;
bound_kind kind;
bound_view(const clustering_key_prefix& prefix, bound_kind kind)
: prefix(prefix)
, kind(kind)
: _prefix(prefix)
, _kind(kind)
{ }
bound_view(const bound_view& other) noexcept = default;
bound_view& operator=(const bound_view& other) noexcept {
if (this != &other) {
this->~bound_view();
new (this) bound_view(other);
}
return *this;
}
bound_view& operator=(const bound_view& other) noexcept = default;
bound_kind kind() const { return _kind; }
const clustering_key_prefix& prefix() const { return _prefix; }
struct tri_compare {
// To make it assignable and to avoid taking a schema_ptr, we
// wrap the schema reference.
@@ -82,13 +81,13 @@ public:
return d1 < d2 ? w1 - (w1 <= 0) : -(w2 - (w2 <= 0));
}
int operator()(const bound_view b, const clustering_key_prefix& p) const {
return operator()(b.prefix, weight(b.kind), p, 0);
return operator()(b._prefix, weight(b._kind), p, 0);
}
int operator()(const clustering_key_prefix& p, const bound_view b) const {
return operator()(p, 0, b.prefix, weight(b.kind));
return operator()(p, 0, b._prefix, weight(b._kind));
}
int operator()(const bound_view b1, const bound_view b2) const {
return operator()(b1.prefix, weight(b1.kind), b2.prefix, weight(b2.kind));
return operator()(b1._prefix, weight(b1._kind), b2._prefix, weight(b2._kind));
}
};
struct compare {
@@ -101,26 +100,26 @@ public:
return _cmp(p1, w1, p2, w2) < 0;
}
bool operator()(const bound_view b, const clustering_key_prefix& p) const {
return operator()(b.prefix, weight(b.kind), p, 0);
return operator()(b._prefix, weight(b._kind), p, 0);
}
bool operator()(const clustering_key_prefix& p, const bound_view b) const {
return operator()(p, 0, b.prefix, weight(b.kind));
return operator()(p, 0, b._prefix, weight(b._kind));
}
bool operator()(const bound_view b1, const bound_view b2) const {
return operator()(b1.prefix, weight(b1.kind), b2.prefix, weight(b2.kind));
return operator()(b1._prefix, weight(b1._kind), b2._prefix, weight(b2._kind));
}
};
bool equal(const schema& s, const bound_view other) const {
return kind == other.kind && prefix.equal(s, other.prefix);
return _kind == other._kind && _prefix.get().equal(s, other._prefix.get());
}
bool adjacent(const schema& s, const bound_view other) const {
return invert_kind(other.kind) == kind && prefix.equal(s, other.prefix);
return invert_kind(other._kind) == _kind && _prefix.get().equal(s, other._prefix.get());
}
static bound_view bottom() {
return {empty_prefix, bound_kind::incl_start};
return {_empty_prefix, bound_kind::incl_start};
}
static bound_view top() {
return {empty_prefix, bound_kind::incl_end};
return {_empty_prefix, bound_kind::incl_end};
}
template<template<typename> typename R>
GCC6_CONCEPT( requires Range<R, clustering_key_prefix_view> )
@@ -144,13 +143,13 @@ public:
template<template<typename> typename R>
GCC6_CONCEPT( requires Range<R, clustering_key_prefix_view> )
static stdx::optional<typename R<clustering_key_prefix_view>::bound> to_range_bound(const bound_view& bv) {
if (&bv.prefix == &empty_prefix) {
if (&bv._prefix.get() == &_empty_prefix) {
return {};
}
bool inclusive = bv.kind != bound_kind::excl_end && bv.kind != bound_kind::excl_start;
return {typename R<clustering_key_prefix_view>::bound(bv.prefix.view(), inclusive)};
bool inclusive = bv._kind != bound_kind::excl_end && bv._kind != bound_kind::excl_start;
return {typename R<clustering_key_prefix_view>::bound(bv._prefix.get().view(), inclusive)};
}
friend std::ostream& operator<<(std::ostream& out, const bound_view& b) {
return out << "{bound: prefix=" << b.prefix << ", kind=" << b.kind << "}";
return out << "{bound: prefix=" << b._prefix.get() << ", kind=" << b._kind << "}";
}
};


@@ -1,67 +0,0 @@
/*
* Copyright (C) 2016 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "query-request.hh"
#include <experimental/optional>
// Wraps ring_position so it is compatible with old-style C++: default constructor,
// stateless comparators, yada yada
class compatible_ring_position {
const schema* _schema = nullptr;
// optional to supply a default constructor, no more
std::experimental::optional<dht::ring_position> _rp;
public:
compatible_ring_position() noexcept = default;
compatible_ring_position(const schema& s, const dht::ring_position& rp)
: _schema(&s), _rp(rp) {
}
compatible_ring_position(const schema& s, dht::ring_position&& rp)
: _schema(&s), _rp(std::move(rp)) {
}
const dht::token& token() const {
return _rp->token();
}
friend int tri_compare(const compatible_ring_position& x, const compatible_ring_position& y) {
return x._rp->tri_compare(*x._schema, *y._rp);
}
friend bool operator<(const compatible_ring_position& x, const compatible_ring_position& y) {
return tri_compare(x, y) < 0;
}
friend bool operator<=(const compatible_ring_position& x, const compatible_ring_position& y) {
return tri_compare(x, y) <= 0;
}
friend bool operator>(const compatible_ring_position& x, const compatible_ring_position& y) {
return tri_compare(x, y) > 0;
}
friend bool operator>=(const compatible_ring_position& x, const compatible_ring_position& y) {
return tri_compare(x, y) >= 0;
}
friend bool operator==(const compatible_ring_position& x, const compatible_ring_position& y) {
return tri_compare(x, y) == 0;
}
friend bool operator!=(const compatible_ring_position& x, const compatible_ring_position& y) {
return tri_compare(x, y) != 0;
}
};


@@ -0,0 +1,64 @@
/*
* Copyright (C) 2016 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "query-request.hh"
#include <optional>
// Wraps ring_position_view so it is compatible with old-style C++: default
// constructor, stateless comparators, yada yada.
class compatible_ring_position_view {
const schema* _schema = nullptr;
// Optional to supply a default constructor, no more.
std::optional<dht::ring_position_view> _rpv;
public:
constexpr compatible_ring_position_view() = default;
compatible_ring_position_view(const schema& s, dht::ring_position_view rpv)
: _schema(&s), _rpv(rpv) {
}
const dht::ring_position_view& position() const {
return *_rpv;
}
friend int tri_compare(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
return dht::ring_position_tri_compare(*x._schema, *x._rpv, *y._rpv);
}
friend bool operator<(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
return tri_compare(x, y) < 0;
}
friend bool operator<=(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
return tri_compare(x, y) <= 0;
}
friend bool operator>(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
return tri_compare(x, y) > 0;
}
friend bool operator>=(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
return tri_compare(x, y) >= 0;
}
friend bool operator==(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
return tri_compare(x, y) == 0;
}
friend bool operator!=(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
return tri_compare(x, y) != 0;
}
};


@@ -303,7 +303,7 @@ scylla_tests = [
'tests/imr_test',
'tests/partition_data_test',
'tests/reusable_buffer_test',
'tests/json_test'
'tests/multishard_writer_test',
]
perf_tests = [
@@ -407,7 +407,6 @@ scylla_core = (['database.cc',
'mutation_reader.cc',
'flat_mutation_reader.cc',
'mutation_query.cc',
'json.cc',
'keys.cc',
'counters.cc',
'compress.cc',
@@ -516,7 +515,6 @@ scylla_core = (['database.cc',
'db/consistency_level.cc',
'db/system_keyspace.cc',
'db/system_distributed_keyspace.cc',
'db/size_estimates_virtual_reader.cc',
'db/schema_tables.cc',
'db/cql_type_parser.cc',
'db/legacy_schema_migrator.cc',
@@ -632,6 +630,7 @@ scylla_core = (['database.cc',
'utils/arch/powerpc/crc32-vpmsum/crc32_wrapper.cc',
'querier.cc',
'data/cell.cc',
'multishard_writer.cc',
]
+ [Antlr3Grammar('cql3/Cql.g')]
+ [Thrift('interface/cassandra.thrift', 'Cassandra')]
@@ -743,7 +742,6 @@ pure_boost_tests = set([
'tests/imr_test',
'tests/partition_data_test',
'tests/reusable_buffer_test',
'tests/json_test',
])
tests_not_using_seastar_test_framework = set([
@@ -795,7 +793,7 @@ deps['tests/log_heap_test'] = ['tests/log_heap_test.cc']
deps['tests/anchorless_list_test'] = ['tests/anchorless_list_test.cc']
deps['tests/perf/perf_fast_forward'] += ['release.cc']
deps['tests/meta_test'] = ['tests/meta_test.cc']
deps['tests/imr_test'] = ['tests/imr_test.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
deps['tests/imr_test'] = ['tests/imr_test.cc']
deps['tests/reusable_buffer_test'] = ['tests/reusable_buffer_test.cc']
warnings = [


@@ -92,6 +92,10 @@ public:
_p.apply(t);
}
void accept_static_cell(column_id id, atomic_cell cell) {
return accept_static_cell(id, atomic_cell_view(cell));
}
virtual void accept_static_cell(column_id id, atomic_cell_view cell) override {
const column_mapping_entry& col = _visited_column_mapping.static_column_at(id);
const column_definition* def = _p_schema.get_column_definition(col.name());
@@ -119,6 +123,10 @@ public:
_current_row = &r;
}
void accept_row_cell(column_id id, atomic_cell cell) {
return accept_row_cell(id, atomic_cell_view(cell));
}
virtual void accept_row_cell(column_id id, atomic_cell_view cell) override {
const column_mapping_entry& col = _visited_column_mapping.regular_column_at(id);
const column_definition* def = _p_schema.get_column_definition(col.name());


@@ -473,9 +473,9 @@ insertStatement returns [::shared_ptr<raw::modification_statement> expr]
::shared_ptr<cql3::term::raw> json_value;
}
: K_INSERT K_INTO cf=columnFamilyName
('(' c1=cident { column_names.push_back(c1); } ( ',' cn=cident { column_names.push_back(cn); } )* ')'
K_VALUES
'(' v1=term { values.push_back(v1); } ( ',' vn=term { values.push_back(vn); } )* ')'
'(' c1=cident { column_names.push_back(c1); } ( ',' cn=cident { column_names.push_back(cn); } )* ')'
( K_VALUES
'(' v1=term { values.push_back(v1); } ( ',' vn=term { values.push_back(vn); } )* ')'
( K_IF K_NOT K_EXISTS { if_not_exists = true; } )?
( usingClause[attrs] )?
{


@@ -67,12 +67,6 @@ class error_collector : public error_listener<RecognizerType, ExceptionBaseType>
*/
const sstring_view _query;
/**
* An empty bitset to be used as a workaround for AntLR null dereference
* bug.
*/
static typename ExceptionBaseType::BitsetListType _empty_bit_list;
public:
/**
@@ -150,14 +144,6 @@ private:
break;
}
default:
// AntLR Exception class has a bug of dereferencing a null
// pointer in the displayRecognitionError. The following
// if statement makes sure it will not be null before the
// call to that function (displayRecognitionError).
// bug reference: https://github.com/antlr/antlr3/issues/191
if (!ex->get_expectingSet()) {
ex->set_expectingSet(&_empty_bit_list);
}
ex->displayRecognitionError(token_names, msg);
}
return msg.str();
@@ -359,8 +345,4 @@ private:
#endif
};
template<typename RecognizerType, typename TokenType, typename ExceptionBaseType>
typename ExceptionBaseType::BitsetListType
error_collector<RecognizerType,TokenType,ExceptionBaseType>::_empty_bit_list = typename ExceptionBaseType::BitsetListType();
}


@@ -177,7 +177,7 @@ shared_ptr<function>
make_to_json_function(data_type t) {
return make_native_scalar_function<true>("tojson", utf8_type, {t},
[t](cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
return utf8_type->decompose(t->to_json_string(parameters[0]));
return utf8_type->decompose(t->to_json_string(parameters[0].value()));
});
}


@@ -217,18 +217,19 @@ void query_options::prepare(const std::vector<::shared_ptr<column_specification>
}
auto& names = *_names;
std::vector<cql3::raw_value_view> ordered_values;
std::vector<cql3::raw_value> ordered_values;
ordered_values.reserve(specs.size());
for (auto&& spec : specs) {
auto& spec_name = spec->name->text();
for (size_t j = 0; j < names.size(); j++) {
if (names[j] == spec_name) {
ordered_values.emplace_back(_value_views[j]);
ordered_values.emplace_back(_values[j]);
break;
}
}
}
_value_views = std::move(ordered_values);
_values = std::move(ordered_values);
fill_value_views();
}
void query_options::fill_value_views()


@@ -206,6 +206,30 @@ query_processor::query_processor(service::storage_proxy& proxy, distributed<data
_cql_stats.secondary_index_rows_read,
sm::description("Counts the total number of rows read during CQL requests performed using secondary indexes.")),
// read requests that required ALLOW FILTERING
sm::make_derive(
"filtered_read_requests",
_cql_stats.filtered_reads,
sm::description("Counts the total number of CQL read requests that required ALLOW FILTERING. See filtered_rows_read_total to compare how many rows needed to be filtered.")),
// rows read with filtering enabled (because ALLOW FILTERING was required)
sm::make_derive(
"filtered_rows_read_total",
_cql_stats.filtered_rows_read_total,
sm::description("Counts the total number of rows read during CQL requests that required ALLOW FILTERING. See filtered_rows_matched_total and filtered_rows_dropped_total for information on how accurate filtering queries are.")),
// rows read with filtering enabled and accepted by the filter
sm::make_derive(
"filtered_rows_matched_total",
_cql_stats.filtered_rows_matched_total,
sm::description("Counts the number of rows read during CQL requests that required ALLOW FILTERING and were accepted by the filter. A value close to filtered_rows_read_total indicates that filtering is accurate.")),
// rows read with filtering enabled and rejected by the filter
sm::make_derive(
"filtered_rows_dropped_total",
[this]() {return _cql_stats.filtered_rows_read_total - _cql_stats.filtered_rows_matched_total;},
sm::description("Counts the number of rows read during CQL requests that required ALLOW FILTERING and were dropped by the filter. A value close to filtered_rows_read_total indicates that filtering is not accurate and might cause performance degradation.")),
sm::make_derive(
"authorized_prepared_statements_cache_evictions",
[] { return authorized_prepared_statements_cache::shard_stats().authorized_prepared_statements_cache_evictions; },
@@ -239,11 +263,11 @@ query_processor::process(const sstring_view& query_string, service::query_state&
log.trace("process: \"{}\"", query_string);
tracing::trace(query_state.get_trace_state(), "Parsing a statement");
auto p = get_statement(query_string, query_state.get_client_state());
options.prepare(p->bound_names);
auto cql_statement = p->statement;
if (cql_statement->get_bound_terms() != options.get_values_count()) {
throw exceptions::invalid_request_exception("Invalid amount of bind variables");
}
options.prepare(p->bound_names);
warn(unimplemented::cause::METRICS);
#if 0


@@ -95,7 +95,32 @@ public:
uint32_t size() const override {
return uint32_t(get_column_defs().size());
}
bool has_unrestricted_components(const schema& schema) const;
virtual bool needs_filtering(const schema& schema) const;
};
template<>
inline bool primary_key_restrictions<partition_key>::has_unrestricted_components(const schema& schema) const {
return size() < schema.partition_key_size();
}
template<>
inline bool primary_key_restrictions<clustering_key>::has_unrestricted_components(const schema& schema) const {
return size() < schema.clustering_key_size();
}
template<>
inline bool primary_key_restrictions<partition_key>::needs_filtering(const schema& schema) const {
return !empty() && !is_on_token() && (has_unrestricted_components(schema) || is_contains() || is_slice());
}
template<>
inline bool primary_key_restrictions<clustering_key>::needs_filtering(const schema& schema) const {
// Currently only the overloaded single_column_primary_key_restrictions will require ALLOW FILTERING
return false;
}
}
}


@@ -314,6 +314,10 @@ public:
fail(unimplemented::cause::LEGACY_COMPOSITE_KEYS); // not 100% correct...
}
const single_column_restrictions::restrictions_map& restrictions() const {
return _restrictions->restrictions();
}
virtual bool has_supporting_index(const secondary_index::secondary_index_manager& index_manager) const override {
return _restrictions->has_supporting_index(index_manager);
}
@@ -349,6 +353,8 @@ public:
_restrictions->restrictions() | boost::adaptors::map_values,
[&] (auto&& r) { return r->is_satisfied_by(schema, key, ckey, cells, options, now); });
}
virtual bool needs_filtering(const schema& schema) const override;
};
template<>
@@ -406,6 +412,29 @@ single_column_primary_key_restrictions<clustering_key_prefix>::bounds_ranges(con
return bounds;
}
template<>
bool single_column_primary_key_restrictions<partition_key>::needs_filtering(const schema& schema) const {
return primary_key_restrictions<partition_key>::needs_filtering(schema);
}
template<>
bool single_column_primary_key_restrictions<clustering_key>::needs_filtering(const schema& schema) const {
// Restrictions currently need filtering in three cases:
// 1. any of them is a CONTAINS restriction
// 2. restrictions do not form a contiguous prefix (i.e. there are gaps in it)
// 3. a SLICE restriction is not in the last position
column_id position = 0;
for (const auto& restriction : _restrictions->restrictions() | boost::adaptors::map_values) {
if (restriction->is_contains() || position != restriction->get_column_def().id) {
return true;
}
if (!restriction->is_slice()) {
position = restriction->get_column_def().id + 1;
}
}
return false;
}
}
}


@@ -93,6 +93,8 @@ public:
}
virtual bool is_supported_by(const secondary_index::index& index) const = 0;
using abstract_restriction::is_satisfied_by;
virtual bool is_satisfied_by(bytes_view data, const query_options& options) const = 0;
#if 0
/**
* Check if this type of restriction is supported by the specified index.
@@ -166,6 +168,7 @@ public:
const row& cells,
const query_options& options,
gc_clock::time_point now) const override;
virtual bool is_satisfied_by(bytes_view data, const query_options& options) const override;
#if 0
@Override
@@ -201,15 +204,8 @@ public:
const row& cells,
const query_options& options,
gc_clock::time_point now) const override;
virtual bool is_satisfied_by(bytes_view data, const query_options& options) const override;
virtual std::vector<bytes_opt> values_raw(const query_options& options) const = 0;
virtual std::vector<bytes_opt> values(const query_options& options) const override {
std::vector<bytes_opt> ret = values_raw(options);
std::sort(ret.begin(),ret.end());
ret.erase(std::unique(ret.begin(),ret.end()),ret.end());
return ret;
}
#if 0
@Override
protected final boolean isSupportedBy(SecondaryIndex index)
@@ -232,7 +228,7 @@ public:
return abstract_restriction::term_uses_function(_values, ks_name, function_name);
}
virtual std::vector<bytes_opt> values_raw(const query_options& options) const override {
virtual std::vector<bytes_opt> values(const query_options& options) const override {
std::vector<bytes_opt> ret;
for (auto&& v : _values) {
ret.emplace_back(to_bytes_opt(v->bind_and_get(options)));
@@ -257,7 +253,7 @@ public:
return false;
}
virtual std::vector<bytes_opt> values_raw(const query_options& options) const override {
virtual std::vector<bytes_opt> values(const query_options& options) const override {
auto&& lval = dynamic_pointer_cast<multi_item_terminal>(_marker->bind(options));
if (!lval) {
throw exceptions::invalid_request_exception("Invalid null value for IN restriction");
@@ -364,6 +360,7 @@ public:
const row& cells,
const query_options& options,
gc_clock::time_point now) const override;
virtual bool is_satisfied_by(bytes_view data, const query_options& options) const override;
};
// This holds CONTAINS, CONTAINS_KEY, and map[key] = value restrictions because we might want to have any combination of them.
@@ -485,6 +482,7 @@ public:
const row& cells,
const query_options& options,
gc_clock::time_point now) const override;
virtual bool is_satisfied_by(bytes_view data, const query_options& options) const override;
#if 0
private List<ByteBuffer> keys(const query_options& options) {


@@ -23,6 +23,7 @@
#include <boost/range/algorithm/transform.hpp>
#include <boost/range/algorithm.hpp>
#include <boost/range/adaptors.hpp>
#include <boost/algorithm/cxx11/any_of.hpp>
#include "statement_restrictions.hh"
#include "single_column_primary_key_restrictions.hh"
@@ -36,6 +37,8 @@
namespace cql3 {
namespace restrictions {
static logging::logger rlogger("restrictions");
using boost::adaptors::filtered;
using boost::adaptors::transformed;
@@ -202,7 +205,7 @@ statement_restrictions::statement_restrictions(database& db,
throw exceptions::invalid_request_exception(sprint("restriction '%s' is only supported in materialized view creation", relation->to_string()));
}
} else {
add_restriction(relation->to_restriction(db, schema, bound_names));
add_restriction(relation->to_restriction(db, schema, bound_names), for_view, allow_filtering);
}
}
}
@@ -214,11 +217,11 @@ statement_restrictions::statement_restrictions(database& db,
|| _nonprimary_key_restrictions->has_supporting_index(sim);
// At this point, the select statement is fully constructed, but we still have a few things to validate
process_partition_key_restrictions(has_queriable_index, for_view);
process_partition_key_restrictions(has_queriable_index, for_view, allow_filtering);
// Some but not all of the partition key columns have been specified;
// hence we need to turn these restrictions into index expressions.
if (_uses_secondary_indexing) {
if (_uses_secondary_indexing || _partition_key_restrictions->needs_filtering(*_schema)) {
_index_restrictions.push_back(_partition_key_restrictions);
}
@@ -234,13 +237,14 @@ statement_restrictions::statement_restrictions(database& db,
}
}
process_clustering_columns_restrictions(has_queriable_index, select_a_collection, for_view);
process_clustering_columns_restrictions(has_queriable_index, select_a_collection, for_view, allow_filtering);
// Covers indexes on the first clustering column (among others).
if (_is_key_range && has_queriable_clustering_column_index)
_uses_secondary_indexing = true;
if (_is_key_range && has_queriable_clustering_column_index) {
_uses_secondary_indexing = true;
}
if (_uses_secondary_indexing) {
if (_uses_secondary_indexing || _clustering_columns_restrictions->needs_filtering(*_schema)) {
_index_restrictions.push_back(_clustering_columns_restrictions);
} else if (_clustering_columns_restrictions->is_contains()) {
fail(unimplemented::cause::INDEXES);
@@ -269,31 +273,48 @@ statement_restrictions::statement_restrictions(database& db,
uses_secondary_indexing = true;
#endif
}
// Even if uses_secondary_indexing is false at this point, we'll still have to use one if
// there are restrictions not covered by the PK.
if (!_nonprimary_key_restrictions->empty()) {
_uses_secondary_indexing = true;
if (has_queriable_index) {
_uses_secondary_indexing = true;
} else if (!allow_filtering) {
throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
"thus may have unpredictable performance. If you want to execute "
"this query despite the performance unpredictability, use ALLOW FILTERING");
}
_index_restrictions.push_back(_nonprimary_key_restrictions);
}
if (_uses_secondary_indexing && !for_view) {
if (_uses_secondary_indexing && !(for_view || allow_filtering)) {
validate_secondary_index_selections(selects_only_static_columns);
}
}
void statement_restrictions::add_restriction(::shared_ptr<restriction> restriction) {
void statement_restrictions::add_restriction(::shared_ptr<restriction> restriction, bool for_view, bool allow_filtering) {
if (restriction->is_multi_column()) {
_clustering_columns_restrictions = _clustering_columns_restrictions->merge_to(_schema, restriction);
} else if (restriction->is_on_token()) {
_partition_key_restrictions = _partition_key_restrictions->merge_to(_schema, restriction);
} else {
add_single_column_restriction(::static_pointer_cast<single_column_restriction>(restriction));
add_single_column_restriction(::static_pointer_cast<single_column_restriction>(restriction), for_view, allow_filtering);
}
}
void statement_restrictions::add_single_column_restriction(::shared_ptr<single_column_restriction> restriction) {
void statement_restrictions::add_single_column_restriction(::shared_ptr<single_column_restriction> restriction, bool for_view, bool allow_filtering) {
auto& def = restriction->get_column_def();
if (def.is_partition_key()) {
// A SELECT query may not request a slice (range) of partition keys
// without using token(). This is because there is no way to do this
// query efficiently: murmur3 turns a contiguous range of partition
// keys into tokens all over the token space.
// However, in a SELECT statement used to define a materialized view,
// such a slice is fine - it is used to check whether individual
// partitions match, and does not present a performance problem.
assert(!restriction->is_on_token());
if (restriction->is_slice() && !for_view && !allow_filtering) {
throw exceptions::invalid_request_exception(
"Only EQ and IN relation are supported on the partition key (unless you use the token() function or allow filtering)");
}
_partition_key_restrictions = _partition_key_restrictions->merge_to(_schema, restriction);
} else if (def.is_clustering_key()) {
_clustering_columns_restrictions = _clustering_columns_restrictions->merge_to(_schema, restriction);
@@ -312,7 +333,7 @@ const std::vector<::shared_ptr<restrictions>>& statement_restrictions::index_res
return _index_restrictions;
}
void statement_restrictions::process_partition_key_restrictions(bool has_queriable_index, bool for_view) {
void statement_restrictions::process_partition_key_restrictions(bool has_queriable_index, bool for_view, bool allow_filtering) {
// If there is a queriable index, no special conditions are required on the other restrictions.
// But we still need to know 2 things:
// - If we don't have a queriable index, is the query ok
@@ -321,39 +342,32 @@ void statement_restrictions::process_partition_key_restrictions(bool has_queriab
// components must have an EQ. Only the last partition key component can be in IN relation.
if (_partition_key_restrictions->is_on_token()) {
_is_key_range = true;
} else if (has_partition_key_unrestricted_components()) {
if (!_partition_key_restrictions->empty() && !for_view) {
if (!has_queriable_index) {
throw exceptions::invalid_request_exception(sprint("Partition key parts: %s must be restricted as other parts are",
join(", ", get_partition_key_unrestricted_components())));
}
}
} else if (_partition_key_restrictions->has_unrestricted_components(*_schema)) {
_is_key_range = true;
_uses_secondary_indexing = has_queriable_index;
}
if (_partition_key_restrictions->is_slice() && !_partition_key_restrictions->is_on_token() && !for_view) {
// A SELECT query may not request a slice (range) of partition keys
// without using token(). This is because there is no way to do this
// query efficiently: murmur3 turns a contiguous range of partition
// keys into tokens all over the token space.
// However, in a SELECT statement used to define a materialized view,
// such a slice is fine - it is used to check whether individual
// partitions match, and does not present a performance problem.
throw exceptions::invalid_request_exception(
"Only EQ and IN relation are supported on the partition key (unless you use the token() function)");
if (_partition_key_restrictions->needs_filtering(*_schema)) {
if (!allow_filtering && !for_view && !has_queriable_index) {
throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
"thus may have unpredictable performance. If you want to execute "
"this query despite the performance unpredictability, use ALLOW FILTERING");
}
_is_key_range = true;
_uses_secondary_indexing = has_queriable_index;
}
}
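The branch above can be condensed into a small decision sketch. All names below are illustrative stand-ins, not the real Scylla classes: a partition-key restriction that needs filtering is only accepted when the statement uses ALLOW FILTERING, defines a materialized view, or can be served by an index; otherwise the request is rejected, and when accepted the query degrades to a key-range scan plus row filtering.

```cpp
#include <stdexcept>

// Condensed model of the decision in process_partition_key_restrictions().
// The real code sets _is_key_range and _uses_secondary_indexing instead of
// returning a value; this helper exists only to illustrate the rule.
bool accept_filtered_partition_scan(bool needs_filtering, bool allow_filtering,
                                    bool for_view, bool has_queriable_index) {
    if (!needs_filtering) {
        return false; // restrictions translate directly into key ranges
    }
    if (!allow_filtering && !for_view && !has_queriable_index) {
        throw std::invalid_argument(
            "Cannot execute this query as it might involve data filtering; "
            "use ALLOW FILTERING");
    }
    return true; // executed as a key-range scan plus row filtering
}
```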
bool statement_restrictions::has_partition_key_unrestricted_components() const {
return _partition_key_restrictions->size() < _schema->partition_key_size();
return _partition_key_restrictions->has_unrestricted_components(*_schema);
}
bool statement_restrictions::has_unrestricted_clustering_columns() const {
return _clustering_columns_restrictions->size() < _schema->clustering_key_size();
return _clustering_columns_restrictions->has_unrestricted_components(*_schema);
}
void statement_restrictions::process_clustering_columns_restrictions(bool has_queriable_index, bool select_a_collection, bool for_view) {
void statement_restrictions::process_clustering_columns_restrictions(bool has_queriable_index, bool select_a_collection, bool for_view, bool allow_filtering) {
if (!has_clustering_columns_restriction()) {
return;
}
@@ -362,38 +376,36 @@ void statement_restrictions::process_clustering_columns_restrictions(bool has_qu
throw exceptions::invalid_request_exception(
"Cannot restrict clustering columns by IN relations when a collection is selected by the query");
}
if (_clustering_columns_restrictions->is_contains() && !has_queriable_index) {
if (_clustering_columns_restrictions->is_contains() && !has_queriable_index && !allow_filtering) {
throw exceptions::invalid_request_exception(
"Cannot restrict clustering columns by a CONTAINS relation without a secondary index");
"Cannot restrict clustering columns by a CONTAINS relation without a secondary index or filtering");
}
auto clustering_columns_iter = _schema->clustering_key_columns().begin();
for (auto&& restricted_column : _clustering_columns_restrictions->get_column_defs()) {
const column_definition* clustering_column = &(*clustering_columns_iter);
++clustering_columns_iter;
if (clustering_column != restricted_column && !for_view) {
if (!has_queriable_index) {
throw exceptions::invalid_request_exception(sprint(
"PRIMARY KEY column \"%s\" cannot be restricted as preceding column \"%s\" is not restricted",
restricted_column->name_as_text(), clustering_column->name_as_text()));
if (has_clustering_columns_restriction() && _clustering_columns_restrictions->needs_filtering(*_schema)) {
if (has_queriable_index) {
_uses_secondary_indexing = true;
} else if (!allow_filtering && !for_view) {
auto clustering_columns_iter = _schema->clustering_key_columns().begin();
for (auto&& restricted_column : _clustering_columns_restrictions->get_column_defs()) {
const column_definition* clustering_column = &(*clustering_columns_iter);
++clustering_columns_iter;
if (clustering_column != restricted_column) {
throw exceptions::invalid_request_exception(sprint(
"PRIMARY KEY column \"%s\" cannot be restricted as preceding column \"%s\" is not restricted",
restricted_column->name_as_text(), clustering_column->name_as_text()));
}
}
_uses_secondary_indexing = true; // handle gaps and non-keyrange cases.
break;
}
}
if (_clustering_columns_restrictions->is_contains()) {
_uses_secondary_indexing = true;
}
}
dht::partition_range_vector statement_restrictions::get_partition_key_ranges(const query_options& options) const {
if (_partition_key_restrictions->empty()) {
return {dht::partition_range::make_open_ended_both_sides()};
}
if (_partition_key_restrictions->needs_filtering(*_schema)) {
return {dht::partition_range::make_open_ended_both_sides()};
}
return _partition_key_restrictions->bounds_ranges(options);
}
@@ -401,18 +413,30 @@ std::vector<query::clustering_range> statement_restrictions::get_clustering_boun
if (_clustering_columns_restrictions->empty()) {
return {query::clustering_range::make_open_ended_both_sides()};
}
// TODO(sarna): For filtering, the clustering range is left completely unbounded.
// To make filtering faster, the longest restricted clustering prefix should be used to bound the range here.
if (_clustering_columns_restrictions->needs_filtering(*_schema)) {
return {query::clustering_range::make_open_ended_both_sides()};
}
return _clustering_columns_restrictions->bounds_ranges(options);
}
bool statement_restrictions::need_filtering() {
bool statement_restrictions::need_filtering() const {
uint32_t number_of_restricted_columns = 0;
for (auto&& restrictions : _index_restrictions) {
number_of_restricted_columns += restrictions->size();
}
if (_partition_key_restrictions->is_multi_column() || _clustering_columns_restrictions->is_multi_column()) {
// TODO(sarna): Implement ALLOW FILTERING support for multi-column restrictions - return false for now
// in order to ensure backwards compatibility
return false;
}
return number_of_restricted_columns > 1
|| (number_of_restricted_columns == 0 && has_clustering_columns_restriction())
|| (number_of_restricted_columns != 0 && _nonprimary_key_restrictions->has_multiple_contains());
|| (number_of_restricted_columns == 0 && _partition_key_restrictions->empty() && !_clustering_columns_restrictions->empty())
|| (number_of_restricted_columns != 0 && _nonprimary_key_restrictions->has_multiple_contains())
|| (number_of_restricted_columns != 0 && !_uses_secondary_indexing);
}
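The predicate above can be restated as a standalone sketch. The struct and its field names are invented for illustration; the real function reads the restriction objects directly and, as shown above, returns false early for multi-column restrictions, which the sketch omits.

```cpp
// Illustrative restatement of need_filtering(): filtering is needed when more
// than one column is restricted, when clustering columns are restricted with
// no partition key restriction at all, when multiple CONTAINS relations
// appear, or when restricted columns cannot be served by a secondary index.
struct restriction_summary {
    unsigned restricted_columns;
    bool pk_empty;
    bool ck_restricted;
    bool multiple_contains;
    bool uses_index;
};

bool need_filtering(const restriction_summary& r) {
    return r.restricted_columns > 1
        || (r.restricted_columns == 0 && r.pk_empty && r.ck_restricted)
        || (r.restricted_columns != 0 && r.multiple_contains)
        || (r.restricted_columns != 0 && !r.uses_index);
}
```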
void statement_restrictions::validate_secondary_index_selections(bool selects_only_static_columns) {
@@ -430,6 +454,33 @@ void statement_restrictions::validate_secondary_index_selections(bool selects_on
}
}
const single_column_restrictions::restrictions_map& statement_restrictions::get_single_column_partition_key_restrictions() const {
static single_column_restrictions::restrictions_map empty;
auto single_restrictions = dynamic_pointer_cast<single_column_primary_key_restrictions<partition_key>>(_partition_key_restrictions);
if (!single_restrictions) {
if (dynamic_pointer_cast<initial_key_restrictions<partition_key>>(_partition_key_restrictions)) {
return empty;
}
throw std::runtime_error("statement restrictions for multi-column partition key restrictions are not implemented yet");
}
return single_restrictions->restrictions();
}
/**
* @return clustering key restrictions split into single column restrictions (e.g. for filtering support).
*/
const single_column_restrictions::restrictions_map& statement_restrictions::get_single_column_clustering_key_restrictions() const {
static single_column_restrictions::restrictions_map empty;
auto single_restrictions = dynamic_pointer_cast<single_column_primary_key_restrictions<clustering_key>>(_clustering_columns_restrictions);
if (!single_restrictions) {
if (dynamic_pointer_cast<initial_key_restrictions<clustering_key>>(_clustering_columns_restrictions)) {
return empty;
}
throw std::runtime_error("statement restrictions for multi-column clustering key restrictions are not implemented yet");
}
return single_restrictions->restrictions();
}
static std::optional<atomic_cell_value_view> do_get_value(const schema& schema,
const column_definition& cdef,
const partition_key& key,
@@ -482,6 +533,14 @@ bool single_column_restriction::EQ::is_satisfied_by(const schema& schema,
return false;
}
bool single_column_restriction::EQ::is_satisfied_by(bytes_view data, const query_options& options) const {
if (_column_def.type->is_counter()) {
fail(unimplemented::cause::COUNTERS);
}
auto operand = value(options);
return operand && _column_def.type->compare(*operand, data) == 0;
}
bool single_column_restriction::IN::is_satisfied_by(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
@@ -503,6 +562,16 @@ bool single_column_restriction::IN::is_satisfied_by(const schema& schema,
});
}
bool single_column_restriction::IN::is_satisfied_by(bytes_view data, const query_options& options) const {
if (_column_def.type->is_counter()) {
fail(unimplemented::cause::COUNTERS);
}
auto operands = values(options);
return boost::algorithm::any_of(operands, [this, &data] (const bytes_opt& operand) {
return operand && _column_def.type->compare(*operand, data) == 0;
});
}
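The two checks above share one shape: EQ matches when the bound operand equals the cell value, and IN matches when any of the bound operands does (a null operand never matches). A minimal sketch with `bytes_view` replaced by `int` and the type comparator replaced by `==`; the helper names are invented:

```cpp
#include <algorithm>
#include <optional>
#include <vector>

// Simplified model of EQ::is_satisfied_by and IN::is_satisfied_by on a raw
// cell value. A disengaged optional stands in for an unset bind variable.
bool eq_satisfied(std::optional<int> operand, int data) {
    return operand && *operand == data;
}

bool in_satisfied(const std::vector<std::optional<int>>& operands, int data) {
    // Mirrors the any_of over the IN list in the code above.
    return std::any_of(operands.begin(), operands.end(),
                       [&](const std::optional<int>& o) { return o && *o == data; });
}
```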
static query::range<bytes_view> to_range(const term_slice& slice, const query_options& options) {
using range_type = query::range<bytes_view>;
auto extract_bound = [&] (statements::bound bound) -> stdx::optional<range_type::bound> {
@@ -538,6 +607,13 @@ bool single_column_restriction::slice::is_satisfied_by(const schema& schema,
});
}
bool single_column_restriction::slice::is_satisfied_by(bytes_view data, const query_options& options) const {
if (_column_def.type->is_counter()) {
fail(unimplemented::cause::COUNTERS);
}
return to_range(_slice, options).contains(data, _column_def.type->as_tri_comparator());
}
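The slice check above reduces to a range-containment test against optional, possibly exclusive bounds. A sketch with bounds already deserialized to `int` (the real code compares serialized bytes through the column type's tri-comparator); all names here are illustrative:

```cpp
#include <optional>

// Simplified model of slice::is_satisfied_by: a value satisfies a slice
// restriction when it lies within the (possibly half-open) range. An absent
// bound means the range is open on that side.
struct bound { int value; bool inclusive; };

bool slice_contains(std::optional<bound> lo, std::optional<bound> hi, int v) {
    if (lo && (v < lo->value || (v == lo->value && !lo->inclusive))) {
        return false;
    }
    if (hi && (v > hi->value || (v == hi->value && !hi->inclusive))) {
        return false;
    }
    return true;
}
```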
bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
@@ -680,6 +756,11 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
return true;
}
bool single_column_restriction::contains::is_satisfied_by(bytes_view data, const query_options& options) const {
// TODO(sarna): Deserialize & return. It would be nice to deduplicate; the is_satisfied_by() overload above is rather long.
fail(unimplemented::cause::INDEXES);
}
bool token_restriction::EQ::is_satisfied_by(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,


@@ -120,8 +120,8 @@ public:
bool for_view = false,
bool allow_filtering = false);
private:
void add_restriction(::shared_ptr<restriction> restriction);
void add_single_column_restriction(::shared_ptr<single_column_restriction> restriction);
void add_restriction(::shared_ptr<restriction> restriction, bool for_view, bool allow_filtering);
void add_single_column_restriction(::shared_ptr<single_column_restriction> restriction, bool for_view, bool allow_filtering);
public:
bool uses_function(const sstring& ks_name, const sstring& function_name) const;
@@ -175,7 +175,7 @@ public:
*/
bool has_unrestricted_clustering_columns() const;
private:
void process_partition_key_restrictions(bool has_queriable_index, bool for_view);
void process_partition_key_restrictions(bool has_queriable_index, bool for_view, bool allow_filtering);
/**
* Returns the partition key components that are not restricted.
@@ -190,7 +190,7 @@ private:
* @param select_a_collection <code>true</code> if the query should return a collection column
* @throws InvalidRequestException if the request is invalid
*/
void process_clustering_columns_restrictions(bool has_queriable_index, bool select_a_collection, bool for_view);
void process_clustering_columns_restrictions(bool has_queriable_index, bool select_a_collection, bool for_view, bool allow_filtering);
/**
* Returns the <code>Restrictions</code> for the specified type of columns.
@@ -358,7 +358,7 @@ public:
* Checks if the query needs to use filtering.
* @return <code>true</code> if the query needs to use filtering, <code>false</code> otherwise.
*/
bool need_filtering();
bool need_filtering() const;
void validate_secondary_index_selections(bool selects_only_static_columns);
@@ -399,6 +399,16 @@ public:
const single_column_restrictions::restrictions_map& get_non_pk_restriction() const {
return _nonprimary_key_restrictions->restrictions();
}
/**
* @return partition key restrictions split into single column restrictions (e.g. for filtering support).
*/
const single_column_restrictions::restrictions_map& get_single_column_partition_key_restrictions() const;
/**
* @return clustering key restrictions split into single column restrictions (e.g. for filtering support).
*/
const single_column_restrictions::restrictions_map& get_single_column_clustering_key_restrictions() const;
};
}


@@ -330,93 +330,86 @@ std::unique_ptr<result_set> result_set_builder::build() {
return std::move(_result_set);
}
result_set_builder::visitor::visitor(
cql3::selection::result_set_builder& builder, const schema& s,
const selection& selection)
: _builder(builder), _schema(s), _selection(selection), _row_count(0) {
}
bool result_set_builder::restrictions_filter::operator()(const selection& selection,
const std::vector<bytes>& partition_key,
const std::vector<bytes>& clustering_key,
const query::result_row_view& static_row,
const query::result_row_view& row) const {
static logging::logger rlogger("restrictions_filter");
void result_set_builder::visitor::add_value(const column_definition& def,
query::result_row_view::iterator_type& i) {
if (def.type->is_multi_cell()) {
auto cell = i.next_collection_cell();
if (!cell) {
_builder.add_empty();
return;
}
_builder.add_collection(def, cell->linearize());
} else {
auto cell = i.next_atomic_cell();
if (!cell) {
_builder.add_empty();
return;
}
_builder.add(def, *cell);
if (_current_pratition_key_does_not_match || _current_static_row_does_not_match) {
return false;
}
}
void result_set_builder::visitor::accept_new_partition(const partition_key& key,
uint32_t row_count) {
_partition_key = key.explode(_schema);
_row_count = row_count;
}
void result_set_builder::visitor::accept_new_partition(uint32_t row_count) {
_row_count = row_count;
}
void result_set_builder::visitor::accept_new_row(const clustering_key& key,
const query::result_row_view& static_row,
const query::result_row_view& row) {
_clustering_key = key.explode(_schema);
accept_new_row(static_row, row);
}
void result_set_builder::visitor::accept_new_row(
const query::result_row_view& static_row,
const query::result_row_view& row) {
auto static_row_iterator = static_row.iterator();
auto row_iterator = row.iterator();
_builder.new_row();
for (auto&& def : _selection.get_columns()) {
switch (def->kind) {
case column_kind::partition_key:
_builder.add(_partition_key[def->component_index()]);
break;
case column_kind::clustering_key:
if (_clustering_key.size() > def->component_index()) {
_builder.add(_clustering_key[def->component_index()]);
auto non_pk_restrictions_map = _restrictions->get_non_pk_restriction();
auto partition_key_restrictions_map = _restrictions->get_single_column_partition_key_restrictions();
auto clustering_key_restrictions_map = _restrictions->get_single_column_clustering_key_restrictions();
for (auto&& cdef : selection.get_columns()) {
switch (cdef->kind) {
case column_kind::static_column:
// fallthrough
case column_kind::regular_column:
if (cdef->type->is_multi_cell()) {
rlogger.debug("Multi-cell filtering is not implemented yet for column {}", cdef->name_as_text());
} else {
_builder.add({});
auto cell_iterator = (cdef->kind == column_kind::static_column) ? static_row_iterator : row_iterator;
auto cell = cell_iterator.next_atomic_cell();
auto restr_it = non_pk_restrictions_map.find(cdef);
if (restr_it == non_pk_restrictions_map.end()) {
continue;
}
restrictions::single_column_restriction& restriction = *restr_it->second;
bool regular_restriction_matches;
if (cell) {
regular_restriction_matches = cell->value().with_linearized([&restriction](bytes_view data) {
return restriction.is_satisfied_by(data, cql3::query_options({ }));
});
} else {
regular_restriction_matches = restriction.is_satisfied_by(bytes(), cql3::query_options({ }));
}
if (!regular_restriction_matches) {
_current_static_row_does_not_match = (cdef->kind == column_kind::static_column);
return false;
}
}
break;
case column_kind::regular_column:
add_value(*def, row_iterator);
case column_kind::partition_key: {
auto restr_it = partition_key_restrictions_map.find(cdef);
if (restr_it == partition_key_restrictions_map.end()) {
continue;
}
restrictions::single_column_restriction& restriction = *restr_it->second;
const bytes& value_to_check = partition_key[cdef->id];
bool pk_restriction_matches = restriction.is_satisfied_by(value_to_check, cql3::query_options({ }));
if (!pk_restriction_matches) {
_current_pratition_key_does_not_match = true;
return false;
}
}
break;
case column_kind::static_column:
add_value(*def, static_row_iterator);
case column_kind::clustering_key: {
auto restr_it = clustering_key_restrictions_map.find(cdef);
if (restr_it == clustering_key_restrictions_map.end()) {
continue;
}
restrictions::single_column_restriction& restriction = *restr_it->second;
const bytes& value_to_check = clustering_key[cdef->id];
bool ck_restriction_matches = restriction.is_satisfied_by(value_to_check, cql3::query_options({ }));
if (!ck_restriction_matches) {
return false;
}
}
break;
default:
assert(0);
}
}
}
void result_set_builder::visitor::accept_partition_end(
const query::result_row_view& static_row) {
if (_row_count == 0) {
_builder.new_row();
auto static_row_iterator = static_row.iterator();
for (auto&& def : _selection.get_columns()) {
if (def->is_partition_key()) {
_builder.add(_partition_key[def->component_index()]);
} else if (def->is_static()) {
add_value(*def, static_row_iterator);
} else {
_builder.add_empty();
}
break;
}
}
return true;
}
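The two cached flags above exist to short-circuit per-partition work: once the partition key (or the static row) fails its restriction, every remaining row in that partition is rejected without re-evaluating the restriction, until `reset()` runs on the next partition. A condensed sketch of that mechanism with invented names:

```cpp
// Illustrative model of the restrictions_filter short-circuit. pk_ok and
// row_ok stand in for the actual restriction checks on the partition key
// and on the regular columns of one row.
struct row_filter {
    bool pk_failed = false;

    bool operator()(bool pk_ok, bool row_ok) {
        if (pk_failed) {
            return false;              // cached partition-level failure
        }
        if (!pk_ok) {
            pk_failed = true;          // remember: whole partition fails
            return false;
        }
        return row_ok;                 // per-row restrictions decide
    }
    void reset() { pk_failed = false; } // called when a new partition starts
};
```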
api::timestamp_type result_set_builder::timestamp_of(size_t idx) {


@@ -48,6 +48,7 @@
#include "exceptions/exceptions.hh"
#include "cql3/selection/raw_selector.hh"
#include "cql3/selection/selector_factories.hh"
#include "cql3/restrictions/statement_restrictions.hh"
#include "unimplemented.hh"
namespace cql3 {
@@ -247,6 +248,28 @@ private:
const gc_clock::time_point _now;
cql_serialization_format _cql_serialization_format;
public:
class nop_filter {
public:
inline bool operator()(const selection&, const std::vector<bytes>&, const std::vector<bytes>&, const query::result_row_view&, const query::result_row_view&) const {
return true;
}
void reset() {
}
};
class restrictions_filter {
::shared_ptr<restrictions::statement_restrictions> _restrictions;
mutable bool _current_pratition_key_does_not_match = false;
mutable bool _current_static_row_does_not_match = false;
public:
restrictions_filter() = default;
explicit restrictions_filter(::shared_ptr<restrictions::statement_restrictions> restrictions) : _restrictions(restrictions) {}
bool operator()(const selection& selection, const std::vector<bytes>& pk, const std::vector<bytes>& ck, const query::result_row_view& static_row, const query::result_row_view& row) const;
void reset() {
_current_pratition_key_does_not_match = false;
_current_static_row_does_not_match = false;
}
};
result_set_builder(const selection& s, gc_clock::time_point now, cql_serialization_format sf);
void add_empty();
void add(bytes_opt value);
@@ -256,8 +279,9 @@ public:
std::unique_ptr<result_set> build();
api::timestamp_type timestamp_of(size_t idx);
int32_t ttl_of(size_t idx);
// Implements ResultVisitor concept from query.hh
template<typename Filter = nop_filter>
class visitor {
protected:
result_set_builder& _builder;
@@ -266,20 +290,100 @@ public:
uint32_t _row_count;
std::vector<bytes> _partition_key;
std::vector<bytes> _clustering_key;
Filter _filter;
public:
visitor(cql3::selection::result_set_builder& builder, const schema& s, const selection&);
visitor(cql3::selection::result_set_builder& builder, const schema& s,
const selection& selection, Filter filter = Filter())
: _builder(builder)
, _schema(s)
, _selection(selection)
, _row_count(0)
, _filter(filter)
{}
visitor(visitor&&) = default;
void add_value(const column_definition& def, query::result_row_view::iterator_type& i);
void accept_new_partition(const partition_key& key, uint32_t row_count);
void accept_new_partition(uint32_t row_count);
void accept_new_row(const clustering_key& key,
const query::result_row_view& static_row,
const query::result_row_view& row);
void accept_new_row(const query::result_row_view& static_row,
const query::result_row_view& row);
void accept_partition_end(const query::result_row_view& static_row);
void add_value(const column_definition& def, query::result_row_view::iterator_type& i) {
if (def.type->is_multi_cell()) {
auto cell = i.next_collection_cell();
if (!cell) {
_builder.add_empty();
return;
}
_builder.add_collection(def, cell->linearize());
} else {
auto cell = i.next_atomic_cell();
if (!cell) {
_builder.add_empty();
return;
}
_builder.add(def, *cell);
}
}
void accept_new_partition(const partition_key& key, uint32_t row_count) {
_partition_key = key.explode(_schema);
_row_count = row_count;
_filter.reset();
}
void accept_new_partition(uint32_t row_count) {
_row_count = row_count;
_filter.reset();
}
void accept_new_row(const clustering_key& key, const query::result_row_view& static_row, const query::result_row_view& row) {
_clustering_key = key.explode(_schema);
accept_new_row(static_row, row);
}
void accept_new_row(const query::result_row_view& static_row, const query::result_row_view& row) {
auto static_row_iterator = static_row.iterator();
auto row_iterator = row.iterator();
if (!_filter(_selection, _partition_key, _clustering_key, static_row, row)) {
return;
}
_builder.new_row();
for (auto&& def : _selection.get_columns()) {
switch (def->kind) {
case column_kind::partition_key:
_builder.add(_partition_key[def->component_index()]);
break;
case column_kind::clustering_key:
if (_clustering_key.size() > def->component_index()) {
_builder.add(_clustering_key[def->component_index()]);
} else {
_builder.add({});
}
break;
case column_kind::regular_column:
add_value(*def, row_iterator);
break;
case column_kind::static_column:
add_value(*def, static_row_iterator);
break;
default:
assert(0);
}
}
}
void accept_partition_end(const query::result_row_view& static_row) {
if (_row_count == 0) {
_builder.new_row();
auto static_row_iterator = static_row.iterator();
for (auto&& def : _selection.get_columns()) {
if (def->is_partition_key()) {
_builder.add(_partition_key[def->component_index()]);
} else if (def->is_static()) {
add_value(*def, static_row_iterator);
} else {
_builder.add_empty();
}
}
}
}
};
private:
bytes_opt get_value(data_type t, query::result_atomic_cell_view c);
};
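The `Filter` template parameter defaulting to `nop_filter` is a zero-cost hook: the unfiltered path compiles to the plain row loop, while instantiating with a filtering type enables per-row checks. A minimal sketch of the pattern with rows reduced to `int`; `visit()`, `even_filter()` and the surrounding names are illustrative, not Scylla classes:

```cpp
#include <vector>

// A no-op filter accepting every row, mirroring nop_filter above.
struct nop_filter {
    bool operator()(int) const { return true; }
    void reset() {}
};

// An example filtering policy, standing in for restrictions_filter.
struct even_filter {
    bool operator()(int v) const { return v % 2 == 0; }
    void reset() {}
};

// The visitor is templated on the filter, so the default case inlines to an
// unconditional loop with no runtime dispatch.
template <typename Filter = nop_filter>
std::vector<int> visit(const std::vector<int>& rows, Filter f = Filter{}) {
    std::vector<int> accepted;
    f.reset(); // mirrors accept_new_partition() resetting the filter state
    for (int r : rows) {
        if (f(r)) {
            accepted.push_back(r);
        }
    }
    return accepted;
}
```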


@@ -105,11 +105,9 @@ public:
virtual void reset() = 0;
virtual assignment_testable::test_result test_assignment(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) override {
auto t1 = receiver->type->underlying_type();
auto t2 = get_type()->underlying_type();
if (t1 == t2) {
if (receiver->type == get_type()) {
return assignment_testable::test_result::EXACT_MATCH;
} else if (t1->is_value_compatible_with(*t2)) {
} else if (receiver->type->is_value_compatible_with(*get_type())) {
return assignment_testable::test_result::WEAKLY_ASSIGNABLE;
} else {
return assignment_testable::test_result::NOT_ASSIGNABLE;


@@ -118,7 +118,8 @@ private:
schema_ptr schema,
::shared_ptr<variable_specifications> bound_names,
::shared_ptr<selection::selection> selection,
bool for_view = false);
bool for_view = false,
bool allow_filtering = false);
/** Returns a ::shared_ptr<term> for the limit or null if no limit is set */
::shared_ptr<term> prepare_limit(database& db, ::shared_ptr<variable_specifications> bound_names);


@@ -96,8 +96,12 @@ public:
encoded_row.write("\\\"", 2);
}
encoded_row.write("\": ", 3);
sstring row_sstring = _selector_types[i]->to_json_string(parameters[i]);
encoded_row.write(row_sstring.c_str(), row_sstring.size());
if (parameters[i]) {
sstring row_sstring = _selector_types[i]->to_json_string(parameters[i].value());
encoded_row.write(row_sstring.c_str(), row_sstring.size());
} else {
encoded_row.write("null", 4);
}
}
encoded_row.write("}", 1);
return encoded_row.linearize().to_string();
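The guard added above makes an unset column serialize as JSON `null` instead of dereferencing an empty value. A self-contained sketch of the same fix, with the selector machinery replaced by a plain name/value list (all names here are invented for illustration):

```cpp
#include <optional>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

// Illustrative model of the to_json fix: a disengaged optional stands in for
// a missing cell and must be emitted as the JSON literal null.
std::string row_to_json(
        const std::vector<std::pair<std::string, std::optional<int>>>& row) {
    std::ostringstream out;
    out << '{';
    bool first = true;
    for (const auto& [name, value] : row) {
        if (!first) {
            out << ", ";
        }
        first = false;
        out << '"' << name << "\": ";
        if (value) {
            out << *value;       // present value: serialize normally
        } else {
            out << "null";       // the guard added by this change
        }
    }
    out << '}';
    return out.str();
}
```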
@@ -380,6 +384,7 @@ select_statement::do_execute(service::storage_proxy& proxy,
auto now = gc_clock::now();
++_stats.reads;
_stats.filtered_reads += _restrictions->need_filtering();
auto command = ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(),
make_partition_slice(options), limit, now, tracing::make_trace_info(state.get_trace_state()), query::max_partitions, utils::UUID(), options.get_timestamp(state));
@@ -405,7 +410,7 @@ select_statement::do_execute(service::storage_proxy& proxy,
command->slice.options.set<query::partition_slice::option::allow_short_read>();
auto timeout = options.get_timeout_config().*get_timeout_config_selector();
auto p = service::pager::query_pagers::pager(_schema, _selection,
state, options, timeout, command, std::move(key_ranges));
state, options, timeout, command, std::move(key_ranges), _stats, _restrictions->need_filtering() ? _restrictions : nullptr);
if (aggregate) {
return do_with(
@@ -419,6 +424,7 @@ select_statement::do_execute(service::storage_proxy& proxy,
).then([this, &builder] {
auto rs = builder.build();
update_stats_rows_read(rs->size());
_stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
});
@@ -431,7 +437,7 @@ select_statement::do_execute(service::storage_proxy& proxy,
" you must either remove the ORDER BY or the IN and sort client side, or disable paging for this query");
}
if (_selection->is_trivial()) {
if (_selection->is_trivial() && !_restrictions->need_filtering()) {
return p->fetch_page_generator(page_size, now, _stats).then([this, p, limit] (result_generator generator) {
auto meta = make_shared<metadata>(*_selection->get_result_metadata());
if (!p->is_exhausted()) {
@@ -452,6 +458,7 @@ select_statement::do_execute(service::storage_proxy& proxy,
}
update_stats_rows_read(rs->size());
_stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
});
@@ -550,7 +557,7 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
const query_options& options,
gc_clock::time_point now)
{
bool fast_path = !needs_post_query_ordering() && _selection->is_trivial();
bool fast_path = !needs_post_query_ordering() && _selection->is_trivial() && !_restrictions->need_filtering();
if (fast_path) {
return make_shared<cql_transport::messages::result_message::rows>(result(
result_generator(_schema, std::move(results), std::move(cmd), _selection, _stats),
@@ -560,9 +567,17 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
cql3::selection::result_set_builder builder(*_selection, now,
options.get_cql_serialization_format());
query::result_view::consume(*results, cmd->slice,
cql3::selection::result_set_builder::visitor(builder, *_schema,
*_selection));
if (_restrictions->need_filtering()) {
results->ensure_counts();
_stats.filtered_rows_read_total += *results->row_count();
query::result_view::consume(*results, cmd->slice,
cql3::selection::result_set_builder::visitor(builder, *_schema,
*_selection, cql3::selection::result_set_builder::restrictions_filter(_restrictions)));
} else {
query::result_view::consume(*results, cmd->slice,
cql3::selection::result_set_builder::visitor(builder, *_schema,
*_selection));
}
auto rs = builder.build();
if (needs_post_query_ordering()) {
@@ -573,6 +588,7 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
rs->trim(cmd->row_limit);
}
update_stats_rows_read(rs->size());
_stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
return ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
}
@@ -953,7 +969,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(database& db, cql_
? selection::selection::wildcard(schema)
: selection::selection::from_selectors(db, schema, _select_clause);
auto restrictions = prepare_restrictions(db, schema, bound_names, selection, for_view);
auto restrictions = prepare_restrictions(db, schema, bound_names, selection, for_view, _parameters->allow_filtering());
if (_parameters->is_distinct()) {
validate_distinct_selection(schema, selection, restrictions);
@@ -970,10 +986,6 @@ std::unique_ptr<prepared_statement> select_statement::prepare(database& db, cql_
}
check_needs_filtering(restrictions);
size_t restrictions_size = restrictions->get_partition_key_restrictions()->size() + restrictions->get_clustering_columns_restrictions()->size() + restrictions->get_non_pk_restriction().size();
if (restrictions->uses_secondary_indexing() && restrictions_size > 1) {
throw exceptions::invalid_request_exception("Indexed query may not contain multiple restrictions in 2.3");
}
::shared_ptr<cql3::statements::select_statement> stmt;
if (restrictions->uses_secondary_indexing()) {
@@ -1011,13 +1023,14 @@ select_statement::prepare_restrictions(database& db,
schema_ptr schema,
::shared_ptr<variable_specifications> bound_names,
::shared_ptr<selection::selection> selection,
bool for_view)
bool for_view,
bool allow_filtering)
{
try {
// FIXME: this method should take a separate allow_filtering parameter
// and pass it on. Currently we pass "for_view" as allow_filtering.
return ::make_shared<restrictions::statement_restrictions>(db, schema, statement_type::SELECT, std::move(_where_clause), bound_names,
selection->contains_only_static_columns(), selection->contains_a_collection(), for_view, for_view);
selection->contains_only_static_columns(), selection->contains_a_collection(), for_view, allow_filtering);
} catch (const exceptions::unrecognized_entity_exception& e) {
if (contains_alias(e.entity)) {
throw exceptions::invalid_request_exception(sprint("Aliases aren't allowed in the where clause ('%s')", e.relation->to_string()));

@@ -179,21 +179,7 @@ modification_statement::json_cache_opt insert_prepared_json_statement::maybe_pre
void
insert_prepared_json_statement::execute_set_value(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params, const column_definition& column, const bytes_opt& value) {
if (!value) {
if (column.type->is_collection()) {
auto& k = static_pointer_cast<const collection_type_impl>(column.type)->_kind;
if (&k == &collection_type_impl::kind::list) {
lists::setter::execute(m, prefix, params, column, make_shared<lists::value>(lists::value(std::vector<bytes_opt>())));
} else if (&k == &collection_type_impl::kind::set) {
sets::setter::execute(m, prefix, params, column, make_shared<sets::value>(sets::value(std::set<bytes, serialized_compare>(serialized_compare(empty_type)))));
} else if (&k == &collection_type_impl::kind::map) {
maps::setter::execute(m, prefix, params, column, make_shared<maps::value>(maps::value(std::map<bytes, bytes, serialized_compare>(serialized_compare(empty_type)))));
} else {
throw exceptions::invalid_request_exception("Incorrect value kind in JSON INSERT statement");
}
return;
}
m.set_cell(prefix, column, std::move(operation::make_dead_cell(params)));
return;
} else if (!column.type->is_collection()) {
constants::setter::execute(m, prefix, params, column, raw_value_view::make_value(bytes_view(*value)));
return;
@@ -218,17 +204,15 @@ insert_prepared_json_statement::execute_set_value(mutation& m, const clustering_
dht::partition_range_vector
insert_prepared_json_statement::build_partition_keys(const query_options& options, const json_cache_opt& json_cache) {
dht::partition_range_vector ranges;
std::vector<bytes_opt> exploded;
for (const auto& def : s->partition_key_columns()) {
auto json_value = json_cache->at(def.name_as_text());
if (!json_value) {
throw exceptions::invalid_request_exception(sprint("Missing mandatory PRIMARY KEY part %s", def.name_as_text()));
}
exploded.emplace_back(*json_value);
auto k = query::range<partition_key>::make_singular(partition_key::from_single_value(*s, json_value.value()));
ranges.emplace_back(std::move(k).transform(
[this] (partition_key&& k) -> query::ring_position {
auto token = dht::global_partitioner().get_token(*s, k);
return { std::move(token), std::move(k) };
}));
}
auto pkey = partition_key::from_optional_exploded(*s, std::move(exploded));
auto k = query::range<query::ring_position>::make_singular(dht::global_partitioner().decorate_key(*s, std::move(pkey)));
ranges.emplace_back(std::move(k));
return ranges;
}
@@ -237,10 +221,7 @@ query::clustering_row_ranges insert_prepared_json_statement::create_clustering_r
std::vector<bytes_opt> exploded;
for (const auto& def : s->clustering_key_columns()) {
auto json_value = json_cache->at(def.name_as_text());
if (!json_value) {
throw exceptions::invalid_request_exception(sprint("Missing mandatory PRIMARY KEY part %s", def.name_as_text()));
}
exploded.emplace_back(*json_value);
exploded.emplace_back(json_value.value());
}
auto k = query::range<clustering_key_prefix>::make_singular(clustering_key_prefix::from_optional_exploded(*s, std::move(exploded)));
ranges.emplace_back(query::clustering_range(std::move(k)));

@@ -41,6 +41,10 @@ struct cql_stats {
int64_t secondary_index_drops = 0;
int64_t secondary_index_reads = 0;
int64_t secondary_index_rows_read = 0;
int64_t filtered_reads = 0;
int64_t filtered_rows_matched_total = 0;
int64_t filtered_rows_read_total = 0;
};
}

@@ -405,7 +405,7 @@ public:
in_marker(int32_t bind_index, ::shared_ptr<column_specification> receiver)
: abstract_marker(bind_index, std::move(receiver))
{
assert(dynamic_pointer_cast<const list_type_impl>(_receiver->type));
assert(dynamic_pointer_cast<const list_type_impl>(receiver->type));
}
virtual shared_ptr<terminal> bind(const query_options& options) override {

@@ -53,9 +53,6 @@ update_parameters::get_prefetched_list(
return {};
}
if (column.is_static()) {
ckey = clustering_key_view::make_empty();
}
auto i = _prefetched->rows.find(std::make_pair(std::move(pkey), std::move(ckey)));
if (i == _prefetched->rows.end()) {
return {};

@@ -182,7 +182,7 @@ thread_local dirty_memory_manager default_dirty_memory_manager;
lw_shared_ptr<memtable_list>
table::make_memory_only_memtable_list() {
auto get_schema = [this] { return schema(); };
return make_lw_shared<memtable_list>(std::move(get_schema), _config.dirty_memory_manager);
return make_lw_shared<memtable_list>(std::move(get_schema), _config.dirty_memory_manager, _config.memory_compaction_scheduling_group);
}
lw_shared_ptr<memtable_list>
@@ -191,7 +191,7 @@ table::make_memtable_list() {
return seal_active_memtable(std::move(permit));
};
auto get_schema = [this] { return schema(); };
return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.dirty_memory_manager);
return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.dirty_memory_manager, _config.memory_compaction_scheduling_group);
}
lw_shared_ptr<memtable_list>
@@ -200,7 +200,7 @@ table::make_streaming_memtable_list() {
return seal_active_streaming_memtable_immediate(std::move(permit));
};
auto get_schema = [this] { return schema(); };
return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.streaming_dirty_memory_manager);
return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.streaming_dirty_memory_manager, _config.streaming_scheduling_group);
}
lw_shared_ptr<memtable_list>
@@ -209,7 +209,7 @@ table::make_streaming_memtable_big_list(streaming_memtable_big& smb) {
return seal_active_streaming_memtable_big(smb, std::move(permit));
};
auto get_schema = [this] { return schema(); };
return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.streaming_dirty_memory_manager);
return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.streaming_dirty_memory_manager, _config.streaming_scheduling_group);
}
table::table(schema_ptr schema, config config, db::commitlog* cl, compaction_manager& compaction_manager, cell_locker_stats& cl_stats, cache_tracker& row_cache_tracker)
@@ -237,7 +237,7 @@ partition_presence_checker
table::make_partition_presence_checker(lw_shared_ptr<sstables::sstable_set> sstables) {
auto sel = make_lw_shared(sstables->make_incremental_selector());
return [this, sstables = std::move(sstables), sel = std::move(sel)] (const dht::decorated_key& key) {
auto& sst = sel->select(key.token()).sstables;
auto& sst = sel->select(key).sstables;
if (sst.empty()) {
return partition_presence_checker_result::definitely_doesnt_exist;
}
@@ -453,7 +453,7 @@ public:
const dht::partition_range& pr,
tracing::trace_state_ptr trace_state,
sstable_reader_factory_type fn)
: reader_selector(s, pr.start() ? pr.start()->value() : dht::ring_position::min())
: reader_selector(s, pr.start() ? pr.start()->value() : dht::ring_position_view::min())
, _pr(&pr)
, _sstables(std::move(sstables))
, _trace_state(std::move(trace_state))
@@ -472,47 +472,34 @@ public:
incremental_reader_selector(incremental_reader_selector&&) = delete;
incremental_reader_selector& operator=(incremental_reader_selector&&) = delete;
virtual std::vector<flat_mutation_reader> create_new_readers(const dht::token* const t) override {
dblog.trace("incremental_reader_selector {}: {}({})", this, __FUNCTION__, seastar::lazy_deref(t));
virtual std::vector<flat_mutation_reader> create_new_readers(const std::optional<dht::ring_position_view>& pos) override {
dblog.trace("incremental_reader_selector {}: {}({})", this, __FUNCTION__, seastar::lazy_deref(pos));
const auto& position = (t ? *t : _selector_position.token());
// we only pass _selector_position's token to _selector::select() when T is nullptr
// because it means gap between sstables, and the lower bound of the first interval
// after the gap is guaranteed to be inclusive.
auto selection = _selector.select(position);
auto readers = std::vector<flat_mutation_reader>();
if (selection.sstables.empty()) {
// For the lower bound of the token range the _selector
// might not return any sstables, in this case try again
// with next_token unless it's maximum token.
if (!selection.next_position.is_max()
&& position == (_pr->start() ? _pr->start()->value().token() : dht::minimum_token())) {
dblog.trace("incremental_reader_selector {}: no sstables intersect with the lower bound, retrying", this);
_selector_position = std::move(selection.next_position);
return create_new_readers(nullptr);
}
do {
auto selection = _selector.select(_selector_position);
_selector_position = selection.next_position;
_selector_position = dht::ring_position::max();
return {};
}
dblog.trace("incremental_reader_selector {}: {} sstables to consider, advancing selector to {}", this, selection.sstables.size(),
_selector_position);
_selector_position = std::move(selection.next_position);
readers = boost::copy_range<std::vector<flat_mutation_reader>>(selection.sstables
| boost::adaptors::filtered([this] (auto& sst) { return _read_sstables.emplace(sst).second; })
| boost::adaptors::transformed([this] (auto& sst) { return this->create_reader(sst); }));
} while (!_selector_position.is_max() && readers.empty() && (!pos || dht::ring_position_tri_compare(*_s, *pos, _selector_position) >= 0));
dblog.trace("incremental_reader_selector {}: {} new sstables to consider, advancing selector to {}", this, selection.sstables.size(), _selector_position);
dblog.trace("incremental_reader_selector {}: created {} new readers", this, readers.size());
return boost::copy_range<std::vector<flat_mutation_reader>>(selection.sstables
| boost::adaptors::filtered([this] (auto& sst) { return _read_sstables.emplace(sst).second; })
| boost::adaptors::transformed([this] (auto& sst) {
return this->create_reader(sst);
}));
return readers;
}
virtual std::vector<flat_mutation_reader> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override {
_pr = &pr;
dht::ring_position_comparator cmp(*_s);
if (cmp(dht::ring_position_view::for_range_start(*_pr), _selector_position) >= 0) {
return create_new_readers(&_pr->start()->value().token());
auto pos = dht::ring_position_view::for_range_start(*_pr);
if (dht::ring_position_tri_compare(*_s, pos, _selector_position) >= 0) {
return create_new_readers(pos);
}
return {};
@@ -961,11 +948,6 @@ table::seal_active_memtable(flush_permit&& permit) {
}
_memtables->add_memtable();
_stats.memtable_switch_count++;
// This will set evictable occupancy of the old memtable region to zero, so that
// this region is considered last for flushing by dirty_memory_manager::flush_when_needed().
// If we don't do that, the flusher may keep picking up this memtable list for flushing after
// the permit is released even though there is not much to flush in the active memtable of this list.
old->region().ground_evictable_occupancy();
auto previous_flush = _flush_barrier.advance_and_await();
auto op = _flush_barrier.start();
@@ -1334,7 +1316,6 @@ table::on_compaction_completion(const std::vector<sstables::shared_sstable>& new
// This is done in the background, so we can consider this compaction completed.
seastar::with_gate(_sstable_deletion_gate, [this, sstables_to_remove] {
return with_semaphore(_sstable_deletion_sem, 1, [this, sstables_to_remove = std::move(sstables_to_remove)] {
return sstables::delete_atomically(sstables_to_remove, *get_large_partition_handler()).then_wrapped([this, sstables_to_remove] (future<> f) {
std::exception_ptr eptr;
try {
@@ -1358,7 +1339,6 @@ table::on_compaction_completion(const std::vector<sstables::shared_sstable>& new
return make_exception_future<>(eptr);
}
return make_ready_future<>();
});
}).then([this] {
// refresh underlying data source in row cache to prevent it from holding reference
// to sstables files which were previously deleted.
@@ -1480,10 +1460,7 @@ future<> table::cleanup_sstables(sstables::compaction_descriptor descriptor) {
static thread_local semaphore sem(1);
return with_semaphore(sem, 1, [this, &sst] {
// release reference to sstables cleaned up, otherwise space usage from their data and index
// components cannot be reclaimed until all of them are cleaned.
auto sstable_level = sst->get_sstable_level();
return this->compact_sstables(sstables::compaction_descriptor({ std::move(sst) }, sstable_level), true);
return this->compact_sstables(sstables::compaction_descriptor({ sst }, sst->get_sstable_level()), true);
});
});
});
@@ -1576,7 +1553,7 @@ future<std::unordered_set<sstring>> table::get_sstables_by_partition_key(const s
[this] (std::unordered_set<sstring>& filenames, lw_shared_ptr<sstables::sstable_set::incremental_selector>& sel, partition_key& pk) {
return do_with(dht::decorated_key(dht::global_partitioner().decorate_key(*_schema, pk)),
[this, &filenames, &sel, &pk](dht::decorated_key& dk) mutable {
auto sst = sel->select(dk.token()).sstables;
auto sst = sel->select(dk).sstables;
auto hk = sstables::sstable::make_hashed_key(*_schema, dk.key());
return do_for_each(sst, [this, &filenames, &dk, hk = std::move(hk)] (std::vector<sstables::shared_sstable>::const_iterator::reference s) mutable {
@@ -1665,9 +1642,9 @@ future<> distributed_loader::open_sstable(distributed<database>& db, sstables::e
// to distribute evenly the resource usage among all shards.
return db.invoke_on(column_family::calculate_shard_from_sstable_generation(comps.generation),
[&db, comps = std::move(comps), func = std::move(func), &pc] (database& local) {
[&db, comps = std::move(comps), func = std::move(func), pc] (database& local) {
return with_semaphore(local.sstable_load_concurrency_sem(), 1, [&db, &local, comps = std::move(comps), func = std::move(func), &pc] {
return with_semaphore(local.sstable_load_concurrency_sem(), 1, [&db, &local, comps = std::move(comps), func = std::move(func), pc] {
auto& cf = local.find_column_family(comps.ks, comps.cf);
auto f = sstables::sstable::load_shared_components(cf.schema(), cf._config.datadir, comps.generation, comps.version, comps.format, pc);
@@ -2168,6 +2145,8 @@ database::database(const db::config& cfg, database_config dbcfg)
_compaction_manager->start();
setup_metrics();
_row_cache_tracker.set_compaction_scheduling_group(dbcfg.memory_compaction_scheduling_group);
dblog.info("Row: max_vector_size: {}, internal_count: {}", size_t(row::max_vector_size), size_t(row::internal_count));
}
@@ -2855,6 +2834,7 @@ keyspace::make_column_family_config(const schema& s, const db::config& db_config
cfg.cf_stats = _config.cf_stats;
cfg.enable_incremental_backups = _config.enable_incremental_backups;
cfg.compaction_scheduling_group = _config.compaction_scheduling_group;
cfg.memory_compaction_scheduling_group = _config.memory_compaction_scheduling_group;
cfg.memtable_scheduling_group = _config.memtable_scheduling_group;
cfg.memtable_to_cache_scheduling_group = _config.memtable_to_cache_scheduling_group;
cfg.streaming_scheduling_group = _config.streaming_scheduling_group;
@@ -3406,7 +3386,7 @@ future<> memtable_list::request_flush() {
}
lw_shared_ptr<memtable> memtable_list::new_memtable() {
return make_lw_shared<memtable>(_current_schema(), *_dirty_memory_manager, this);
return make_lw_shared<memtable>(_current_schema(), *_dirty_memory_manager, this, _compaction_scheduling_group);
}
future<flush_permit> flush_permit::reacquire_sstable_write_permit() && {
@@ -3447,13 +3427,6 @@ future<> dirty_memory_manager::flush_when_needed() {
// release the biggest amount of memory and is less likely to be generating tiny
// SSTables.
memtable& candidate_memtable = memtable::from_region(*(this->_virtual_region_group.get_largest_region()));
if (candidate_memtable.empty()) {
// Soft pressure, but nothing to flush. It could be due to fsync or memtable_to_cache lagging.
// Back off to avoid OOMing with flush continuations.
return sleep(1ms);
}
// Do not wait. The semaphore will protect us against a concurrent flush. But we
// want to start a new one as soon as the permits are destroyed and the semaphore is
// made ready again, not when we are done with the current one.
@@ -3642,6 +3615,7 @@ database::make_keyspace_config(const keyspace_metadata& ksm) {
cfg.enable_incremental_backups = _enable_incremental_backups;
cfg.compaction_scheduling_group = _dbcfg.compaction_scheduling_group;
cfg.memory_compaction_scheduling_group = _dbcfg.memory_compaction_scheduling_group;
cfg.memtable_scheduling_group = _dbcfg.memtable_scheduling_group;
cfg.memtable_to_cache_scheduling_group = _dbcfg.memtable_to_cache_scheduling_group;
cfg.streaming_scheduling_group = _dbcfg.streaming_scheduling_group;
@@ -4001,7 +3975,6 @@ seal_snapshot(sstring jsondir) {
future<> table::snapshot(sstring name) {
return flush().then([this, name = std::move(name)]() {
return with_semaphore(_sstable_deletion_sem, 1, [this, name = std::move(name)]() {
auto tables = boost::copy_range<std::vector<sstables::shared_sstable>>(*_sstables->all());
return do_with(std::move(tables), [this, name](std::vector<sstables::shared_sstable> & tables) {
auto jsondir = _config.datadir + "/snapshots/" + name;
@@ -4066,7 +4039,6 @@ future<> table::snapshot(sstring name) {
});
});
});
});
});
}
@@ -4198,7 +4170,6 @@ future<> table::fail_streaming_mutations(utils::UUID plan_id) {
_streaming_memtables_big.erase(it);
return entry->flush_in_progress.close().then([this, entry] {
for (auto&& sst : entry->sstables) {
sst.monitor->write_failed();
sst.sstable->mark_for_deletion();
}
});
@@ -4630,14 +4601,11 @@ flat_mutation_reader make_local_shard_sstable_reader(schema_ptr s,
}
return reader;
};
auto all_readers = boost::copy_range<std::vector<flat_mutation_reader>>(
*sstables->all()
| boost::adaptors::transformed([&] (sstables::shared_sstable sst) -> flat_mutation_reader {
return reader_factory_fn(sst, pr);
})
);
return make_combined_reader(s,
std::move(all_readers),
return make_combined_reader(s, std::make_unique<incremental_reader_selector>(s,
std::move(sstables),
pr,
std::move(trace_state),
std::move(reader_factory_fn)),
fwd,
fwd_mr);
}
@@ -4656,14 +4624,11 @@ flat_mutation_reader make_range_sstable_reader(schema_ptr s,
auto reader_factory_fn = [s, &slice, &pc, resource_tracker, fwd, fwd_mr, &monitor_generator] (sstables::shared_sstable& sst, const dht::partition_range& pr) {
return sst->read_range_rows_flat(s, pr, slice, pc, resource_tracker, fwd, fwd_mr, monitor_generator(sst));
};
auto sstable_readers = boost::copy_range<std::vector<flat_mutation_reader>>(
*sstables->all()
| boost::adaptors::transformed([&] (sstables::shared_sstable sst) {
return reader_factory_fn(sst, pr);
})
);
return make_combined_reader(s,
std::move(sstable_readers),
return make_combined_reader(s, std::make_unique<incremental_reader_selector>(s,
std::move(sstables),
pr,
std::move(trace_state),
std::move(reader_factory_fn)),
fwd,
fwd_mr);
}

@@ -164,29 +164,33 @@ private:
std::function<schema_ptr()> _current_schema;
dirty_memory_manager* _dirty_memory_manager;
std::experimental::optional<shared_promise<>> _flush_coalescing;
seastar::scheduling_group _compaction_scheduling_group;
public:
memtable_list(
seal_immediate_fn_type seal_immediate_fn,
seal_delayed_fn_type seal_delayed_fn,
std::function<schema_ptr()> cs,
dirty_memory_manager* dirty_memory_manager)
dirty_memory_manager* dirty_memory_manager,
seastar::scheduling_group compaction_scheduling_group = seastar::current_scheduling_group())
: _memtables({})
, _seal_immediate_fn(seal_immediate_fn)
, _seal_delayed_fn(seal_delayed_fn)
, _current_schema(cs)
, _dirty_memory_manager(dirty_memory_manager) {
, _dirty_memory_manager(dirty_memory_manager)
, _compaction_scheduling_group(compaction_scheduling_group) {
add_memtable();
}
memtable_list(
seal_immediate_fn_type seal_immediate_fn,
std::function<schema_ptr()> cs,
dirty_memory_manager* dirty_memory_manager)
: memtable_list(std::move(seal_immediate_fn), {}, std::move(cs), dirty_memory_manager) {
dirty_memory_manager* dirty_memory_manager,
seastar::scheduling_group compaction_scheduling_group = seastar::current_scheduling_group())
: memtable_list(std::move(seal_immediate_fn), {}, std::move(cs), dirty_memory_manager, compaction_scheduling_group) {
}
memtable_list(std::function<schema_ptr()> cs, dirty_memory_manager* dirty_memory_manager)
: memtable_list({}, {}, std::move(cs), dirty_memory_manager) {
memtable_list(std::function<schema_ptr()> cs, dirty_memory_manager* dirty_memory_manager, seastar::scheduling_group compaction_scheduling_group = seastar::current_scheduling_group())
: memtable_list({}, {}, std::move(cs), dirty_memory_manager, compaction_scheduling_group) {
}
bool may_flush() const {
@@ -294,8 +298,6 @@ public:
class table;
using column_family = table;
class database_sstable_write_monitor;
class table : public enable_lw_shared_from_this<table> {
public:
struct config {
@@ -314,6 +316,7 @@ public:
seastar::scheduling_group memtable_scheduling_group;
seastar::scheduling_group memtable_to_cache_scheduling_group;
seastar::scheduling_group compaction_scheduling_group;
seastar::scheduling_group memory_compaction_scheduling_group;
seastar::scheduling_group statement_scheduling_group;
seastar::scheduling_group streaming_scheduling_group;
bool enable_metrics_reporting = false;
@@ -391,7 +394,7 @@ private:
// plan memtables and the resulting sstables are not made visible until
// the streaming is complete.
struct monitored_sstable {
std::unique_ptr<database_sstable_write_monitor> monitor;
std::unique_ptr<sstables::write_monitor> monitor;
sstables::shared_sstable sstable;
};
@@ -430,10 +433,6 @@ private:
std::unordered_map<uint64_t, sstables::shared_sstable> _sstables_need_rewrite;
// Control background fibers waiting for sstables to be deleted
seastar::gate _sstable_deletion_gate;
// This semaphore ensures that an operation like snapshot won't have its selected
// sstables deleted by compaction in parallel, a race condition which could
// easily result in failure.
seastar::semaphore _sstable_deletion_sem = {1};
// There are situations in which we need to stop writing sstables. Flushers will take
// the read lock, and the ones that wish to stop that process will take the write lock.
rwlock _sstables_lock;
@@ -1045,6 +1044,7 @@ public:
seastar::scheduling_group memtable_scheduling_group;
seastar::scheduling_group memtable_to_cache_scheduling_group;
seastar::scheduling_group compaction_scheduling_group;
seastar::scheduling_group memory_compaction_scheduling_group;
seastar::scheduling_group statement_scheduling_group;
seastar::scheduling_group streaming_scheduling_group;
bool enable_metrics_reporting = false;
@@ -1125,6 +1125,7 @@ struct database_config {
seastar::scheduling_group memtable_scheduling_group;
seastar::scheduling_group memtable_to_cache_scheduling_group; // FIXME: merge with memtable_scheduling_group
seastar::scheduling_group compaction_scheduling_group;
seastar::scheduling_group memory_compaction_scheduling_group;
seastar::scheduling_group statement_scheduling_group;
seastar::scheduling_group streaming_scheduling_group;
size_t available_memory;

@@ -163,7 +163,7 @@ future<> db::commitlog_replayer::impl::init() {
// Get all truncation records for the CF and initialize max rps if
// present. Cannot do this on demand, as there may be no sstables to
// mark the CF as "needed".
return db::system_keyspace::get_truncated_position(uuid).then([&map, uuid](std::vector<db::replay_position> tpps) {
return db::system_keyspace::get_truncated_position(uuid).then([&map, &uuid](std::vector<db::replay_position> tpps) {
for (auto& p : tpps) {
rlogger.trace("CF {} truncated at {}", uuid, p);
auto& pp = map[p.shard_id()][uuid];

@@ -686,7 +686,33 @@ read_keyspace_mutation(distributed<service::storage_proxy>& proxy, const sstring
static semaphore the_merge_lock {1};
future<> merge_lock() {
return smp::submit_to(0, [] { return the_merge_lock.wait(); });
// ref: #1088
// to avoid deadlocks, we don't want long-standing calls to the shard 0
// as they can cause a deadlock:
//
// fiber1 fiber2
// merge_lock() (succeeds)
// merge_lock() (waits)
// invoke_on_all() (waits on merge_lock to relinquish smp::submit_to slot)
//
// so we issue the lock calls with a timeout; the slot will be relinquished, and invoke_on_all()
// can complete
return repeat([] () mutable {
return smp::submit_to(0, [] {
return the_merge_lock.try_wait();
}).then([] (bool result) {
if (result) {
return make_ready_future<stop_iteration>(stop_iteration::yes);
} else {
static thread_local auto rand_engine = std::default_random_engine();
auto dist = std::uniform_int_distribution<int>(0, 100);
auto to = std::chrono::microseconds(dist(rand_engine));
return sleep(to).then([] {
return make_ready_future<stop_iteration>(stop_iteration::no);
});
}
});
});
}
future<> merge_unlock() {

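The merge_lock() retry loop above avoids the deadlock by never parking inside smp::submit_to: it polls with try_wait() and backs off for a short random interval between attempts. A rough analogue using standard C++ primitives instead of Seastar's semaphore and sleep() (the function and parameter names here are illustrative):

```cpp
#include <cassert>
#include <chrono>
#include <mutex>
#include <random>
#include <thread>

// Poll the lock without blocking; on failure, yield for a short random
// interval so other work (the analogue of invoke_on_all() in the comment
// above) can make progress between attempts.
bool acquire_with_backoff(std::mutex& m, int max_attempts) {
    static thread_local std::default_random_engine rand_engine{std::random_device{}()};
    std::uniform_int_distribution<int> dist(0, 100);
    for (int i = 0; i < max_attempts; ++i) {
        if (m.try_lock()) {
            return true; // corresponds to stop_iteration::yes
        }
        // corresponds to the sleep(to) before stop_iteration::no
        std::this_thread::sleep_for(std::chrono::microseconds(dist(rand_engine)));
    }
    return false;
}
```

The randomized interval plays the same role as in the patch: it desynchronizes contending fibers so they do not retry in lockstep.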
@@ -1,329 +0,0 @@
/*
* Copyright (C) 2019 ScyllaDB
*
* Modified by ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include <boost/range/adaptor/indirected.hpp>
#include <boost/range/adaptor/map.hpp>
#include <boost/range/adaptor/transformed.hpp>
#include <boost/range/algorithm/find_if.hpp>
#include "clustering_bounds_comparator.hh"
#include "database.hh"
#include "db/system_keyspace.hh"
#include "dht/i_partitioner.hh"
#include "partition_range_compat.hh"
#include "range.hh"
#include "service/storage_service.hh"
#include "stdx.hh"
#include "mutation_fragment.hh"
#include "sstables/sstables.hh"
#include "db/timeout_clock.hh"
#include "database.hh"
#include "db/size_estimates_virtual_reader.hh"
namespace db {
namespace size_estimates {
struct virtual_row {
const bytes& cf_name;
const token_range& tokens;
clustering_key_prefix as_key() const {
return clustering_key_prefix::from_exploded(std::vector<bytes_view>{cf_name, tokens.start, tokens.end});
}
};
struct virtual_row_comparator {
schema_ptr _schema;
virtual_row_comparator(schema_ptr schema) : _schema(schema) { }
bool operator()(const clustering_key_prefix& key1, const clustering_key_prefix& key2) {
return clustering_key_prefix::prefix_equality_less_compare(*_schema)(key1, key2);
}
bool operator()(const virtual_row& row, const clustering_key_prefix& key) {
return operator()(row.as_key(), key);
}
bool operator()(const clustering_key_prefix& key, const virtual_row& row) {
return operator()(key, row.as_key());
}
};
// Iterating over the cartesian product of cf_names and token_ranges.
class virtual_row_iterator : public std::iterator<std::input_iterator_tag, const virtual_row> {
std::reference_wrapper<const std::vector<bytes>> _cf_names;
std::reference_wrapper<const std::vector<token_range>> _ranges;
size_t _cf_names_idx = 0;
size_t _ranges_idx = 0;
public:
struct end_iterator_tag {};
virtual_row_iterator(const std::vector<bytes>& cf_names, const std::vector<token_range>& ranges)
: _cf_names(std::ref(cf_names))
, _ranges(std::ref(ranges))
{ }
virtual_row_iterator(const std::vector<bytes>& cf_names, const std::vector<token_range>& ranges, end_iterator_tag)
: _cf_names(std::ref(cf_names))
, _ranges(std::ref(ranges))
, _cf_names_idx(cf_names.size())
, _ranges_idx(ranges.size())
{
if (cf_names.empty() || ranges.empty()) {
// The product of an empty range with any range is an empty range.
// In this case we want the end iterator to be equal to the begin iterator,
// which has _ranges_idx = _cf_names_idx = 0.
_ranges_idx = _cf_names_idx = 0;
}
}
virtual_row_iterator& operator++() {
if (++_ranges_idx == _ranges.get().size() && ++_cf_names_idx < _cf_names.get().size()) {
_ranges_idx = 0;
}
return *this;
}
virtual_row_iterator operator++(int) {
virtual_row_iterator i(*this);
++(*this);
return i;
}
const value_type operator*() const {
return { _cf_names.get()[_cf_names_idx], _ranges.get()[_ranges_idx] };
}
bool operator==(const virtual_row_iterator& i) const {
return _cf_names_idx == i._cf_names_idx
&& _ranges_idx == i._ranges_idx;
}
bool operator!=(const virtual_row_iterator& i) const {
return !(*this == i);
}
};
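The traversal order of this iterator can be modeled eagerly: it walks the cartesian product of (cf_name, range) with the range index varying fastest, using the same wrap-around step as operator++ above. A minimal sketch (the function name is illustrative; the real iterator yields virtual_row values lazily):

```cpp
#include <cassert>
#include <string>
#include <utility>
#include <vector>

// Eager model of virtual_row_iterator's order: for each cf_name, emit all
// ranges, then wrap the range index and advance to the next cf_name.
std::vector<std::pair<std::string, int>>
cartesian_walk(const std::vector<std::string>& cf_names, const std::vector<int>& ranges) {
    std::vector<std::pair<std::string, int>> out;
    if (cf_names.empty() || ranges.empty()) {
        return out; // empty product: begin == end, as in the end_iterator_tag ctor
    }
    size_t cf_idx = 0, r_idx = 0;
    while (cf_idx < cf_names.size()) {
        out.emplace_back(cf_names[cf_idx], ranges[r_idx]);
        if (++r_idx == ranges.size()) { // same wrap-around as operator++
            r_idx = 0;
            ++cf_idx;
        }
    }
    return out;
}
```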
/**
* Returns the keyspaces, ordered by name, as selected by the partition_range.
*/
static std::vector<sstring> get_keyspaces(const schema& s, const database& db, dht::partition_range range) {
struct keyspace_less_comparator {
const schema& _s;
keyspace_less_comparator(const schema& s) : _s(s) { }
dht::ring_position as_ring_position(const sstring& ks) {
auto pkey = partition_key::from_single_value(_s, utf8_type->decompose(ks));
return dht::global_partitioner().decorate_key(_s, std::move(pkey));
}
bool operator()(const sstring& ks1, const sstring& ks2) {
return as_ring_position(ks1).less_compare(_s, as_ring_position(ks2));
}
bool operator()(const sstring& ks, const dht::ring_position& rp) {
return as_ring_position(ks).less_compare(_s, rp);
}
bool operator()(const dht::ring_position& rp, const sstring& ks) {
return rp.less_compare(_s, as_ring_position(ks));
}
};
auto keyspaces = db.get_non_system_keyspaces();
auto cmp = keyspace_less_comparator(s);
boost::sort(keyspaces, cmp);
return boost::copy_range<std::vector<sstring>>(
range.slice(keyspaces, std::move(cmp)) | boost::adaptors::filtered([&s] (const auto& ks) {
// If this is a range query, results are divided between shards by the partition key (keyspace_name).
return shard_of(dht::global_partitioner().get_token(s,
partition_key::from_single_value(s, utf8_type->decompose(ks))))
== engine().cpu_id();
})
);
}
/**
* Makes a wrapping range of ring_position from a nonwrapping range of token, used to select sstables.
*/
static dht::partition_range as_ring_position_range(dht::token_range& r) {
stdx::optional<range<dht::ring_position>::bound> start_bound, end_bound;
if (r.start()) {
start_bound = {{ dht::ring_position(r.start()->value(), dht::ring_position::token_bound::start), r.start()->is_inclusive() }};
}
if (r.end()) {
end_bound = {{ dht::ring_position(r.end()->value(), dht::ring_position::token_bound::end), r.end()->is_inclusive() }};
}
return dht::partition_range(std::move(start_bound), std::move(end_bound), r.is_singular());
}
/**
* Makes a new range_estimates for the specified range, considering the sstables associated with `cf`.
*/
static system_keyspace::range_estimates estimate(const column_family& cf, const token_range& r) {
int64_t count{0};
utils::estimated_histogram hist{0};
auto from_bytes = [] (auto& b) {
return dht::global_partitioner().from_sstring(utf8_type->to_string(b));
};
dht::token_range_vector ranges;
compat::unwrap_into(
wrapping_range<dht::token>({{ from_bytes(r.start) }}, {{ from_bytes(r.end) }}),
dht::token_comparator(),
[&] (auto&& rng) { ranges.push_back(std::move(rng)); });
for (auto&& r : ranges) {
auto rp_range = as_ring_position_range(r);
for (auto&& sstable : cf.select_sstables(rp_range)) {
count += sstable->estimated_keys_for_range(r);
hist.merge(sstable->get_stats_metadata().estimated_row_size);
}
}
return {cf.schema(), r.start, r.end, count, count > 0 ? hist.mean() : 0};
}
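The aggregation in `estimate()` can be illustrated with a toy version: estimated key counts simply add across the sstables selected for the range, while the mean row size comes from merging per-sstable histograms. This sketch replaces `utils::estimated_histogram` (which uses exponential buckets) with a plain sum/count pair; all `toy_*` names are invented for the example:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Toy stand-in for utils::estimated_histogram: just a total and a count.
struct toy_histogram {
    int64_t sum = 0;
    int64_t count = 0;
    void add(int64_t value) { sum += value; ++count; }
    void merge(const toy_histogram& other) { sum += other.sum; count += other.count; }
    int64_t mean() const { return count ? sum / count : 0; }
};

struct toy_sstable {
    int64_t keys_in_range;   // analogue of estimated_keys_for_range()
    toy_histogram row_sizes; // analogue of stats_metadata().estimated_row_size
};

struct toy_estimate {
    int64_t partitions_count;
    int64_t mean_partition_size;
};

// Mirrors the loop in estimate(): counts add up, histograms merge,
// and the mean is only meaningful when something was counted.
static toy_estimate estimate(const std::vector<toy_sstable>& sstables) {
    int64_t count = 0;
    toy_histogram hist;
    for (const auto& sst : sstables) {
        count += sst.keys_in_range;
        hist.merge(sst.row_sizes);
    }
    return {count, count > 0 ? hist.mean() : 0};
}
```

The `count > 0` guard matches the real code's behavior of reporting a mean of 0 when no sstables overlap the range.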
future<std::vector<token_range>> get_local_ranges() {
auto& ss = service::get_local_storage_service();
return ss.get_local_tokens().then([&ss] (auto&& tokens) {
auto ranges = ss.get_token_metadata().get_primary_ranges_for(std::move(tokens));
std::vector<token_range> local_ranges;
auto to_bytes = [](const stdx::optional<dht::token_range::bound>& b) {
assert(b);
return utf8_type->decompose(dht::global_partitioner().to_sstring(b->value()));
};
// We merge the ranges to be compatible with how Cassandra shows its size estimates table.
// All queries will be on that table, where all entries are text and there's no notion of
// token ranges from the CQL point of view.
auto left_inf = boost::find_if(ranges, [] (auto&& r) {
return !r.start() || r.start()->value() == dht::minimum_token();
});
auto right_inf = boost::find_if(ranges, [] (auto&& r) {
return !r.end() || r.end()->value() == dht::maximum_token();
});
if (left_inf != right_inf && left_inf != ranges.end() && right_inf != ranges.end()) {
local_ranges.push_back(token_range{to_bytes(right_inf->start()), to_bytes(left_inf->end())});
ranges.erase(left_inf);
ranges.erase(right_inf);
}
for (auto&& r : ranges) {
local_ranges.push_back(token_range{to_bytes(r.start()), to_bytes(r.end())});
}
boost::sort(local_ranges, [] (auto&& tr1, auto&& tr2) {
return utf8_type->less(tr1.start, tr2.start);
});
return local_ranges;
});
}
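The merge step in `get_local_ranges()` joins the range that ends at +infinity with the range that starts at -infinity into a single wrap-around entry, so the flattened text ranges match Cassandra's presentation. A self-contained sketch of just that step, modelling a token as `int64_t` and an infinite bound as `std::nullopt` (the `toy_range` name is invented):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <optional>
#include <vector>

// A token range with optional bounds; std::nullopt means +/- infinity.
struct toy_range {
    std::optional<int64_t> start; // nullopt => -inf
    std::optional<int64_t> end;   // nullopt => +inf
};

// Merge the left-infinite and right-infinite ranges into a single
// wrap-around range (start of the right-infinite one .. end of the
// left-infinite one), as the reader does before flattening to text.
static std::vector<toy_range> merge_wraparound(std::vector<toy_range> ranges) {
    auto left_inf = std::find_if(ranges.begin(), ranges.end(),
            [] (const toy_range& r) { return !r.start; });
    auto right_inf = std::find_if(ranges.begin(), ranges.end(),
            [] (const toy_range& r) { return !r.end; });
    if (left_inf != right_inf && left_inf != ranges.end() && right_inf != ranges.end()) {
        toy_range merged{right_inf->start, left_inf->end};
        // Erase the later element first so the earlier iterator stays valid.
        if (left_inf < right_inf) {
            std::swap(left_inf, right_inf);
        }
        ranges.erase(left_inf);
        ranges.erase(right_inf);
        ranges.push_back(merged);
    }
    return ranges;
}
```

If no range touches either infinity (or one range covers both), the input is returned unchanged, matching the guarded branch in the real code.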
size_estimates_mutation_reader::size_estimates_mutation_reader(schema_ptr schema, const dht::partition_range& prange, const query::partition_slice& slice, streamed_mutation::forwarding fwd)
: impl(schema)
, _schema(std::move(schema))
, _prange(&prange)
, _slice(slice)
, _fwd(fwd)
{ }
future<> size_estimates_mutation_reader::get_next_partition() {
auto& db = service::get_local_storage_proxy().get_db().local();
if (!_keyspaces) {
_keyspaces = get_keyspaces(*_schema, db, *_prange);
_current_partition = _keyspaces->begin();
}
if (_current_partition == _keyspaces->end()) {
_end_of_stream = true;
return make_ready_future<>();
}
return get_local_ranges().then([&db, this] (auto&& ranges) {
auto estimates = this->estimates_for_current_keyspace(db, std::move(ranges));
auto mutations = db::system_keyspace::make_size_estimates_mutation(*_current_partition, std::move(estimates));
++_current_partition;
std::vector<mutation> ms;
ms.emplace_back(std::move(mutations));
_partition_reader = flat_mutation_reader_from_mutations(std::move(ms), _fwd);
});
}
future<> size_estimates_mutation_reader::fill_buffer(db::timeout_clock::time_point timeout) {
return do_until([this, timeout] { return is_end_of_stream() || is_buffer_full(); }, [this, timeout] {
if (!_partition_reader) {
return get_next_partition();
}
return _partition_reader->consume_pausable([this] (mutation_fragment mf) {
push_mutation_fragment(std::move(mf));
return stop_iteration(is_buffer_full());
}, timeout).then([this] {
if (_partition_reader->is_end_of_stream() && _partition_reader->is_buffer_empty()) {
_partition_reader = stdx::nullopt;
}
});
});
}
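`fill_buffer()` follows a common flat-reader pattern: when there is no per-partition sub-reader, fetch the next partition; otherwise drain the sub-reader's fragments into the outer buffer, stopping when the buffer fills, and drop the sub-reader once it is exhausted. Stripped of seastar futures and timeouts, the control flow can be sketched synchronously (all `toy_*` names are invented; fragments are plain `int`s):

```cpp
#include <cassert>
#include <cstddef>
#include <deque>
#include <optional>
#include <vector>

// Synchronous sketch of the fill_buffer() control flow (no futures).
struct toy_reader {
    std::vector<std::vector<int>> partitions;        // each inner vector is one partition's fragments
    std::size_t next_partition_idx = 0;
    std::optional<std::deque<int>> partition_reader; // current partition's pending fragments
    std::deque<int> buffer;
    std::size_t max_buffer = 4;
    bool end_of_stream = false;

    bool buffer_full() const { return buffer.size() >= max_buffer; }

    void get_next_partition() {
        if (next_partition_idx == partitions.size()) {
            end_of_stream = true; // mirrors setting _end_of_stream
            return;
        }
        const auto& p = partitions[next_partition_idx++];
        partition_reader.emplace(p.begin(), p.end());
    }

    void fill_buffer() {
        while (!end_of_stream && !buffer_full()) {
            if (!partition_reader) {
                get_next_partition();
                continue;
            }
            // Drain fragments until the buffer fills, like consume_pausable().
            while (!partition_reader->empty() && !buffer_full()) {
                buffer.push_back(partition_reader->front());
                partition_reader->pop_front();
            }
            if (partition_reader->empty()) {
                partition_reader.reset(); // mirrors dropping _partition_reader when drained
            }
        }
    }
};
```

A partially drained sub-reader survives across calls, so a second `fill_buffer()` after the consumer empties the buffer picks up exactly where the first stopped.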
void size_estimates_mutation_reader::next_partition() {
clear_buffer_to_next_partition();
if (is_buffer_empty()) {
_partition_reader = stdx::nullopt;
}
}
future<> size_estimates_mutation_reader::fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) {
clear_buffer();
_prange = &pr;
_keyspaces = stdx::nullopt;
_partition_reader = stdx::nullopt;
_end_of_stream = false;
return make_ready_future<>();
}
future<> size_estimates_mutation_reader::fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) {
forward_buffer_to(pr.start());
_end_of_stream = false;
if (_partition_reader) {
return _partition_reader->fast_forward_to(std::move(pr), timeout);
}
return make_ready_future<>();
}
size_t size_estimates_mutation_reader::buffer_size() const {
if (_partition_reader) {
return flat_mutation_reader::impl::buffer_size() + _partition_reader->buffer_size();
}
return flat_mutation_reader::impl::buffer_size();
}
std::vector<db::system_keyspace::range_estimates>
size_estimates_mutation_reader::estimates_for_current_keyspace(const database& db, std::vector<token_range> local_ranges) const {
// For each specified range, estimate (crudely) mean partition size and partitions count.
auto pkey = partition_key::from_single_value(*_schema, utf8_type->decompose(*_current_partition));
auto cfs = db.find_keyspace(*_current_partition).metadata()->cf_meta_data();
auto cf_names = boost::copy_range<std::vector<bytes>>(cfs | boost::adaptors::transformed([] (auto&& cf) {
return utf8_type->decompose(cf.first);
}));
boost::sort(cf_names, [] (auto&& n1, auto&& n2) {
return utf8_type->less(n1, n2);
});
std::vector<db::system_keyspace::range_estimates> estimates;
for (auto& range : _slice.row_ranges(*_schema, pkey)) {
auto rows = boost::make_iterator_range(
virtual_row_iterator(cf_names, local_ranges),
virtual_row_iterator(cf_names, local_ranges, virtual_row_iterator::end_iterator_tag()));
auto rows_to_estimate = range.slice(rows, virtual_row_comparator(_schema));
for (auto&& r : rows_to_estimate) {
auto& cf = db.find_column_family(*_current_partition, utf8_type->to_string(r.cf_name));
estimates.push_back(estimate(cf, r.tokens));
if (estimates.size() >= _slice.partition_row_limit()) {
return estimates;
}
}
}
return estimates;
}
} // namespace size_estimates
} // namespace db


@@ -21,19 +21,33 @@
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include <boost/range/adaptor/indirected.hpp>
#include <boost/range/adaptor/map.hpp>
#include <boost/range/adaptor/transformed.hpp>
#include <boost/range/algorithm/find_if.hpp>
#include "clustering_bounds_comparator.hh"
#include "database.hh"
#include "db/system_keyspace.hh"
#include "dht/i_partitioner.hh"
#include "mutation_reader.hh"
#include "partition_range_compat.hh"
#include "range.hh"
#include "service/storage_service.hh"
#include "stdx.hh"
#include "mutation_fragment.hh"
#include "sstables/sstables.hh"
#include "db/timeout_clock.hh"
namespace db {
namespace size_estimates {
struct token_range {
bytes start;
bytes end;
};
class size_estimates_mutation_reader final : public flat_mutation_reader::impl {
struct token_range {
bytes start;
bytes end;
};
schema_ptr _schema;
const dht::partition_range* _prange;
const query::partition_slice& _slice;
@@ -43,18 +57,267 @@ class size_estimates_mutation_reader final : public flat_mutation_reader::impl {
streamed_mutation::forwarding _fwd;
flat_mutation_reader_opt _partition_reader;
public:
size_estimates_mutation_reader(schema_ptr, const dht::partition_range&, const query::partition_slice&, streamed_mutation::forwarding);
size_estimates_mutation_reader(schema_ptr schema, const dht::partition_range& prange, const query::partition_slice& slice, streamed_mutation::forwarding fwd)
: impl(schema)
, _schema(std::move(schema))
, _prange(&prange)
, _slice(slice)
, _fwd(fwd)
{ }
virtual future<> fill_buffer(db::timeout_clock::time_point) override;
virtual void next_partition() override;
virtual future<> fast_forward_to(const dht::partition_range&, db::timeout_clock::time_point) override;
virtual future<> fast_forward_to(position_range, db::timeout_clock::time_point) override;
virtual size_t buffer_size() const override;
private:
future<> get_next_partition();
future<> get_next_partition() {
// For each specified range, estimate (crudely) mean partition size and partitions count.
auto& db = service::get_local_storage_proxy().get_db().local();
if (!_keyspaces) {
_keyspaces = get_keyspaces(*_schema, db, *_prange);
_current_partition = _keyspaces->begin();
}
if (_current_partition == _keyspaces->end()) {
_end_of_stream = true;
return make_ready_future<>();
}
return get_local_ranges().then([&db, this] (auto&& ranges) {
auto estimates = this->estimates_for_current_keyspace(db, std::move(ranges));
auto mutations = db::system_keyspace::make_size_estimates_mutation(*_current_partition, std::move(estimates));
++_current_partition;
std::vector<mutation> ms;
ms.emplace_back(std::move(mutations));
_partition_reader = flat_mutation_reader_from_mutations(std::move(ms), _fwd);
});
}
public:
virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override {
return do_until([this, timeout] { return is_end_of_stream() || is_buffer_full(); }, [this, timeout] {
if (!_partition_reader) {
return get_next_partition();
}
return _partition_reader->consume_pausable([this] (mutation_fragment mf) {
push_mutation_fragment(std::move(mf));
return stop_iteration(is_buffer_full());
}, timeout).then([this] {
if (_partition_reader->is_end_of_stream() && _partition_reader->is_buffer_empty()) {
_partition_reader = stdx::nullopt;
}
});
});
}
virtual void next_partition() override {
clear_buffer_to_next_partition();
if (is_buffer_empty()) {
_partition_reader = stdx::nullopt;
}
}
virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override {
clear_buffer();
_prange = &pr;
_keyspaces = stdx::nullopt;
_partition_reader = stdx::nullopt;
_end_of_stream = false;
return make_ready_future<>();
}
virtual future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) override {
forward_buffer_to(pr.start());
_end_of_stream = false;
if (_partition_reader) {
return _partition_reader->fast_forward_to(std::move(pr), timeout);
}
return make_ready_future<>();
}
virtual size_t buffer_size() const override {
if (_partition_reader) {
return flat_mutation_reader::impl::buffer_size() + _partition_reader->buffer_size();
}
return flat_mutation_reader::impl::buffer_size();
}
/**
* Returns the primary ranges for the local node.
* Used for testing as well.
*/
static future<std::vector<token_range>> get_local_ranges() {
auto& ss = service::get_local_storage_service();
return ss.get_local_tokens().then([&ss] (auto&& tokens) {
auto ranges = ss.get_token_metadata().get_primary_ranges_for(std::move(tokens));
std::vector<token_range> local_ranges;
auto to_bytes = [](const stdx::optional<dht::token_range::bound>& b) {
assert(b);
return utf8_type->decompose(dht::global_partitioner().to_sstring(b->value()));
};
// We merge the ranges to be compatible with how Cassandra shows its size estimates table.
// All queries will be on that table, where all entries are text and there's no notion of
// token ranges from the CQL point of view.
auto left_inf = boost::find_if(ranges, [] (auto&& r) {
return !r.start() || r.start()->value() == dht::minimum_token();
});
auto right_inf = boost::find_if(ranges, [] (auto&& r) {
return !r.end() || r.end()->value() == dht::maximum_token();
});
if (left_inf != right_inf && left_inf != ranges.end() && right_inf != ranges.end()) {
local_ranges.push_back(token_range{to_bytes(right_inf->start()), to_bytes(left_inf->end())});
ranges.erase(left_inf);
ranges.erase(right_inf);
}
for (auto&& r : ranges) {
local_ranges.push_back(token_range{to_bytes(r.start()), to_bytes(r.end())});
}
boost::sort(local_ranges, [] (auto&& tr1, auto&& tr2) {
return utf8_type->less(tr1.start, tr2.start);
});
return local_ranges;
});
}
private:
struct virtual_row {
const bytes& cf_name;
const token_range& tokens;
clustering_key_prefix as_key() const {
return clustering_key_prefix::from_exploded(std::vector<bytes_view>{cf_name, tokens.start, tokens.end});
}
};
struct virtual_row_comparator {
schema_ptr _schema;
virtual_row_comparator(schema_ptr schema) : _schema(schema) { }
bool operator()(const clustering_key_prefix& key1, const clustering_key_prefix& key2) {
return clustering_key_prefix::prefix_equality_less_compare(*_schema)(key1, key2);
}
bool operator()(const virtual_row& row, const clustering_key_prefix& key) {
return operator()(row.as_key(), key);
}
bool operator()(const clustering_key_prefix& key, const virtual_row& row) {
return operator()(key, row.as_key());
}
};
class virtual_row_iterator : public std::iterator<std::input_iterator_tag, const virtual_row> {
std::reference_wrapper<const std::vector<bytes>> _cf_names;
std::reference_wrapper<const std::vector<token_range>> _ranges;
size_t _cf_names_idx = 0;
size_t _ranges_idx = 0;
public:
struct end_iterator_tag {};
virtual_row_iterator(const std::vector<bytes>& cf_names, const std::vector<token_range>& ranges)
: _cf_names(std::ref(cf_names))
, _ranges(std::ref(ranges))
{ }
virtual_row_iterator(const std::vector<bytes>& cf_names, const std::vector<token_range>& ranges, end_iterator_tag)
: _cf_names(std::ref(cf_names))
, _ranges(std::ref(ranges))
, _cf_names_idx(cf_names.size())
, _ranges_idx(ranges.size())
{ }
virtual_row_iterator& operator++() {
if (++_ranges_idx == _ranges.get().size() && ++_cf_names_idx < _cf_names.get().size()) {
_ranges_idx = 0;
}
return *this;
}
virtual_row_iterator operator++(int) {
virtual_row_iterator i(*this);
++(*this);
return i;
}
const value_type operator*() const {
return { _cf_names.get()[_cf_names_idx], _ranges.get()[_ranges_idx] };
}
bool operator==(const virtual_row_iterator& i) const {
return _cf_names_idx == i._cf_names_idx
&& _ranges_idx == i._ranges_idx;
}
bool operator!=(const virtual_row_iterator& i) const {
return !(*this == i);
}
};
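`virtual_row_iterator` walks the cartesian product of table names and local token ranges in row order: the range index advances fastest and wraps to zero as the iterator moves to the next table. The same traversal with plain indices (the `virtual_rows` helper is invented for this example):

```cpp
#include <cassert>
#include <cstddef>
#include <string>
#include <utility>
#include <vector>

// Enumerate (table, range) pairs in the order virtual_row_iterator yields them:
// all ranges of the first table, then all ranges of the second, and so on.
static std::vector<std::pair<std::string, int>>
virtual_rows(const std::vector<std::string>& tables, const std::vector<int>& ranges) {
    std::vector<std::pair<std::string, int>> rows;
    std::size_t t = 0, r = 0;
    while (t < tables.size() && r < ranges.size()) {
        rows.emplace_back(tables[t], ranges[r]);
        // Mirrors operator++: bump the range index; on wrap, move to the
        // next table and, if one exists, restart the range index at zero.
        if (++r == ranges.size() && ++t < tables.size()) {
            r = 0;
        }
    }
    return rows;
}
```

When either list is empty the loop body never runs, which matches the end-iterator constructed with `end_iterator_tag` (both indices at their sizes).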
std::vector<db::system_keyspace::range_estimates>
estimates_for_current_keyspace(const database&, std::vector<token_range> local_ranges) const;
estimates_for_current_keyspace(const database& db, std::vector<token_range> local_ranges) const {
auto pkey = partition_key::from_single_value(*_schema, utf8_type->decompose(*_current_partition));
auto cfs = db.find_keyspace(*_current_partition).metadata()->cf_meta_data();
auto cf_names = boost::copy_range<std::vector<bytes>>(cfs | boost::adaptors::transformed([] (auto&& cf) {
return utf8_type->decompose(cf.first);
}));
boost::sort(cf_names, [] (auto&& n1, auto&& n2) {
return utf8_type->less(n1, n2);
});
std::vector<db::system_keyspace::range_estimates> estimates;
for (auto& range : _slice.row_ranges(*_schema, pkey)) {
auto rows = boost::make_iterator_range(
virtual_row_iterator(cf_names, local_ranges),
virtual_row_iterator(cf_names, local_ranges, virtual_row_iterator::end_iterator_tag()));
auto rows_to_estimate = range.slice(rows, virtual_row_comparator(_schema));
for (auto&& r : rows_to_estimate) {
auto& cf = db.find_column_family(*_current_partition, utf8_type->to_string(r.cf_name));
estimates.push_back(estimate(cf, r.tokens));
if (estimates.size() >= _slice.partition_row_limit()) {
return estimates;
}
}
}
return estimates;
}
/**
* Returns the keyspaces, ordered by name, as selected by the partition_range.
*/
static ks_range get_keyspaces(const schema& s, const database& db, dht::partition_range range) {
struct keyspace_less_comparator {
const schema& _s;
keyspace_less_comparator(const schema& s) : _s(s) { }
dht::ring_position as_ring_position(const sstring& ks) {
auto pkey = partition_key::from_single_value(_s, utf8_type->decompose(ks));
return dht::global_partitioner().decorate_key(_s, std::move(pkey));
}
bool operator()(const sstring& ks1, const sstring& ks2) {
return as_ring_position(ks1).less_compare(_s, as_ring_position(ks2));
}
bool operator()(const sstring& ks, const dht::ring_position& rp) {
return as_ring_position(ks).less_compare(_s, rp);
}
bool operator()(const dht::ring_position& rp, const sstring& ks) {
return rp.less_compare(_s, as_ring_position(ks));
}
};
auto keyspaces = db.get_non_system_keyspaces();
auto cmp = keyspace_less_comparator(s);
boost::sort(keyspaces, cmp);
return boost::copy_range<ks_range>(range.slice(keyspaces, std::move(cmp)));
}
/**
* Makes a wrapping range of ring_position from a nonwrapping range of token, used to select sstables.
*/
static dht::partition_range as_ring_position_range(dht::token_range& r) {
stdx::optional<range<dht::ring_position>::bound> start_bound, end_bound;
if (r.start()) {
start_bound = {{ dht::ring_position(r.start()->value(), dht::ring_position::token_bound::start), r.start()->is_inclusive() }};
}
if (r.end()) {
end_bound = {{ dht::ring_position(r.end()->value(), dht::ring_position::token_bound::end), r.end()->is_inclusive() }};
}
return dht::partition_range(std::move(start_bound), std::move(end_bound), r.is_singular());
}
/**
* Makes a new range_estimates for the specified range, considering the sstables associated with `cf`.
*/
static system_keyspace::range_estimates estimate(const column_family& cf, const token_range& r) {
int64_t count{0};
utils::estimated_histogram hist{0};
auto from_bytes = [] (auto& b) {
return dht::global_partitioner().from_sstring(utf8_type->to_string(b));
};
dht::token_range_vector ranges;
compat::unwrap_into(
wrapping_range<dht::token>({{ from_bytes(r.start) }}, {{ from_bytes(r.end) }}),
dht::token_comparator(),
[&] (auto&& rng) { ranges.push_back(std::move(rng)); });
for (auto&& r : ranges) {
auto rp_range = as_ring_position_range(r);
for (auto&& sstable : cf.select_sstables(rp_range)) {
count += sstable->estimated_keys_for_range(r);
hist.merge(sstable->get_stats_metadata().estimated_row_size);
}
}
return {cf.schema(), r.start, r.end, count, count > 0 ? hist.mean() : 0};
}
};
struct virtual_reader {
@@ -69,12 +332,6 @@ struct virtual_reader {
}
};
/**
* Returns the primary ranges for the local node.
* Used for testing as well.
*/
future<std::vector<token_range>> get_local_ranges();
} // namespace size_estimates
} // namespace db


@@ -26,7 +26,6 @@
#include "db/consistency_level_type.hh"
#include "db/system_keyspace.hh"
#include "schema_builder.hh"
#include "timeout_config.hh"
#include "types.hh"
#include <seastar/core/reactor.hh>
@@ -98,17 +97,11 @@ future<> system_distributed_keyspace::stop() {
return make_ready_future<>();
}
static const timeout_config internal_distributed_timeout_config = [] {
using namespace std::chrono_literals;
const auto t = 10s;
return timeout_config{ t, t, t, t, t, t, t };
}();
future<std::unordered_map<utils::UUID, sstring>> system_distributed_keyspace::view_status(sstring ks_name, sstring view_name) const {
return _qp.process(
sprint("SELECT host_id, status FROM %s.%s WHERE keyspace_name = ? AND view_name = ?", NAME, VIEW_BUILD_STATUS),
db::consistency_level::ONE,
internal_distributed_timeout_config,
infinite_timeout_config,
{ std::move(ks_name), std::move(view_name) },
false).then([this] (::shared_ptr<cql3::untyped_result_set> cql_result) {
return boost::copy_range<std::unordered_map<utils::UUID, sstring>>(*cql_result
@@ -125,7 +118,7 @@ future<> system_distributed_keyspace::start_view_build(sstring ks_name, sstring
return _qp.process(
sprint("INSERT INTO %s.%s (keyspace_name, view_name, host_id, status) VALUES (?, ?, ?, ?)", NAME, VIEW_BUILD_STATUS),
db::consistency_level::ONE,
internal_distributed_timeout_config,
infinite_timeout_config,
{ std::move(ks_name), std::move(view_name), std::move(host_id), "STARTED" },
false).discard_result();
});
@@ -136,7 +129,7 @@ future<> system_distributed_keyspace::finish_view_build(sstring ks_name, sstring
return _qp.process(
sprint("UPDATE %s.%s SET status = ? WHERE keyspace_name = ? AND view_name = ? AND host_id = ?", NAME, VIEW_BUILD_STATUS),
db::consistency_level::ONE,
internal_distributed_timeout_config,
infinite_timeout_config,
{ "SUCCESS", std::move(ks_name), std::move(view_name), std::move(host_id) },
false).discard_result();
});
@@ -146,7 +139,7 @@ future<> system_distributed_keyspace::remove_view(sstring ks_name, sstring view_
return _qp.process(
sprint("DELETE FROM %s.%s WHERE keyspace_name = ? AND view_name = ?", NAME, VIEW_BUILD_STATUS),
db::consistency_level::ONE,
internal_distributed_timeout_config,
infinite_timeout_config,
{ std::move(ks_name), std::move(view_name) },
false).discard_result();
}


@@ -1635,9 +1635,6 @@ void make(database& db, bool durable, bool volatile_testing_only) {
auto cfg = ks.make_column_family_config(*table, db.get_config(), db.get_large_partition_handler());
if (maybe_write_in_user_memory(table, db)) {
cfg.dirty_memory_manager = &db._dirty_memory_manager;
} else {
cfg.memtable_scheduling_group = default_scheduling_group();
cfg.memtable_to_cache_scheduling_group = default_scheduling_group();
}
db.add_column_family(ks, table, std::move(cfg));
maybe_add_virtual_reader(table, db);


@@ -461,7 +461,7 @@ bool ring_position::less_compare(const schema& s, const ring_position& other) co
return tri_compare(s, other) < 0;
}
int ring_position_comparator::operator()(ring_position_view lh, ring_position_view rh) const {
int ring_position_tri_compare(const schema& s, ring_position_view lh, ring_position_view rh) {
auto token_cmp = tri_compare(*lh._token, *rh._token);
if (token_cmp) {
return token_cmp;
@@ -482,6 +482,10 @@ int ring_position_comparator::operator()(ring_position_view lh, ring_position_vi
}
}
int ring_position_comparator::operator()(ring_position_view lh, ring_position_view rh) const {
return ring_position_tri_compare(s, lh, rh);
}
int ring_position_comparator::operator()(ring_position_view lh, sstables::decorated_key_view rh) const {
auto token_cmp = tri_compare(*lh._token, rh.token());
if (token_cmp) {


@@ -529,6 +529,7 @@ public:
// Such range includes all keys k such that v1 <= k < v2, with order defined by ring_position_comparator.
//
class ring_position_view {
friend int ring_position_tri_compare(const schema& s, ring_position_view lh, ring_position_view rh);
friend class ring_position_comparator;
// Order is lexicographical on (_token, _key) tuples, where _key part may be missing, and
@@ -543,6 +544,7 @@ class ring_position_view {
const partition_key* _key; // Can be nullptr
int8_t _weight;
public:
using token_bound = ring_position::token_bound;
struct after_key_tag {};
using after_key = bool_class<after_key_tag>;
@@ -578,6 +580,14 @@ public:
return ring_position_view(after_key_tag(), view);
}
static ring_position_view starting_at(const dht::token& t) {
return ring_position_view(t, token_bound::start);
}
static ring_position_view ending_at(const dht::token& t) {
return ring_position_view(t, token_bound::end);
}
ring_position_view(const dht::ring_position& pos, after_key after = after_key::no)
: _token(&pos.token())
, _key(pos.has_key() ? &*pos.key() : nullptr)
@@ -605,17 +615,25 @@ public:
, _weight(weight)
{ }
explicit ring_position_view(const dht::token& token, int8_t weight = -1)
explicit ring_position_view(const dht::token& token, token_bound bound = token_bound::start)
: _token(&token)
, _key(nullptr)
, _weight(weight)
, _weight(static_cast<std::underlying_type_t<token_bound>>(bound))
{ }
const dht::token& token() const { return *_token; }
const partition_key* key() const { return _key; }
// Only when key() == nullptr
token_bound get_token_bound() const { return token_bound(_weight); }
// Only when key() != nullptr
after_key is_after_key() const { return after_key(_weight == 1); }
friend std::ostream& operator<<(std::ostream&, ring_position_view);
};
int ring_position_tri_compare(const schema& s, ring_position_view lh, ring_position_view rh);
// Trichotomic comparator for ring order
struct ring_position_comparator {
const schema& s;


@@ -324,11 +324,11 @@ future<> range_streamer::do_stream_async() {
for (auto& range : ranges_to_stream) {
range_vec.push_back(range);
}
auto t = std::chrono::duration_cast<std::chrono::seconds>(lowres_clock::now() - start_time).count();
auto t = std::chrono::duration_cast<std::chrono::duration<float>>(lowres_clock::now() - start_time).count();
logger.warn("{} with {} for keyspace={} failed, took {} seconds: {}", description, source, keyspace, t, std::current_exception());
throw;
}
auto t = std::chrono::duration_cast<std::chrono::seconds>(lowres_clock::now() - start_time).count();
auto t = std::chrono::duration_cast<std::chrono::duration<float>>(lowres_clock::now() - start_time).count();
logger.info("{} with {} for keyspace={} succeeded, took {} seconds", description, source, keyspace, t);
});


@@ -7,8 +7,99 @@ fi
# User specific environment and startup programs
. /usr/lib/scylla/scylla_lib.sh
PATH=$PATH:$HOME/.local/bin:$HOME/bin
export PATH
~/.scylla_ami_login
echo
echo ' _____ _ _ _____ ____ '
echo ' / ____| | | | | __ \| _ \ '
echo ' | (___ ___ _ _| | | __ _| | | | |_) |'
echo ' \___ \ / __| | | | | |/ _` | | | | _ < '
echo ' ____) | (__| |_| | | | (_| | |__| | |_) |'
echo ' |_____/ \___|\__, |_|_|\__,_|_____/|____/ '
echo ' __/ | '
echo ' |___/ '
echo ''
echo ''
echo 'Nodetool:'
echo ' nodetool help'
echo 'CQL Shell:'
echo ' cqlsh'
echo 'More documentation available at: '
echo ' http://www.scylladb.com/doc/'
echo 'By default, Scylla sends certain information about this node to a data collection server. For information, see http://www.scylladb.com/privacy/'
echo
if [ `ec2_is_supported_instance_type` -eq 0 ]; then
TYPE=`curl -s http://169.254.169.254/latest/meta-data/instance-type`
tput setaf 1
tput bold
echo " $TYPE is not a supported instance type!"
tput sgr0
echo -n "To continue starting ScyllaDB on this instance, run 'sudo scylla_io_setup' "
echo "then 'systemctl start scylla-server'."
echo "For a list of optimized instance types and more EC2 instructions see http://www.scylladb.com/doc/getting-started-amazon/"
echo
else
SETUP=`systemctl is-active scylla-ami-setup`
if [ "$SETUP" == "activating" ]; then
tput setaf 4
tput bold
echo " Constructing RAID volume..."
tput sgr0
echo
echo "Please wait for setup. To see status, run "
echo " 'systemctl status scylla-ami-setup'"
echo
echo "After setup finishes, the scylla-server service will launch."
echo "To see status of scylla-server, run "
echo " 'systemctl status scylla-server'"
echo
elif [ "$SETUP" == "failed" ]; then
tput setaf 1
tput bold
echo " AMI initial configuration failed!"
tput sgr0
echo
echo "To see status, run "
echo " 'systemctl status scylla-ami-setup'"
echo
else
SCYLLA=`systemctl is-active scylla-server`
if [ "$SCYLLA" == "activating" ]; then
tput setaf 4
tput bold
echo " ScyllaDB is starting..."
tput sgr0
echo
echo "Please wait for start. To see status, run "
echo " 'systemctl status scylla-server'"
echo
elif [ "$SCYLLA" == "active" ]; then
tput setaf 4
tput bold
echo " ScyllaDB is active."
tput sgr0
echo
echo "$ nodetool status"
echo
nodetool status
else
tput setaf 1
tput bold
echo " ScyllaDB is not started!"
tput sgr0
echo "Please wait for startup. To see status of ScyllaDB, run "
echo " 'systemctl status scylla-server'"
echo
fi
fi
echo -n " "
/usr/lib/scylla/scylla_ec2_check
if [ $? -eq 0 ]; then
echo
fi
fi


@@ -1,118 +0,0 @@
#!/usr/bin/python3
#
# Copyright 2018 ScyllaDB
#
#
# This file is part of Scylla.
#
# Scylla is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Scylla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Scylla. If not, see <http://www.gnu.org/licenses/>.
import os
import sys
import argparse
sys.path.append('/usr/lib/scylla')
from scylla_util import *
MSG_HEADER = '''
_____ _ _ _____ ____
/ ____| | | | | __ \| _ \
| (___ ___ _ _| | | __ _| | | | |_) |
\___ \ / __| | | | | |/ _` | | | | _ <
____) | (__| |_| | | | (_| | |__| | |_) |
|_____/ \___|\__, |_|_|\__,_|_____/|____/
__/ |
|___/
Nodetool:
nodetool help
CQL Shell:
cqlsh
More documentation available at:
http://www.scylladb.com/doc/
By default, Scylla sends certain information about this node to a data collection server. For information, see http://www.scylladb.com/privacy/
'''[1:-1]
MSG_UNSUPPORTED_INSTANCE_TYPE = '''
{red}{type} is not a supported instance type!{nocolor}
To continue starting ScyllaDB on this instance, run 'sudo scylla_io_setup' then 'systemctl start scylla-server'.
For a list of optimized instance types and more EC2 instructions see http://www.scylladb.com/doc/getting-started-amazon/
'''[1:-1]
MSG_SETUP_ACTIVATING = '''
{green}Constructing RAID volume...{nocolor}
Please wait for setup. To see status, run
'systemctl status scylla-ami-setup'
After setup finishes, the scylla-server service will launch.
To see status of scylla-server, run
'systemctl status scylla-server'
'''[1:-1]
MSG_SETUP_FAILED = '''
{red}AMI initial configuration failed!{nocolor}
To see status, run
'systemctl status scylla-ami-setup'
'''[1:-1]
MSG_SCYLLA_ACTIVATING = '''
{green}ScyllaDB is starting...{nocolor}
Please wait for start. To see status, run
'systemctl status scylla-server'
'''[1:-1]
MSG_SCYLLA_FAILED = '''
{red}ScyllaDB is not started!{nocolor}
Please wait for startup. To see status of ScyllaDB, run
'systemctl status scylla-server'
'''[1:-1]
MSG_SCYLLA_ACTIVE = '''
{green}ScyllaDB is active.{nocolor}
$ nodetool status
'''[1:-1]
if __name__ == '__main__':
colorprint(MSG_HEADER)
aws = aws_instance()
if not aws.is_supported_instance_class():
colorprint(MSG_UNSUPPORTED_INSTANCE_TYPE, type=aws.instance_class())
else:
setup = systemd_unit('scylla-ami-setup.service')
res = setup.is_active()
if res == 'activating':
colorprint(MSG_SETUP_ACTIVATING)
elif res == 'failed':
colorprint(MSG_SETUP_FAILED)
else:
server = systemd_unit('scylla-server.service')
res = server.is_active()
if res == 'activating':
colorprint(MSG_SCYLLA_ACTIVATING)
elif res == 'failed':
colorprint(MSG_SCYLLA_FAILED)
else:
colorprint(MSG_SCYLLA_ACTIVE)
run('nodetool status', exception=False)
print(' ', end='')
res = run('/usr/lib/scylla/scylla_ec2_check --nic eth0', exception=False)
if res == 0:
print('')


@@ -67,8 +67,6 @@
{
"type": "shell",
"inline": [
"sudo yum install -y epel-release",
"sudo yum install -y python36",
"sudo /home/{{user `ssh_username`}}/scylla-ami/scylla_install_ami {{ user `install_args` }}"
]
}


@@ -1,8 +1,6 @@
#!/usr/bin/python3
#!/bin/sh
#
# Copyright 2018 ScyllaDB
#
# Copyright 2016 ScyllaDB
#
# This file is part of Scylla.
#
@@ -19,46 +17,42 @@
# You should have received a copy of the GNU General Public License
# along with Scylla. If not, see <http://www.gnu.org/licenses/>.
import os
import sys
import tempfile
import tarfile
from scylla_util import *
if [ "`id -u`" -ne 0 ]; then
echo "Requires root permission."
exit 1
fi
VERSION='0.14.0'
INSTALL_DIR='/usr/lib/scylla/Prometheus/node_exporter'
if [ -f /usr/bin/node_exporter ] || [ -f /usr/bin/prometheus-node_exporter ]; then
echo "node_exporter already installed"
exit 1
fi
if __name__ == '__main__':
if os.getuid() > 0:
print('Requires root permission.')
sys.exit(1)
. /usr/lib/scylla/scylla_lib.sh
if os.path.exists('/usr/bin/node_exporter') or os.path.exists('/usr/bin/prometheus-node_exporter'):
print('node_exporter already installed')
sys.exit(1)
if is_gentoo_variant; then
emerge -uq app-metrics/node_exporter
if is_systemd; then
echo "app-metrics/node_exporter does not install systemd service files, please file a bug if you need them."
else
rc-update add node_exporter default
rc-service node_exporter start
fi
else
version=0.14.0
dir=/usr/lib/scylla/Prometheus/node_exporter
mkdir -p $dir
cd $dir
curl -L https://github.com/prometheus/node_exporter/releases/download/v$version/node_exporter-$version.linux-amd64.tar.gz -o $dir/node_exporter-$version.linux-amd64.tar.gz
tar -xvzf $dir/node_exporter-$version.linux-amd64.tar.gz
rm $dir/node_exporter-$version.linux-amd64.tar.gz
ln -s $dir/node_exporter-$version.linux-amd64/node_exporter /usr/bin
. /etc/os-release
if is_gentoo_variant():
run('emerge -uq app-metrics/node_exporter')
if is_systemd():
print('app-metrics/node_exporter does not install systemd service files, please file a bug if you need them.')
sys.exit(1)
else:
run('rc-update add node_exporter default')
run('rc-service node_exporter start')
else:
data = curl('https://github.com/prometheus/node_exporter/releases/download/v{version}/node_exporter-{version}.linux-amd64.tar.gz'.format(version=VERSION), byte=True)
with open('/var/tmp/node_exporter-{version}.linux-amd64.tar.gz'.format(version=VERSION), 'wb') as f:
f.write(data)
with tarfile.open('/var/tmp/node_exporter-{version}.linux-amd64.tar.gz'.format(version=VERSION)) as tf:
tf.extractall(INSTALL_DIR)
os.remove('/var/tmp/node_exporter-{version}.linux-amd64.tar.gz'.format(version=VERSION))
os.symlink('{install_dir}/node_exporter-{version}.linux-amd64/node_exporter'.format(install_dir=INSTALL_DIR, version=VERSION), '/usr/bin/node_exporter')
if is_systemd():
node_exporter = systemd_unit('node-exporter.service')
node_exporter.enable()
node_exporter.start()
else:
conf = '''
if is_systemd; then
systemctl enable node-exporter
systemctl start node-exporter
else
cat <<EOT >> /etc/init/node_exporter.conf
# Run node_exporter
start on startup
@@ -66,9 +60,9 @@ start on startup
script
/usr/bin/node_exporter
end script
'''[1:-1]
with open('/etc/init/node_exporter.conf', 'w') as f:
f.write(conf)
run('service node_exporter start')
EOT
service node_exporter start
fi
fi
print('node_exporter successfully installed')
printf "node_exporter successfully installed\n"

View File

@@ -24,38 +24,46 @@ import sys
import argparse
from scylla_util import *
def get_en_interface_type():
type, subtype = curl('http://169.254.169.254/latest/meta-data/instance-type').split('.')
if type in ['c3', 'c4', 'd2', 'i2', 'r3']:
return 'ixgbevf'
if type in ['c5', 'c5d', 'f1', 'g3', 'h1', 'i3', 'm5', 'm5d', 'p2', 'p3', 'r4', 'x1']:
return 'ena'
if type == 'm4':
if subtype == '16xlarge':
return 'ena'
else:
return 'ixgbevf'
def is_vpc_enabled():
with open('/sys/class/net/eth0/address') as f:
mac = f.read().strip()
mac_stat = curl('http://169.254.169.254/latest/meta-data/network/interfaces/macs/{}/'.format(mac))
return True if re.search(r'^vpc-id$', mac_stat, flags=re.MULTILINE) else False
if __name__ == '__main__':
if not is_ec2():
sys.exit(0)
parser = argparse.ArgumentParser(description='Verify EC2 configuration is optimized.')
parser.add_argument('--nic', default='eth0',
help='specify NIC')
args = parser.parse_args()
if not is_valid_nic(args.nic):
print('NIC {} doesn\'t exist.'.format(args.nic))
sys.exit(1)
aws = aws_instance()
instance_class = aws.instance_class()
en = aws.get_en_interface_type()
match = re.search(r'^driver: (\S+)$', out('ethtool -i {}'.format(args.nic)), flags=re.MULTILINE)
type = curl('http://169.254.169.254/latest/meta-data/instance-type')
en = get_en_interface_type()
match = re.search(r'^driver: (\S+)$', out('ethtool -i eth0'), flags=re.MULTILINE)
driver = match.group(1)
if not en:
colorprint('{red}{instance_class} doesn\'t support enhanced networking!{nocolor}', instance_class=instance_class)
print('{bold_red}{type} doesn\'t support enhanced networking!{no_color}'.format(bold_red=concolor.BOLD_RED, type=type, no_color=concolor.NO_COLOR))
print('''To enable enhanced networking, please use the instance type which supports it.
More documentation available at:
http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/enhanced-networking.html#enabling_enhanced_networking''')
sys.exit(1)
elif not aws.is_vpc_enabled(args.nic):
colorprint('{red}VPC is not enabled!{nocolor}')
elif not is_vpc_enabled():
print('{bold_red}VPC is not enabled!{no_color}'.format(bold_red=concolor.BOLD_RED, no_color=concolor.NO_COLOR))
print('To enable enhanced networking, please enable VPC.')
sys.exit(1)
elif driver != en:
colorprint('{red}Enhanced networking is disabled!{nocolor}')
print('{bold_red}Enhanced networking is disabled!{no_color}'.format(bold_red=concolor.BOLD_RED, no_color=concolor.NO_COLOR))
print('''More documentation available at:
http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/enhanced-networking.html''')
sys.exit(1)
colorprint('{green}This EC2 instance is optimized for Scylla.{nocolor}')
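The instance-class → driver branches above can be condensed into a small lookup table. A sketch (the helper name and table names are illustrative, not part of the script; the class lists are taken verbatim from the code above):

```python
# Hypothetical condensation of get_en_interface_type() into data.
IXGBEVF_CLASSES = {'c3', 'c4', 'd2', 'i2', 'r3'}
ENA_CLASSES = {'c5', 'c5d', 'f1', 'g3', 'h1', 'i3', 'm5', 'm5d',
               'p2', 'p3', 'r4', 'x1'}

def en_interface_type(instance_type):
    cls, _, size = instance_type.partition('.')
    if cls in IXGBEVF_CLASSES:
        return 'ixgbevf'
    if cls in ENA_CLASSES:
        return 'ena'
    if cls == 'm4':
        # Only the largest m4 size has ENA support.
        return 'ena' if size == '16xlarge' else 'ixgbevf'
    return None  # enhanced networking unsupported
```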

dist/common/scripts/scylla_lib.sh vendored Normal file

@@ -0,0 +1,122 @@
#
# Copyright (C) 2016 ScyllaDB
is_debian_variant() {
[ -f /etc/debian_version ]
}
is_redhat_variant() {
[ -f /etc/redhat-release ]
}
is_gentoo_variant() {
[ -f /etc/gentoo-release ]
}
is_systemd() {
grep -q '^systemd$' /proc/1/comm
}
is_ec2() {
[ -f /sys/hypervisor/uuid ] && [ "$(head -c 3 /sys/hypervisor/uuid)" = "ec2" ]
}
is_selinux_enabled() {
STATUS=`getenforce`
if [ "$STATUS" = "Disabled" ]; then
return 0
else
return 1
fi
}
ec2_is_supported_instance_type() {
TYPE=`curl -s http://169.254.169.254/latest/meta-data/instance-type|cut -d . -f 1`
case $TYPE in
"i2"|"i3") echo 1;;
*) echo 0;;
esac
}
verify_args() {
if [ -z "$2" ] || [[ "$2" =~ ^--+ ]]; then
echo "Requires a parameter for $1."
print_usage
exit 1
fi
}
#
# get_mode_cpu_set <mode name, e.g. 'mq', 'sq', 'sq_split'>
#
get_mode_cpu_set() {
local mode=$1
local mode_cpu_mask
mode_cpu_mask=`/usr/lib/scylla/perftune.py --tune net --nic "$nic" --mode "$mode" --get-cpu-mask 2>&-`
# If the given mode is not supported - return invalid CPU set
if [[ "$?" -ne "0" ]]; then
echo "-1"
else
echo "$mode_cpu_mask" | /usr/lib/scylla/hex2list.py
fi
}
#
# check_cpuset_conf <NIC name>
#
get_tune_mode() {
local nic=$1
# if cpuset.conf doesn't exist use the default mode
[[ ! -e '/etc/scylla.d/cpuset.conf' ]] && return
local cur_cpuset=`cat /etc/scylla.d/cpuset.conf | cut -d "\"" -f2- | cut -d" " -f2`
local mq_cpuset=`get_mode_cpu_set 'mq'`
local sq_cpuset=`get_mode_cpu_set 'sq'`
local sq_split_cpuset=`get_mode_cpu_set 'sq_split'`
local tune_mode=""
case "$cur_cpuset" in
"$mq_cpuset")
tune_mode="--mode mq"
;;
"$sq_cpuset")
tune_mode="--mode sq"
;;
"$sq_split_cpuset")
tune_mode="--mode sq_split"
;;
esac
# if cpuset is something different from what we expect - use the default mode
echo "$tune_mode"
}
#
# create_perftune_conf [<NIC name>]
#
create_perftune_conf() {
local nic=$1
[[ -z "$nic" ]] && nic='eth0'
# if exists - do nothing
[[ -e '/etc/scylla.d/perftune.yaml' ]] && return
local mode=`get_tune_mode "$nic"`
/usr/lib/scylla/perftune.py --tune net --nic "$nic" $mode --dump-options-file > /etc/scylla.d/perftune.yaml
}
. /etc/os-release
if is_debian_variant || is_gentoo_variant; then
SYSCONFIG=/etc/default
else
SYSCONFIG=/etc/sysconfig
fi
. $SYSCONFIG/scylla-server
for i in /etc/scylla.d/*.conf; do
if [ "$i" = "/etc/scylla.d/*.conf" ]; then
break
fi
. "$i"
done


@@ -49,8 +49,7 @@ if __name__ == '__main__':
if is_systemd():
ntp = systemd_unit('ntp.service')
ntp.stop()
# ignore error, ntpd may be able to adjust clock later
run('ntpdate ntp.ubuntu.com', exception=False)
run('ntpdate ntp.ubuntu.com')
ntp.start()
else:
run('service ntp stop')
@@ -71,8 +70,7 @@ if __name__ == '__main__':
sntpd.start()
else:
run('rc-service ntpd stop', exception=False)
# ignore error, ntpd may be able to adjust clock later
run('ntpdate {}'.format(server), exception=False)
run('ntpdate {}'.format(server))
run('rc-update add ntpd default')
run('rc-service ntpd start')
@@ -89,7 +87,6 @@ if __name__ == '__main__':
server = match.group(1)
ntpd = systemd_unit('ntpd.service')
ntpd.stop()
# ignore error, ntpd may be able to adjust clock later
run('ntpdate {}'.format(server), exception=False)
run('ntpdate {}'.format(server))
ntpd.enable()
ntpd.start()


@@ -1,71 +1,33 @@
#!/usr/bin/python3
#
# Copyright 2018 ScyllaDB
#
#!/bin/bash -e
#
# This file is part of Scylla.
#
# Scylla is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Scylla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Scylla. If not, see <http://www.gnu.org/licenses/>.
. /usr/lib/scylla/scylla_lib.sh
import os
import sys
import glob
from scylla_util import *
if [ "$AMI" = "yes" ] && [ -f /etc/scylla/ami_disabled ]; then
rm /etc/scylla/ami_disabled
exit 1
fi
if __name__ == '__main__':
if os.getuid() > 0:
print('Requires root permission.')
sys.exit(1)
if is_redhat_variant():
cfg = sysconfig_parser('/etc/sysconfig/scylla-server')
else:
cfg = sysconfig_parser('/etc/default/scylla-server')
ami = cfg.get('AMI')
mode = cfg.get('NETWORK_MODE')
if [ "$NETWORK_MODE" = "virtio" ]; then
ip tuntap del mode tap dev $TAP
ip tuntap add mode tap dev $TAP user $USER one_queue vnet_hdr
ip link set dev $TAP up
ip link set dev $TAP master $BRIDGE
chown $USER.$GROUP /dev/vhost-net
elif [ "$NETWORK_MODE" = "dpdk" ]; then
modprobe uio
modprobe uio_pci_generic
/usr/lib/scylla/dpdk-devbind.py --force --bind=uio_pci_generic $ETHPCIID
for n in /sys/devices/system/node/node?; do
echo $NR_HUGEPAGES > $n/hugepages/hugepages-2048kB/nr_hugepages
done
if [ "$ID" = "ubuntu" ]; then
hugeadm --create-mounts
fi
else # NETWORK_MODE = posix
if [ "$SET_NIC" = "yes" ]; then
create_perftune_conf "$IFNAME"
/usr/lib/scylla/posix_net_conf.sh $IFNAME --options-file /etc/scylla.d/perftune.yaml
fi
fi
if ami == 'yes' and os.path.exists('/etc/scylla/ami_disabled'):
os.remove('/etc/scylla/ami_disabled')
sys.exit(1)
if mode == 'virtio':
tap = cfg.get('TAP')
user = cfg.get('USER')
group = cfg.get('GROUP')
bridge = cfg.get('BRIDGE')
run('ip tuntap del mode tap dev {TAP}'.format(TAP=tap))
run('ip tuntap add mode tap dev {TAP} user {USER} one_queue vnet_hdr'.format(TAP=tap, USER=user))
run('ip link set dev {TAP} up'.format(TAP=tap))
run('ip link set dev {TAP} master {BRIDGE}'.format(TAP=tap, BRIDGE=bridge))
run('chown {USER}.{GROUP} /dev/vhost-net'.format(USER=user, GROUP=group))
elif mode == 'dpdk':
ethpciid = cfg.get('ETHPCIID')
nr_hugepages = cfg.get('NR_HUGEPAGES')
run('modprobe uio')
run('modprobe uio_pci_generic')
run('/usr/lib/scylla/dpdk-devbind.py --force --bind=uio_pci_generic {ETHPCIID}'.format(ETHPCIID=ethpciid))
for n in glob.glob('/sys/devices/system/node/node?'):
with open('{n}/hugepages/hugepages-2048kB/nr_hugepages'.format(n=n), 'w') as f:
f.write(nr_hugepages)
if dist_name() == 'Ubuntu':
run('hugeadm --create-mounts')
fi
else:
set_nic = cfg.get('SET_NIC')
ifname = cfg.get('IFNAME')
if set_nic == 'yes':
create_perftune_conf(ifname)
run('/usr/lib/scylla/posix_net_conf.sh {IFNAME} --options-file /etc/scylla.d/perftune.yaml'.format(IFNAME=ifname))
run('/usr/lib/scylla/scylla-blocktune')
/usr/lib/scylla/scylla-blocktune


@@ -146,15 +146,7 @@ if __name__ == '__main__':
match = re.search(r'^/dev/\S+: (UUID="\S+")', res.strip())
uuid = match.group(1)
with open('/etc/fstab', 'a') as f:
f.write('{uuid} {mount_at} xfs noatime,nofail 0 0\n'.format(uuid=uuid, mount_at=mount_at))
mounts_conf = '/etc/systemd/system/scylla-server.service.d/mounts.conf'
if not os.path.exists(mounts_conf):
makedirs('/etc/systemd/system/scylla-server.service.d/')
with open(mounts_conf, 'w') as f:
f.write('[Unit]\nRequiresMountsFor={mount_at}\n'.format(mount_at=mount_at))
else:
with open(mounts_conf, 'a') as f:
f.write('RequiresMountsFor={mount_at}\n'.format(mount_at=mount_at))
f.write('{uuid} {mount_at} xfs noatime 0 0\n'.format(uuid=uuid, mount_at=mount_at))
if is_debian_variant():
run('update-initramfs -u')


@@ -48,21 +48,6 @@ def interactive_ask_service(msg1, msg2, default = None):
elif ans == 'no' or ans == 'n':
return False
def interactive_choose_nic():
nics = [os.path.basename(n) for n in glob.glob('/sys/class/net/*') if n != '/sys/class/net/lo']
if len(nics) == 0:
print('A NIC was not found.')
sys.exit(1)
elif len(nics) == 1:
return nics[0]
else:
print('Please select a NIC from the following list:')
while True:
print(nics)
n = input('> ')
if is_valid_nic(n):
return n
def do_verify_package(pkg):
if is_debian_variant():
res = run('dpkg -s {}'.format(pkg), silent=True, exception=False)
@@ -102,7 +87,7 @@ def run_setup_script(name, script):
res = run(script, exception=False)
if res != 0:
if interactive:
colorprint('{red}{name} setup failed. Press any key to continue...{nocolor}', name=name)
print('{red}{name} setup failed. Press any key to continue...{no_color}'.format(red=concolor.BOLD_RED, name=name, no_color=concolor.NO_COLOR))
input()
else:
print('{} setup failed.'.format(name))
@@ -116,7 +101,7 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Configure environment for Scylla.')
parser.add_argument('--disks',
help='specify disks for RAID')
parser.add_argument('--nic', default='eth0',
parser.add_argument('--nic',
help='specify NIC')
parser.add_argument('--ntp-domain',
help='specify NTP domain')
@@ -127,7 +112,7 @@ if __name__ == '__main__':
parser.add_argument('--developer-mode', action='store_true', default=False,
help='enable developer mode')
parser.add_argument('--no-ec2-check', action='store_true', default=False,
help='skip EC2 configuration check')
help='skip EC2 configuration check (only on EC2)')
parser.add_argument('--no-kernel-check', action='store_true', default=False,
help='skip kernel version check')
parser.add_argument('--no-verify-package', action='store_true', default=False,
@@ -162,14 +147,12 @@ if __name__ == '__main__':
if len(sys.argv) == 1:
interactive = True
if not interactive:
if not args.no_raid_setup and not args.disks:
parser.print_help()
sys.exit(1)
if not args.no_sysconfig_setup or (is_ec2() and not args.no_ec2_check):
if not is_valid_nic(args.nic):
print('NIC {} doesn\'t exist.'.format(args.nic))
sys.exit(1)
if not interactive and not args.no_raid_setup and not args.disks:
parser.print_help()
sys.exit(1)
if not interactive and not args.no_sysconfig_setup and not args.nic:
parser.print_help()
sys.exit(1)
disks = args.disks
nic = args.nic
@@ -192,16 +175,13 @@ if __name__ == '__main__':
fstrim_setup = not args.no_fstrim_setup
selinux_reboot_required = False
if interactive:
colorprint('{green}Skip any of the following steps by answering \'no\'{nocolor}')
print('{green}Skip any of the following steps by answering \'no\'{no_color}'.format(green=concolor.GREEN, no_color=concolor.NO_COLOR))
if is_ec2():
if interactive:
ec2_check = interactive_ask_service('Do you want to run Amazon EC2 configuration check?', 'Yes - runs a script to verify that this instance is optimized for running Scylla. No - skips the configuration check.', 'yes')
if ec2_check:
nic = interactive_choose_nic()
ec2_check = interactive_ask_service('Do you want to run Amazon EC2 configuration check?', 'Yes - runs a script to verify that this instance is optimized for running Scylla. No - skips the configuration check.', 'yes')
if ec2_check:
run('/usr/lib/scylla/scylla_ec2_check --nic {}'.format(nic))
run('/usr/lib/scylla/scylla_ec2_check')
if interactive:
kernel_check = interactive_ask_service('Do you want to check your kernel version?', 'Yes - runs a script to verify that the kernel for this instance qualifies to run Scylla. No - skips the kernel check.', 'yes')
@@ -222,7 +202,7 @@ if __name__ == '__main__':
elif is_gentoo_variant():
run('rc-update add scylla-server default')
if interactive and not os.path.exists('/etc/scylla.d/housekeeping.cfg'):
if interactive:
version_check = interactive_ask_service('Do you want to enable Scylla to check if there is a newer version of Scylla available?', 'Yes - start the Scylla-housekeeping service to check for a newer version. This check runs periodically. No - skips this step.', 'yes')
if version_check:
with open('/etc/scylla.d/housekeeping.cfg', 'w') as f:
@@ -294,7 +274,7 @@ if __name__ == '__main__':
print('Please select unmounted disks from the following list: {}'.format(devices))
selected = []
dsklist = []
while True:
while len(devices):
print('type \'cancel\' to cancel RAID/XFS setup.')
print('type \'done\' to finish selection. Selected: {}'.format(selected))
if len(dsklist) > 0:
@@ -336,9 +316,21 @@ if __name__ == '__main__':
if interactive:
sysconfig_setup = interactive_ask_service('Do you want to setup a system-wide customized configuration for Scylla?', 'Yes - setup the sysconfig file. No - skips this step.', 'yes')
if sysconfig_setup:
nic = interactive_choose_nic()
if interactive:
set_nic = interactive_ask_service('Do you want to enable Network Interface Card (NIC) optimization?', 'Yes - optimize the NIC queue settings. Selecting Yes greatly improves performance. No - skip this step.', 'yes')
nics = [os.path.basename(n) for n in glob.glob('/sys/class/net/*') if n != '/sys/class/net/lo']
if len(nics) == 0:
print('A NIC was not found.')
sys.exit(1)
elif len(nics) == 1:
nic=nics[0]
else:
print('Please select a NIC from the following list:')
while True:
print(nics)
n = input('> ')
if os.path.exists('/sys/class/net/{}'.format(n)):
nic = n
break
set_nic = interactive_ask_service('Do you want to enable Network Interface Card (NIC) optimization?', 'Yes - optimize the NIC queue settings. Selecting Yes greatly improves performance. No - skip this step.', 'yes')
if sysconfig_setup:
setup_args = '--setup-nic' if set_nic else ''
run_setup_script('NIC queue', '/usr/lib/scylla/scylla_sysconfig_setup --nic {nic} {setup_args}'.format(nic=nic, setup_args=setup_args))


@@ -1,40 +1,10 @@
#!/usr/bin/python3
#
# Copyright 2018 ScyllaDB
#
#!/bin/bash -e
#
# This file is part of Scylla.
#
# Scylla is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Scylla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Scylla. If not, see <http://www.gnu.org/licenses/>.
. /usr/lib/scylla/scylla_lib.sh
import os
import sys
from scylla_util import *
if __name__ == '__main__':
if os.getuid() > 0:
print('Requires root permission.')
sys.exit(1)
if is_redhat_variant():
cfg = sysconfig_parser('/etc/sysconfig/scylla-server')
else:
cfg = sysconfig_parser('/etc/default/scylla-server')
if cfg.get('NETWORK_MODE') == 'virtio':
run('ip tuntap del mode tap dev {TAP}'.format(TAP=cfg.get('TAP')))
elif cfg.get('NETWORK_MODE') == 'dpdk':
run('/usr/lib/scylla/dpdk-devbind.py -u {ETHPCIID}'.format(ETHPCIID=cfg.get('ETHPCIID')))
run('/usr/lib/scylla/dpdk-devbind.py -b {ETHDRV} {ETHPCIID}'.format(ETHDRV=cfg.get('ETHDRV'), ETHPCIID=cfg.get('ETHPCIID')))
if [ "$NETWORK_MODE" = "virtio" ]; then
ip tuntap del mode tap dev $TAP
elif [ "$NETWORK_MODE" = "dpdk" ]; then
/usr/lib/scylla/dpdk-devbind.py -u $ETHPCIID
/usr/lib/scylla/dpdk-devbind.py -b $ETHDRV $ETHPCIID
fi


@@ -64,7 +64,7 @@ if __name__ == '__main__':
help='AMI instance mode')
args = parser.parse_args()
if args.nic and not is_valid_nic(args.nic):
if args.nic and not os.path.exists('/sys/class/net/{}'.format(args.nic)):
print('NIC {} not found.'.format(args.nic))
sys.exit(1)


@@ -29,17 +29,13 @@ import io
import shlex
import shutil
def curl(url, byte=False):
def curl(url):
max_retries = 5
retries = 0
while True:
try:
req = urllib.request.Request(url)
with urllib.request.urlopen(req) as res:
if byte:
return res.read()
else:
return res.read().decode('utf-8')
return urllib.request.urlopen(req).read().decode('utf-8')
except urllib.error.HTTPError:
logging.warning("Failed to grab %s..." % url)
time.sleep(5)
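Note that `max_retries` and `retries` in the snippet above are defined but never consulted, so the loop retries indefinitely on `HTTPError`. A bounded sketch of the same idea, assuming the same urllib-based approach (the `attempt` loop is an illustrative fix, not the script's actual behavior):

```python
import time
import urllib.error
import urllib.request

def curl(url, byte=False, max_retries=5):
    # Bounded retry: give up and re-raise after max_retries failures
    # instead of looping forever.
    for attempt in range(max_retries):
        try:
            req = urllib.request.Request(url)
            with urllib.request.urlopen(req) as res:
                data = res.read()
                return data if byte else data.decode('utf-8')
        except urllib.error.HTTPError:
            if attempt == max_retries - 1:
                raise
            time.sleep(5)
```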
@@ -84,10 +80,6 @@ class aws_instance:
continue
self._disks[t] += [ self.__xenify(dev) ]
def __mac_address(self, nic='eth0'):
with open('/sys/class/net/{}/address'.format(nic)) as f:
return f.read().strip()
def __init__(self):
self._type = self.__instance_metadata("instance-type")
self.__populate_disks()
@@ -104,25 +96,6 @@ class aws_instance:
"""Returns the class of the instance we are running in. i.e.: i3"""
return self._type.split(".")[0]
def is_supported_instance_class(self):
if self.instance_class() in ['i2', 'i3']:
return True
return False
def get_en_interface_type(self):
instance_class = self.instance_class()
instance_size = self.instance_size()
if instance_class in ['c3', 'c4', 'd2', 'i2', 'r3']:
return 'ixgbevf'
if instance_class in ['c5', 'c5d', 'f1', 'g3', 'h1', 'i3', 'm5', 'm5d', 'p2', 'p3', 'r4', 'x1']:
return 'ena'
if instance_class == 'm4':
if instance_size == '16xlarge':
return 'ena'
else:
return 'ixgbevf'
return None
def disks(self):
"""Returns all disks in the system, as visible from the AWS registry"""
disks = set()
@@ -161,11 +134,6 @@ class aws_instance:
"""Returns the private IPv4 address of this instance"""
return self.__instance_metadata("local-ipv4")
def is_vpc_enabled(self, nic='eth0'):
mac = self.__mac_address(nic)
mac_stat = self.__instance_metadata('network/interfaces/macs/{}'.format(mac))
return True if re.search(r'^vpc-id$', mac_stat, flags=re.MULTILINE) else False
## Regular expression helpers
# non-advancing comment matcher
@@ -255,24 +223,37 @@ class scylla_cpuinfo:
return len(self._cpu_data["system"])
def run(cmd, shell=False, silent=False, exception=True):
stdout=subprocess.DEVNULL if silent else None
stderr=subprocess.DEVNULL if silent else None
if not shell:
cmd = shlex.split(cmd)
if exception:
return subprocess.check_call(cmd, shell=shell, stdout=stdout, stderr=stderr)
stdout=None
stderr=None
if silent:
stdout=subprocess.DEVNULL
stderr=subprocess.DEVNULL
if shell:
if exception:
return subprocess.check_call(cmd, shell=True, stdout=stdout, stderr=stderr)
else:
p = subprocess.Popen(cmd, shell=True, stdout=stdout, stderr=stderr)
return p.wait()
else:
p = subprocess.Popen(cmd, shell=shell, stdout=stdout, stderr=stderr)
return p.wait()
if exception:
return subprocess.check_call(shlex.split(cmd), stdout=stdout, stderr=stderr)
else:
p = subprocess.Popen(shlex.split(cmd), stdout=stdout, stderr=stderr)
return p.wait()
def out(cmd, shell=False, exception=True):
if not shell:
cmd = shlex.split(cmd)
if exception:
return subprocess.check_output(cmd, shell=shell).strip().decode('utf-8')
if shell:
if exception:
return subprocess.check_output(cmd, shell=True).strip().decode('utf-8')
else:
p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
return p.communicate()[0].strip().decode('utf-8')
else:
p = subprocess.Popen(cmd, shell=shell, stdout=subprocess.PIPE)
return p.communicate()[0].strip().decode('utf-8')
if exception:
return subprocess.check_output(shlex.split(cmd)).strip().decode('utf-8')
else:
p = subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE)
return p.communicate()[0].strip().decode('utf-8')
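The refactor above collapses the duplicated shell/non-shell branches by normalizing `cmd` once with `shlex.split`. Reproduced as a self-contained sketch, with a short usage note:

```python
import shlex
import subprocess

def run(cmd, shell=False, silent=False, exception=True):
    # Normalize once: a string command becomes an argv list unless the
    # caller explicitly asks for shell interpretation.
    stdout = subprocess.DEVNULL if silent else None
    stderr = subprocess.DEVNULL if silent else None
    if not shell:
        cmd = shlex.split(cmd)
    if exception:
        return subprocess.check_call(cmd, shell=shell, stdout=stdout, stderr=stderr)
    return subprocess.Popen(cmd, shell=shell, stdout=stdout, stderr=stderr).wait()

def out(cmd, shell=False, exception=True):
    # Same normalization, but capture and decode stdout.
    if not shell:
        cmd = shlex.split(cmd)
    if exception:
        return subprocess.check_output(cmd, shell=shell).strip().decode('utf-8')
    p = subprocess.Popen(cmd, shell=shell, stdout=subprocess.PIPE)
    return p.communicate()[0].strip().decode('utf-8')
```

With `exception=False`, `run()` returns the child's exit status instead of raising on failure, which is how the setup scripts probe optional steps (e.g. `run('nodetool status', exception=False)`).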
def is_debian_variant():
return os.path.exists('/etc/debian_version')
@@ -339,59 +320,30 @@ def dist_ver():
return platform.dist()[1]
def is_unused_disk(dev):
# dev is not in /sys/class/block/, like /dev/nvme[0-9]+
# dev is not in /sys/class/block/
if not os.path.isdir('/sys/class/block/{dev}'.format(dev=dev.replace('/dev/',''))):
return False
try:
fd = os.open(dev, os.O_EXCL)
os.close(fd)
return True
except OSError:
# dev is mounted
with open('/proc/mounts') as f:
s = f.read().strip()
if len(re.findall('^{} '.format(dev), s, flags=re.MULTILINE)) > 0:
return False
CONCOLORS = {'green':'\033[1;32m', 'red':'\033[1;31m', 'nocolor':'\033[0m'}
def colorprint(msg, **kwargs):
fmt = dict(CONCOLORS)
fmt.update(kwargs)
print(msg.format(**fmt))
def get_mode_cpuset(nic, mode):
try:
mode_cpu_mask=out('/usr/lib/scylla/perftune.py --tune net --nic "{nic}" --mode "{mode}" --get-cpu-mask'.format(nic=nic, mode=mode))
return hex2list(mode_cpu_mask)
except subprocess.CalledProcessError:
return '-1'
def get_cur_cpuset():
cfg = sysconfig_parser('/etc/scylla.d/cpuset.conf')
cpuset=cfg.get('CPUSET')
return re.sub(r'^--cpuset (.+)$', r'\1', cpuset).strip()
def get_tune_mode(nic):
if not os.path.exists('/etc/scylla.d/cpuset.conf'):
return
cur_cpuset=get_cur_cpuset()
mq_cpuset=get_mode_cpuset(nic, 'mq')
sq_cpuset=get_mode_cpuset(nic, 'sq')
sq_split_cpuset=get_mode_cpuset(nic, 'sq_split')
if cur_cpuset == mq_cpuset:
return 'mq'
elif cur_cpuset == sq_cpuset:
return 'sq'
elif cur_cpuset == sq_split_cpuset:
return 'sq_split'
def create_perftune_conf(nic='eth0'):
if os.path.exists('/etc/scylla.d/perftune.yaml'):
return
mode=get_tune_mode(nic)
# get_tune_mode() may return None; don't pass "--mode None" to perftune.py
mode_opt='--mode {}'.format(mode) if mode else ''
yaml=out('/usr/lib/scylla/perftune.py --tune net --nic "{nic}" {mode_opt} --dump-options-file'.format(nic=nic, mode_opt=mode_opt))
with open('/etc/scylla.d/perftune.yaml', 'w') as f:
f.write(yaml)
def is_valid_nic(nic):
return os.path.exists('/sys/class/net/{}'.format(nic))
# dev is used in LVM
if shutil.which('pvs'):
s = out('pvs -o pv_name --nohead')
if len(re.findall(dev, s, flags=re.MULTILINE)) > 0:
return False
# dev is used for swap
s = out('swapon --show=NAME --noheadings')
if len(re.findall(dev, s, flags=re.MULTILINE)) > 0:
return False
# dev is used in MDRAID
if os.path.exists('/proc/mdstat'):
with open('/proc/mdstat') as f:
s = f.read().strip()
if len(re.findall(dev, s, flags=re.MULTILINE)) > 0:
return False
return True
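The `O_EXCL` open above is the core probe of `is_unused_disk()`: on Linux, opening a block device with `O_EXCL` (without `O_CREAT`) fails with `EBUSY` while the kernel holds the device (mounted, claimed by LVM/MD, and so on). A minimal sketch of just that probe (the helper name is illustrative):

```python
import os

def device_busy(dev):
    # O_EXCL on a block device (no O_CREAT) is refused with EBUSY while
    # the kernel has the device claimed; on an idle device it succeeds.
    # Any other open failure (e.g. missing path) also reports "busy" here,
    # which is why the real function first checks /sys/class/block/.
    try:
        os.close(os.open(dev, os.O_EXCL))
        return False
    except OSError:
        return True
```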
class SystemdException(Exception):
pass
@@ -421,7 +373,8 @@ class systemd_unit:
return run('systemctl disable {}'.format(self._unit))
def is_active(self):
return out('systemctl is-active {}'.format(self._unit), exception=False)
res = out('systemctl is-active {}'.format(self._unit), exception=False)
return True if re.match(r'^active', res, flags=re.MULTILINE) else False
def mask(self):
return run('systemctl mask {}'.format(self._unit))
@@ -452,7 +405,7 @@ class sysconfig_parser:
self.__load()
def get(self, key):
return self._cfg.get('global', key).strip('"')
return self._cfg.get('global', key)
def set(self, key, val):
if not self._cfg.has_option('global', key):
@@ -463,3 +416,9 @@ class sysconfig_parser:
def commit(self):
with open(self._filename, 'w') as f:
f.write(self._data)
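The `.strip('"')` added to `get()` matters because sysconfig files are shell fragments (`KEY="value"`). A sketch of the parsing idea, assuming the file is fed to `configparser` under a synthetic `[global]` section as the class above suggests (the standalone helper is illustrative):

```python
import configparser

def sysconfig_get(text, key):
    # Prepend a synthetic section header so configparser accepts the
    # shell-style KEY="value" lines, then drop the surrounding quotes.
    cfg = configparser.ConfigParser()
    cfg.optionxform = str  # keep keys case-sensitive; sysconfig keys are upper-case
    cfg.read_string('[global]\n' + text)
    return cfg.get('global', key).strip('"')
```

Without the strip, a value like `NETWORK_MODE="posix"` would come back as `"posix"` (quotes included) and fail string comparisons such as `mode == 'virtio'`.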
class concolor:
GREEN = '\033[0;32m'
RED = '\033[0;31m'
BOLD_RED = '\033[1;31m'
NO_COLOR = '\033[0m'
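The `concolor` constants are spliced into messages via `str.format`, as at the call sites earlier in this diff. A self-contained usage sketch:

```python
class concolor:
    GREEN = '\033[0;32m'
    RED = '\033[0;31m'
    BOLD_RED = '\033[1;31m'
    NO_COLOR = '\033[0m'

# Mirrors the call sites above: named placeholders receive the escape
# sequences, and the message ends with NO_COLOR to reset the terminal.
msg = '{bold_red}VPC is not enabled!{no_color}'.format(
    bold_red=concolor.BOLD_RED, no_color=concolor.NO_COLOR)
print(msg)
```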


@@ -10,7 +10,7 @@ Group=scylla
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files '/etc/apt/sources.list.d/scylla*.list' version --mode d
{{/debian}}
{{#redhat}}
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files '/etc/yum.repos.d/scylla*.repo' version --mode d
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files @@REPOFILES@@ version --mode d
{{/redhat}}
[Install]


@@ -6,7 +6,7 @@ After=network.target
Type=simple
User=scylla
Group=scylla
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files '/etc/yum.repos.d/scylla*.repo' version --mode r
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files @@REPOFILES@@ version --mode r
[Install]
WantedBy=multi-user.target


@@ -116,9 +116,6 @@ if [ ! -f /usr/bin/pystache ]; then
sudo apt-get install -y python-pystache
fi
fi
if is_debian_variant && [ ! -f /usr/share/doc/python-pkg-resources/copyright ]; then
sudo apt-get install -y python-pkg-resources
fi
if [ -z "$TARGET" ]; then
if is_debian_variant; then
@@ -160,8 +157,8 @@ chmod a+rx debian/rules
if [ "$TARGET" != "trusty" ]; then
pystache dist/common/systemd/scylla-server.service.mustache "{ $MUSTACHE_DIST }" > debian/scylla-server.service
pystache dist/common/systemd/scylla-housekeeping-daily.service.mustache "{ $MUSTACHE_DIST }" > debian/scylla-server.scylla-housekeeping-daily.service
pystache dist/common/systemd/scylla-housekeeping-restart.service.mustache "{ $MUSTACHE_DIST }" > debian/scylla-server.scylla-housekeeping-restart.service
pystache dist/common/systemd/scylla-housekeeping-daily.service.mustache "{ $MUSTACHE_DIST }" > debian/scylla-housekeeping-daily.service
pystache dist/common/systemd/scylla-housekeeping-restart.service.mustache "{ $MUSTACHE_DIST }" > debian/scylla-housekeeping-restart.service
cp dist/common/systemd/scylla-fstrim.service debian/scylla-server.scylla-fstrim.service
cp dist/common/systemd/node-exporter.service debian/scylla-server.node-exporter.service
fi


@@ -26,14 +26,14 @@ ADD commandlineparser.py /commandlineparser.py
ADD docker-entrypoint.py /docker-entrypoint.py
# Install Scylla:
RUN curl http://downloads.scylladb.com/rpm/centos/scylla-2.3.repo -o /etc/yum.repos.d/scylla.repo && \
RUN curl http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo -o /etc/yum.repos.d/scylla.repo && \
yum -y install epel-release && \
yum -y clean expire-cache && \
yum -y update && \
yum -y remove boost-thread boost-system && \
yum -y install scylla hostname supervisor && \
yum clean all && \
yum -y install python36 python36-PyYAML && \
yum -y install python34 python34-PyYAML && \
cat /scylla_bashrc >> /etc/bashrc && \
mkdir -p /etc/supervisor.conf.d && \
mkdir -p /var/log/scylla && \


@@ -1,13 +1,7 @@
#!/bin/bash
/usr/lib/scylla/scylla_prepare
. /etc/sysconfig/scylla-server
. /usr/lib/scylla/scylla_prepare
export SCYLLA_HOME SCYLLA_CONF
for f in /etc/scylla.d/*.conf; do
. "$f"
done
exec /usr/bin/scylla $SCYLLA_ARGS $SEASTAR_IO $DEV_MODE $CPUSET $SCYLLA_DOCKER_ARGS


@@ -98,19 +98,12 @@ rm -f version
pystache dist/redhat/scylla.spec.mustache "{ \"version\": \"$SCYLLA_VERSION\", \"release\": \"$SCYLLA_RELEASE\", \"housekeeping\": $DIST }" > build/scylla.spec
# mock generates files owned by root, fix this up
fix_ownership() {
sudo chown "$(id -u):$(id -g)" -R "$@"
}
if [ $JOBS -gt 0 ]; then
RPM_JOBS_OPTS=(--define="_smp_mflags -j$JOBS")
fi
sudo mock --buildsrpm --root=$TARGET --resultdir=`pwd`/build/srpms --spec=build/scylla.spec --sources=build/scylla-$VERSION.tar $SRPM_OPTS "${RPM_JOBS_OPTS[@]}"
fix_ownership build/srpms
if [[ "$TARGET" =~ ^epel-7- ]]; then
TARGET=scylla-$TARGET
RPM_OPTS="$RPM_OPTS --configdir=dist/redhat/mock"
fi
sudo mock --rebuild --root=$TARGET --resultdir=`pwd`/build/rpms $RPM_OPTS "${RPM_JOBS_OPTS[@]}" build/srpms/scylla-$VERSION*.src.rpm
fix_ownership build/rpms


@@ -56,9 +56,9 @@ License: AGPLv3
URL: http://www.scylladb.com/
BuildRequires: libaio-devel libstdc++-devel cryptopp-devel hwloc-devel numactl-devel libpciaccess-devel libxml2-devel zlib-devel thrift-devel yaml-cpp-devel lz4-devel snappy-devel jsoncpp-devel systemd-devel xz-devel pcre-devel elfutils-libelf-devel bzip2-devel keyutils-libs-devel xfsprogs-devel make gnutls-devel systemd-devel lksctp-tools-devel protobuf-devel protobuf-compiler systemtap-sdt-devel ninja-build cmake python ragel grep kernel-headers
%{?fedora:BuildRequires: boost-devel antlr3-tool antlr3-C++-devel python3 gcc-c++ libasan libubsan python3-pyparsing dnf-yum python2-pystache}
%{?rhel:BuildRequires: scylla-libstdc++73-static scylla-boost163-devel scylla-boost163-static scylla-antlr35-tool scylla-antlr35-C++-devel python36 scylla-gcc73-c++, scylla-python36-pyparsing20 yaml-cpp-static pystache python-setuptools}
%{?rhel:BuildRequires: scylla-libstdc++73-static scylla-boost163-devel scylla-boost163-static scylla-antlr35-tool scylla-antlr35-C++-devel python34 scylla-gcc73-c++, scylla-python34-pyparsing20 yaml-cpp-static pystache python-setuptools}
Requires: scylla-conf systemd-libs hwloc PyYAML python-urwid pciutils pyparsing python-requests curl util-linux python-setuptools pciutils python3-pyudev mdadm xfsprogs
%{?rhel:Requires: python36 python36-PyYAML kernel >= 3.10.0-514}
%{?rhel:Requires: python34 python34-PyYAML kernel >= 3.10.0-514}
%{?fedora:Requires: python3 python3-PyYAML}
Conflicts: abrt
%ifarch x86_64
@@ -97,7 +97,7 @@ cflags="--cflags=${defines[*]}"
%endif
%if 0%{?rhel}
. /etc/profile.d/scylla.sh
python3.6 ./configure.py %{?configure_opt} --with=scylla --with=iotune --mode=release "$cflags" --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7.3 --python python3.6 --ldflag=-Wl,-rpath=/opt/scylladb/lib64
python3.4 ./configure.py %{?configure_opt} --with=scylla --with=iotune --mode=release "$cflags" --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7.3 --python python3.4 --ldflag=-Wl,-rpath=/opt/scylladb/lib64
%endif
ninja-build %{?_smp_mflags} build/release/scylla build/release/iotune
@@ -201,6 +201,7 @@ rm -rf $RPM_BUILD_ROOT
%{_prefix}/lib/scylla/api/api-doc/*
%{_prefix}/lib/scylla/scyllatop/*
%{_prefix}/lib/scylla/scylla_config_get.py
%{_prefix}/lib/scylla/scylla_lib.sh
%{_prefix}/lib/scylla/scylla_util.py
%if 0%{?fedora} >= 27
%{_prefix}/lib/scylla/scylla-gdb.py

@@ -449,13 +449,9 @@ GCC6_CONCEPT(requires requires(StopCondition stop, ConsumeMutationFragment consu
{ consume_mf(std::move(mf)) } -> void;
{ consume_eos() } -> future<>;
})
future<> consume_mutation_fragments_until(
flat_mutation_reader& r,
StopCondition&& stop,
ConsumeMutationFragment&& consume_mf,
ConsumeEndOfStream&& consume_eos,
db::timeout_clock::time_point timeout) {
return do_until([stop] { return stop(); }, [&r, stop, consume_mf, consume_eos, timeout] {
future<> consume_mutation_fragments_until(flat_mutation_reader& r, StopCondition&& stop,
ConsumeMutationFragment&& consume_mf, ConsumeEndOfStream&& consume_eos) {
return do_until([stop] { return stop(); }, [&r, stop, consume_mf, consume_eos] {
while (!r.is_buffer_empty()) {
consume_mf(r.pop_mutation_fragment());
if (stop()) {
@@ -465,7 +461,7 @@ future<> consume_mutation_fragments_until(
if (r.is_end_of_stream()) {
return consume_eos();
}
return r.fill_buffer(timeout);
return r.fill_buffer();
});
}

@@ -129,8 +129,26 @@ public:
update_is_normal();
}
void add_application_state(const endpoint_state& es) {
_application_state = es._application_state;
void apply_application_state(application_state key, versioned_value&& value) {
auto&& e = _application_state[key];
if (e.version < value.version) {
e = std::move(value);
}
update_is_normal();
}
void apply_application_state(application_state key, const versioned_value& value) {
auto&& e = _application_state[key];
if (e.version < value.version) {
e = value;
}
update_is_normal();
}
void apply_application_state(const endpoint_state& es) {
for (auto&& e : es._application_state) {
apply_application_state(e.first, e.second);
}
update_is_normal();
}

@@ -923,7 +923,7 @@ void gossiper::make_random_gossip_digest(std::vector<gossip_digest>& g_digests)
future<> gossiper::replicate(inet_address ep, const endpoint_state& es) {
return container().invoke_on_all([ep, es, orig = engine().cpu_id(), self = shared_from_this()] (gossiper& g) {
if (engine().cpu_id() != orig) {
g.endpoint_state_map[ep].add_application_state(es);
g.endpoint_state_map[ep].apply_application_state(es);
}
});
}
@@ -932,7 +932,7 @@ future<> gossiper::replicate(inet_address ep, const std::map<application_state,
return container().invoke_on_all([ep, &src, &changed, orig = engine().cpu_id(), self = shared_from_this()] (gossiper& g) {
if (engine().cpu_id() != orig) {
for (auto&& key : changed) {
g.endpoint_state_map[ep].add_application_state(key, src.at(key));
g.endpoint_state_map[ep].apply_application_state(key, src.at(key));
}
}
});
@@ -941,7 +941,7 @@ future<> gossiper::replicate(inet_address ep, const std::map<application_state,
future<> gossiper::replicate(inet_address ep, application_state key, const versioned_value& value) {
return container().invoke_on_all([ep, key, &value, orig = engine().cpu_id(), self = shared_from_this()] (gossiper& g) {
if (engine().cpu_id() != orig) {
g.endpoint_state_map[ep].add_application_state(key, value);
g.endpoint_state_map[ep].apply_application_state(key, value);
}
});
}
@@ -1168,13 +1168,11 @@ stdx::optional<endpoint_state> gossiper::get_endpoint_state_for_endpoint(inet_ad
}
}
future<> gossiper::reset_endpoint_state_map() {
void gossiper::reset_endpoint_state_map() {
endpoint_state_map.clear();
_unreachable_endpoints.clear();
_live_endpoints.clear();
_live_endpoints_just_added.clear();
return container().invoke_on_all([] (gossiper& g) {
g.endpoint_state_map.clear();
});
}
std::unordered_map<inet_address, endpoint_state>& gms::gossiper::get_endpoint_states() {
@@ -1657,7 +1655,6 @@ void gossiper::maybe_initialize_local_state(int generation_nbr) {
}
}
// Runs inside seastar::async context
void gossiper::add_saved_endpoint(inet_address ep) {
if (ep == get_broadcast_address()) {
logger.debug("Attempt to add self as saved endpoint");
@@ -1683,7 +1680,6 @@ void gossiper::add_saved_endpoint(inet_address ep) {
}
ep_state.mark_dead();
endpoint_state_map[ep] = ep_state;
replicate(ep, ep_state).get();
_unreachable_endpoints[ep] = now();
logger.trace("Adding saved endpoint {} {}", ep, ep_state.get_heart_beat_state().get_generation());
}
@@ -1919,7 +1915,6 @@ void gossiper::mark_as_shutdown(const inet_address& endpoint) {
auto& ep_state = *es;
ep_state.add_application_state(application_state::STATUS, storage_service_value_factory().shutdown(true));
ep_state.get_heart_beat_state().force_highest_possible_version_unsafe();
replicate(endpoint, ep_state).get();
mark_dead(endpoint, ep_state);
get_local_failure_detector().force_conviction(endpoint);
}

@@ -418,7 +418,7 @@ public:
stdx::optional<endpoint_state> get_endpoint_state_for_endpoint(inet_address ep) const;
// removes ALL endpoint states; should only be called after shadow gossip
future<> reset_endpoint_state_map();
void reset_endpoint_state_map();
std::unordered_map<inet_address, endpoint_state>& get_endpoint_states();

@@ -84,8 +84,6 @@ template<typename Structure, typename CtxFactory>
GCC6_CONCEPT(requires ContextFactory<CtxFactory>)
class lsa_migrate_fn final : public migrate_fn_type, CtxFactory {
public:
using structure = Structure;
explicit lsa_migrate_fn(CtxFactory context_factory)
: migrate_fn_type(1)
, CtxFactory(std::move(context_factory))
@@ -203,21 +201,8 @@ public:
/// arguments are passed to `T::size_when_serialized`.
///
/// \return null pointer of type `uint8_t*`.
template<typename T, typename MigrateFn, typename... Args>
uint8_t* allocate(MigrateFn* migrate_fn, Args&&... args) noexcept {
static_assert(std::is_same_v<typename MigrateFn::structure, T>);
return do_allocate<T>(migrate_fn, std::forward<Args>(args)...);
}
template<typename T, typename MigrateFn, typename... Args>
auto allocate_nested(MigrateFn* migrate_fn, Args&&... args) noexcept {
static_assert(std::is_same_v<typename MigrateFn::structure, T>);
return do_allocate_nested<T>(migrate_fn, std::forward<Args>(args)...);
}
private:
template<typename T, typename... Args>
uint8_t* do_allocate(migrate_fn_type* migrate_fn, Args&&... args) noexcept {
uint8_t* allocate(migrate_fn_type* migrate_fn, Args&& ... args) noexcept {
auto size = T::size_when_serialized(std::forward<Args>(args)...);
_parent.request(size, migrate_fn);
@@ -231,7 +216,7 @@ public:
}
template<typename T, typename... Args>
auto do_allocate_nested(migrate_fn_type* migrate_fn, Args&& ... args) noexcept {
auto allocate_nested(migrate_fn_type* migrate_fn, Args&& ... args) noexcept {
auto n = _parent.request(0, migrate_fn);
return T::get_sizer(continuation(_parent, n),
std::forward<Args>(args)...);
@@ -259,28 +244,15 @@ public:
/// to the buffer requested in the sizing phase. Arguments are passed
/// to `T::serialize`.
/// \return pointer to the IMR object
template<typename T, typename MigrateFn, typename... Args>
uint8_t* allocate(MigrateFn* migrate_fn, Args&&... args) noexcept {
static_assert(std::is_same_v<typename MigrateFn::structure, T>);
return do_allocate<T>(migrate_fn, std::forward<Args>(args)...);
}
template<typename T, typename MigrateFn, typename... Args>
auto allocate_nested(MigrateFn* migrate_fn, Args&&... args) noexcept {
static_assert(std::is_same_v<typename MigrateFn::structure, T>);
return do_allocate_nested<T>(migrate_fn, std::forward<Args>(args)...);
}
private:
template<typename T, typename... Args>
uint8_t* do_allocate(migrate_fn_type* migrate_fn, Args&&... args) noexcept {
uint8_t* allocate(migrate_fn_type* migrate_fn, Args&& ... args) noexcept {
auto ptr = _parent.next_object();
T::serialize(ptr, std::forward<Args>(args)...);
return ptr;
}
template<typename T, typename... Args>
auto do_allocate_nested(migrate_fn_type*, Args&& ... args) noexcept {
auto allocate_nested(migrate_fn_type*, Args&& ... args) noexcept {
auto ptr = _parent.next_object();
return T::get_serializer(ptr,
continuation(ptr),

@@ -61,12 +61,8 @@ private:
public:
object_context(const uint8_t*, State... state) : _state { state... } { }
template<typename Tag, typename... Args>
auto context_for(const uint8_t* ptr, Args&&... args) const noexcept {
if constexpr (std::is_same_v<Tag, basic_object::tags::back_pointer>) {
return no_context_t();
} else {
return create(ptr, std::index_sequence_for<State...>());
}
Context context_for(const uint8_t* ptr, Args&&... args) const noexcept {
return create(ptr, std::index_sequence_for<State...>());
}
};
@@ -162,22 +158,13 @@ public:
}
/// Create an IMR object
template<typename Writer, typename MigrateFn>
GCC6_CONCEPT(requires WriterAllocator<Writer, Structure>)
static object make(Writer&& object_writer,
MigrateFn* migrate = &imr::alloc::default_lsa_migrate_fn<structure>::migrate_fn) {
static_assert(std::is_same_v<typename MigrateFn::structure, structure>);
return do_make(std::forward<Writer>(object_writer), migrate);
}
private:
template<typename Writer>
GCC6_CONCEPT(requires WriterAllocator<Writer, Structure>)
static object do_make(Writer&& object_writer, allocation_strategy::migrate_fn migrate) {
static object make(Writer&& object_writer,
allocation_strategy::migrate_fn migrate = &imr::alloc::default_lsa_migrate_fn<structure>::migrate_fn) {
struct alloc_deleter {
size_t _size;
void operator()(uint8_t* ptr) {
current_allocator().free(ptr, _size);
current_allocator().free(ptr);
}
};
using alloc_unique_ptr = std::unique_ptr<uint8_t[], alloc_deleter>;
@@ -189,7 +176,7 @@ private:
auto& alloc = current_allocator();
alloc::object_allocator allocator(alloc);
auto obj_size = structure::size_when_serialized(writer, allocator.get_sizer());
auto ptr = alloc_unique_ptr(static_cast<uint8_t*>(alloc.alloc(migrate, obj_size, 1)), alloc_deleter { obj_size });
auto ptr = alloc_unique_ptr(static_cast<uint8_t*>(alloc.alloc(migrate, obj_size, 1)));
allocator.allocate_all();
structure::serialize(ptr.get(), writer, allocator.get_serializer());
return object(ptr.release());

@@ -42,5 +42,5 @@ elif [ "$ID" = "fedora" ]; then
yum install -y yaml-cpp-devel thrift-devel antlr3-tool antlr3-C++-devel jsoncpp-devel snappy-devel
elif [ "$ID" = "centos" ]; then
yum install -y yaml-cpp-devel thrift-devel scylla-antlr35-tool scylla-antlr35-C++-devel jsoncpp-devel snappy-devel scylla-boost163-static scylla-python34-pyparsing20 systemd-devel
echo -e "Configure example:\n\tpython3.6 ./configure.py --enable-dpdk --mode=release --static-boost --compiler=/opt/scylladb/bin/g++-7.3 --python python3.6 --ldflag=-Wl,-rpath=/opt/scylladb/lib64 --cflags=-I/opt/scylladb/include --with-antlr3=/opt/scylladb/bin/antlr3"
echo -e "Configure example:\n\tpython3.4 ./configure.py --enable-dpdk --mode=release --static-boost --compiler=/opt/scylladb/bin/g++-7.3 --python python3.4 --ldflag=-Wl,-rpath=/opt/scylladb/lib64 --cflags=-I/opt/scylladb/include --with-antlr3=/opt/scylladb/bin/antlr3"
fi

80
json.cc
View File

@@ -1,80 +0,0 @@
/*
* Copyright (C) 2018 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include "json.hh"
namespace seastar {
namespace json {
static inline bool is_control_char(char c) {
return c >= 0 && c <= 0x1F;
}
static inline bool needs_escaping(const sstring& s) {
return std::any_of(s.begin(), s.end(), [](char c) {return is_control_char(c) || c == '"' || c == '\\';});
}
sstring value_to_quoted_string(const sstring& value) {
if (!needs_escaping(value)) {
return sprint("\"%s\"", value);
}
std::ostringstream oss;
oss << std::hex << std::uppercase << std::setfill('0');
oss.put('"');
for (char c : value) {
switch (c) {
case '"':
oss.put('\\').put('"');
break;
case '\\':
oss.put('\\').put('\\');
break;
case '\b':
oss.put('\\').put('b');
break;
case '\f':
oss.put('\\').put('f');
break;
case '\n':
oss.put('\\').put('n');
break;
case '\r':
oss.put('\\').put('r');
break;
case '\t':
oss.put('\\').put('t');
break;
default:
if (is_control_char(c)) {
oss.put('\\').put('u') << std::setw(4) << static_cast<int>(c);
} else {
oss.put(c);
}
break;
}
}
oss.put('"');
return oss.str();
}
}
}

@@ -95,8 +95,6 @@ inline std::map<sstring, sstring> to_map(const sstring& raw) {
return to_map(raw, std::map<sstring, sstring>());
}
sstring value_to_quoted_string(const sstring& value);
}
}

@@ -113,4 +113,4 @@ int32_t weight(bound_kind k) {
abort();
}
const thread_local clustering_key_prefix bound_view::empty_prefix = clustering_key::make_empty();
const thread_local clustering_key_prefix bound_view::_empty_prefix = clustering_key::make_empty();

@@ -748,10 +748,6 @@ public:
static const compound& get_compound_type(const schema& s) {
return s.clustering_key_prefix_type();
}
static clustering_key_prefix_view make_empty() {
return { bytes_view() };
}
};
class clustering_key_prefix : public prefix_compound_wrapper<clustering_key_prefix, clustering_key_prefix_view, clustering_key> {

@@ -119,17 +119,9 @@ insert_token_range_to_sorted_container_while_unwrapping(
const dht::token& tok,
dht::token_range_vector& ret) {
if (prev_tok < tok) {
auto pos = ret.end();
if (!ret.empty() && !std::prev(pos)->end()) {
// We inserted a wrapped range (a, b] previously as
// (-inf, b], (a, +inf). So now we insert in the next-to-last
// position to keep the last range (a, +inf) at the end.
pos = std::prev(pos);
}
ret.insert(pos,
dht::token_range{
dht::token_range::bound(prev_tok, false),
dht::token_range::bound(tok, true)});
ret.emplace_back(
dht::token_range::bound(prev_tok, false),
dht::token_range::bound(tok, true));
} else {
ret.emplace_back(
dht::token_range::bound(prev_tok, false),

main.cc

@@ -389,7 +389,13 @@ int main(int ac, char** av) {
sstring broadcast_address = cfg->broadcast_address();
sstring broadcast_rpc_address = cfg->broadcast_rpc_address();
stdx::optional<std::vector<sstring>> hinted_handoff_enabled = cfg->experimental() ? parse_hinted_handoff_enabled(cfg->hinted_handoff_enabled()) : stdx::nullopt;
auto prom_addr = seastar::net::dns::get_host_by_name(cfg->prometheus_address()).get0();
auto prom_addr = [&] {
try {
return seastar::net::dns::get_host_by_name(cfg->prometheus_address()).get0();
} catch (...) {
std::throw_with_nested(std::runtime_error(fmt::format("Unable to resolve prometheus_address {}", cfg->prometheus_address())));
}
}();
supervisor::notify("starting prometheus API server");
uint16_t pport = cfg->prometheus_port();
if (pport) {
@@ -467,7 +473,13 @@ int main(int ac, char** av) {
// #293 - do not stop anything
// engine().at_exit([] { return i_endpoint_snitch::stop_snitch(); });
supervisor::notify("determining DNS name");
auto e = seastar::net::dns::get_host_by_name(api_address).get0();
auto e = [&] {
try {
return seastar::net::dns::get_host_by_name(api_address).get0();
} catch (...) {
std::throw_with_nested(std::runtime_error(fmt::format("Unable to resolve api_address {}", api_address)));
}
}();
supervisor::notify("starting API server");
auto ip = e.addr_list.front();
ctx.http_server.start("API").get();
@@ -490,6 +502,7 @@ int main(int ac, char** av) {
}
};
dbcfg.compaction_scheduling_group = make_sched_group("compaction", 1000);
dbcfg.memory_compaction_scheduling_group = make_sched_group("mem_compaction", 1000);
dbcfg.streaming_scheduling_group = make_sched_group("streaming", 200);
dbcfg.statement_scheduling_group = make_sched_group("statement", 1000);
dbcfg.memtable_scheduling_group = make_sched_group("memtable", 1000);
@@ -763,11 +776,8 @@ int main(int ac, char** av) {
return service::get_local_storage_service().drain_on_shutdown();
});
engine().at_exit([cfg] {
if (cfg->view_building()) {
return view_builder.stop();
}
return make_ready_future<>();
engine().at_exit([] {
return view_builder.stop();
});
engine().at_exit([&db] {

@@ -27,11 +27,11 @@
#include "schema_upgrader.hh"
#include "partition_builder.hh"
memtable::memtable(schema_ptr schema, dirty_memory_manager& dmm, memtable_list* memtable_list)
memtable::memtable(schema_ptr schema, dirty_memory_manager& dmm, memtable_list* memtable_list,
seastar::scheduling_group compaction_scheduling_group)
: logalloc::region(dmm.region_group())
, _dirty_mgr(dmm)
, _memtable_cleaner(*this, no_cache_tracker)
, _cleaner(&_memtable_cleaner)
, _cleaner(*this, no_cache_tracker, compaction_scheduling_group)
, _memtable_list(memtable_list)
, _schema(std::move(schema))
, partitions(memtable_entry::compare(_schema)) {
@@ -56,10 +56,9 @@ void memtable::clear() noexcept {
auto dirty_before = dirty_size();
with_allocator(allocator(), [this] {
partitions.clear_and_dispose([this] (memtable_entry* e) {
e->partition().evict(_memtable_cleaner);
e->partition().evict(_cleaner);
current_deleter<memtable_entry>()(e);
});
_memtable_cleaner.clear();
});
remove_flushed_memory(dirty_before - dirty_size());
}
@@ -322,7 +321,7 @@ public:
_delegate = delegate_reader(*_delegate_range, _slice, _pc, streamed_mutation::forwarding::no, _fwd_mr);
} else {
auto key_and_snp = read_section()(region(), [&] {
return with_linearized_managed_bytes([&] () -> std::optional<std::pair<dht::decorated_key, lw_shared_ptr<partition_snapshot>>> {
return with_linearized_managed_bytes([&] () -> std::optional<std::pair<dht::decorated_key, partition_snapshot_ptr>> {
memtable_entry *e = fetch_entry();
if (!e) {
return { };
@@ -484,7 +483,7 @@ private:
void get_next_partition() {
uint64_t component_size = 0;
auto key_and_snp = read_section()(region(), [&] {
return with_linearized_managed_bytes([&] () -> std::optional<std::pair<dht::decorated_key, lw_shared_ptr<partition_snapshot>>> {
return with_linearized_managed_bytes([&] () -> std::optional<std::pair<dht::decorated_key, partition_snapshot_ptr>> {
memtable_entry* e = fetch_entry();
if (e) {
auto dk = e->key();
@@ -550,7 +549,7 @@ public:
}
};
lw_shared_ptr<partition_snapshot> memtable_entry::snapshot(memtable& mtbl) {
partition_snapshot_ptr memtable_entry::snapshot(memtable& mtbl) {
return _pe.read(mtbl.region(), mtbl.cleaner(), _schema, no_cache_tracker);
}
@@ -564,7 +563,7 @@ memtable::make_flat_reader(schema_ptr s,
mutation_reader::forwarding fwd_mr) {
if (query::is_single_partition(range)) {
const query::ring_position& pos = range.start()->value();
auto snp = _read_section(*this, [&] () -> lw_shared_ptr<partition_snapshot> {
auto snp = _read_section(*this, [&] () -> partition_snapshot_ptr {
managed_bytes::linearization_context_guard lcg;
auto i = partitions.find(pos, memtable_entry::compare(_schema));
if (i != partitions.end()) {

@@ -66,7 +66,7 @@ public:
partition_entry& partition() { return _pe; }
const schema_ptr& schema() const { return _schema; }
schema_ptr& schema() { return _schema; }
lw_shared_ptr<partition_snapshot> snapshot(memtable& mtbl);
partition_snapshot_ptr snapshot(memtable& mtbl);
size_t external_memory_usage_without_rows() const {
return _key.key().external_memory_usage();
@@ -125,8 +125,7 @@ public:
bi::compare<memtable_entry::compare>>;
private:
dirty_memory_manager& _dirty_mgr;
mutation_cleaner _memtable_cleaner;
mutation_cleaner* _cleaner; // will switch to cache's cleaner after memtable is moved to cache.
mutation_cleaner _cleaner;
memtable_list *_memtable_list;
schema_ptr _schema;
logalloc::allocating_section _read_section;
@@ -254,7 +253,8 @@ private:
void clear() noexcept;
uint64_t dirty_size() const;
public:
explicit memtable(schema_ptr schema, dirty_memory_manager&, memtable_list *memtable_list = nullptr);
explicit memtable(schema_ptr schema, dirty_memory_manager&, memtable_list *memtable_list = nullptr,
seastar::scheduling_group compaction_scheduling_group = seastar::current_scheduling_group());
// Used for testing that want to control the flush process.
explicit memtable(schema_ptr schema);
~memtable();
@@ -294,7 +294,7 @@ public:
}
mutation_cleaner& cleaner() {
return *_cleaner;
return _cleaner;
}
public:
memtable_list* get_memtable_list() {

@@ -262,12 +262,11 @@ void messaging_service::start_listen() {
// FIXME: we don't set so.tcp_nodelay, because we can't tell at this point whether the connection will come from a
// local or remote datacenter, and whether or not the connection will be used for gossip. We can fix
// the first by wrapping its server_socket, but not the second.
auto limits = rpc_resource_limits(_mcfg.rpc_memory_limit);
if (!_server[0]) {
auto listen = [&] (const gms::inet_address& a) {
auto addr = ipv4_addr{a.raw_addr(), _port};
return std::unique_ptr<rpc_protocol_server_wrapper>(new rpc_protocol_server_wrapper(*_rpc,
so, addr, limits));
so, addr, rpc_resource_limits(_mcfg.rpc_memory_limit)));
};
_server[0] = listen(_listen_address);
if (listen_to_bc) {
@@ -278,7 +277,7 @@ void messaging_service::start_listen() {
if (!_server_tls[0]) {
auto listen = [&] (const gms::inet_address& a) {
return std::unique_ptr<rpc_protocol_server_wrapper>(
[this, &so, &a, limits] () -> std::unique_ptr<rpc_protocol_server_wrapper>{
[this, &so, &a] () -> std::unique_ptr<rpc_protocol_server_wrapper>{
if (_encrypt_what == encrypt_what::none) {
return nullptr;
}
@@ -286,7 +285,7 @@ void messaging_service::start_listen() {
lo.reuse_address = true;
auto addr = make_ipv4_address(ipv4_addr{a.raw_addr(), _ssl_port});
return std::make_unique<rpc_protocol_server_wrapper>(*_rpc,
so, seastar::tls::listen(_credentials, addr, lo), limits);
so, seastar::tls::listen(_credentials, addr, lo));
}());
};
_server_tls[0] = listen(_listen_address);

multishard_writer.cc (new file)

@@ -0,0 +1,226 @@
/*
* Copyright (C) 2018 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include "multishard_writer.hh"
#include "mutation_reader.hh"
#include "mutation_fragment.hh"
#include "schema_registry.hh"
#include <vector>
#include <seastar/core/future-util.hh>
#include <seastar/core/queue.hh>
class queue_reader final : public flat_mutation_reader::impl {
seastar::queue<mutation_fragment_opt>& _mq;
public:
queue_reader(schema_ptr s, seastar::queue<mutation_fragment_opt>& mq)
: impl(std::move(s))
, _mq(mq) {
}
virtual future<> fill_buffer(db::timeout_clock::time_point) override {
return do_until([this] { return is_end_of_stream() || is_buffer_full(); }, [this] {
return _mq.pop_eventually().then([this] (mutation_fragment_opt mopt) {
if (!mopt) {
_end_of_stream = true;
} else {
push_mutation_fragment(std::move(*mopt));
}
});
});
}
virtual void next_partition() override {
throw std::bad_function_call();
}
virtual future<> fast_forward_to(const dht::partition_range&, db::timeout_clock::time_point) override {
throw std::bad_function_call();
}
virtual future<> fast_forward_to(position_range, db::timeout_clock::time_point) override {
throw std::bad_function_call();
}
};
class shard_writer {
private:
schema_ptr _s;
flat_mutation_reader _reader;
std::function<future<> (flat_mutation_reader reader)> _consumer;
public:
shard_writer(schema_ptr s,
flat_mutation_reader reader,
std::function<future<> (flat_mutation_reader reader)> consumer);
future<> consume();
};
// The multishard_writer class pulls mutation_fragments from a
// flat_mutation_reader and consumes them with
// multishard_writer::_consumer. If a mutation_fragment does not belong
// to the shard the multishard_writer runs on, it is forwarded to the
// correct shard. The future returned by multishard_writer::operator()()
// becomes ready once all the mutation_fragments have been consumed.
class multishard_writer {
private:
schema_ptr _s;
dht::i_partitioner& _partitioner;
std::vector<foreign_ptr<std::unique_ptr<shard_writer>>> _shard_writers;
std::vector<future<>> _pending_consumers;
std::vector<seastar::queue<mutation_fragment_opt>> _queues;
unsigned _current_shard = -1;
uint64_t _consumed_partitions = 0;
flat_mutation_reader _producer;
std::function<future<> (flat_mutation_reader)> _consumer;
private:
unsigned shard_for_mf(const mutation_fragment& mf) {
return _partitioner.shard_of(mf.as_partition_start().key().token());
}
future<> make_shard_writer(unsigned shard);
future<stop_iteration> handle_mutation_fragment(mutation_fragment mf);
future<stop_iteration> handle_end_of_stream();
future<> consume(unsigned shard);
future<> wait_pending_consumers();
future<> distribute_mutation_fragments();
public:
multishard_writer(
schema_ptr s,
dht::i_partitioner& partitioner,
flat_mutation_reader producer,
std::function<future<> (flat_mutation_reader)> consumer);
future<uint64_t> operator()();
};
shard_writer::shard_writer(schema_ptr s,
flat_mutation_reader reader,
std::function<future<> (flat_mutation_reader reader)> consumer)
: _s(s)
, _reader(std::move(reader))
, _consumer(std::move(consumer)) {
}
future<> shard_writer::consume() {
return _reader.peek().then([this] (mutation_fragment* mf_ptr) {
if (mf_ptr) {
return _consumer(std::move(_reader));
}
return make_ready_future<>();
});
}
multishard_writer::multishard_writer(
schema_ptr s,
dht::i_partitioner& partitioner,
flat_mutation_reader producer,
std::function<future<> (flat_mutation_reader)> consumer)
: _s(std::move(s))
, _partitioner(partitioner)
, _producer(std::move(producer))
, _consumer(std::move(consumer)) {
_shard_writers.resize(_partitioner.shard_count());
_queues.reserve(_partitioner.shard_count());
for (unsigned shard = 0; shard < _partitioner.shard_count(); shard++) {
_queues.push_back(seastar::queue<mutation_fragment_opt>{2});
}
}
future<> multishard_writer::make_shard_writer(unsigned shard) {
auto this_shard_reader = make_foreign(std::make_unique<flat_mutation_reader>(make_flat_mutation_reader<queue_reader>(_s, _queues[shard])));
return smp::submit_to(shard, [gs = global_schema_ptr(_s),
consumer = _consumer,
reader = std::move(this_shard_reader)] () mutable {
auto this_shard_reader = make_foreign_reader(gs.get(), std::move(reader));
return make_foreign(std::make_unique<shard_writer>(gs.get(), std::move(this_shard_reader), consumer));
}).then([this, shard] (foreign_ptr<std::unique_ptr<shard_writer>> writer) {
_shard_writers[shard] = std::move(writer);
_pending_consumers.push_back(consume(shard));
});
}
future<stop_iteration> multishard_writer::handle_mutation_fragment(mutation_fragment mf) {
auto f = make_ready_future<>();
if (mf.is_partition_start()) {
_consumed_partitions++;
if (unsigned shard = shard_for_mf(mf); shard != _current_shard) {
_current_shard = shard;
if (!bool(_shard_writers[shard])) {
f = make_shard_writer(shard);
}
}
}
return f.then([this, mf = std::move(mf)] () mutable {
assert(_current_shard != -1u);
return _queues[_current_shard].push_eventually(mutation_fragment_opt(std::move(mf)));
}).then([] {
return stop_iteration::no;
});
}
future<stop_iteration> multishard_writer::handle_end_of_stream() {
return parallel_for_each(boost::irange(0u, _partitioner.shard_count()), [this] (unsigned shard) {
if (bool(_shard_writers[shard])) {
return _queues[shard].push_eventually(mutation_fragment_opt());
} else {
return make_ready_future<>();
}
}).then([] {
return stop_iteration::yes;
});
}
future<> multishard_writer::consume(unsigned shard) {
return smp::submit_to(shard, [writer = _shard_writers[shard].get()] () mutable {
return writer->consume();
}).handle_exception([this] (std::exception_ptr ep) {
for (auto& q : _queues) {
q.abort(ep);
}
return make_exception_future<>(std::move(ep));
});
}
future<> multishard_writer::wait_pending_consumers() {
return seastar::when_all_succeed(_pending_consumers.begin(), _pending_consumers.end());
}
future<> multishard_writer::distribute_mutation_fragments() {
return repeat([this] () mutable {
return _producer().then([this] (mutation_fragment_opt mf_opt) mutable {
if (mf_opt) {
return handle_mutation_fragment(std::move(*mf_opt));
} else {
return handle_end_of_stream();
}
});
});
}
future<uint64_t> multishard_writer::operator()() {
return distribute_mutation_fragments().finally([this] {
return wait_pending_consumers();
}).then([this] {
return _consumed_partitions;
});
}
future<uint64_t> distribute_reader_and_consume_on_shards(schema_ptr s,
dht::i_partitioner& partitioner,
flat_mutation_reader producer,
std::function<future<> (flat_mutation_reader)> consumer) {
return do_with(multishard_writer(std::move(s), partitioner, std::move(producer), std::move(consumer)), [] (multishard_writer& writer) {
return writer();
});
}

multishard_writer.hh (new file)

@@ -0,0 +1,35 @@
/*
* Copyright (C) 2018 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "schema.hh"
#include "flat_mutation_reader.hh"
#include "dht/i_partitioner.hh"
// Helper that uses multishard_writer to route the mutation_fragments
// produced by the producer to the correct shard and consume them with
// the consumer. Returns the number of partitions consumed.
future<uint64_t> distribute_reader_and_consume_on_shards(schema_ptr s,
dht::i_partitioner& partitioner,
flat_mutation_reader producer,
std::function<future<> (flat_mutation_reader)> consumer);

@@ -26,6 +26,76 @@
#include "utils/logalloc.hh"
class mutation_cleaner_impl final {
using snapshot_list = boost::intrusive::slist<partition_snapshot,
boost::intrusive::member_hook<partition_snapshot, boost::intrusive::slist_member_hook<>, &partition_snapshot::_cleaner_hook>>;
struct worker {
condition_variable cv;
snapshot_list snapshots;
logalloc::allocating_section alloc_section;
bool done = false; // true means the worker was abandoned and cannot access the mutation_cleaner_impl instance.
};
private:
logalloc::region& _region;
cache_tracker* _tracker;
partition_version_list _versions;
lw_shared_ptr<worker> _worker_state;
seastar::scheduling_group _scheduling_group;
private:
stop_iteration merge_some(partition_snapshot& snp) noexcept;
stop_iteration merge_some() noexcept;
void start_worker();
public:
mutation_cleaner_impl(logalloc::region& r, cache_tracker* t, seastar::scheduling_group sg = seastar::current_scheduling_group())
: _region(r)
, _tracker(t)
, _worker_state(make_lw_shared<worker>())
, _scheduling_group(sg)
{
start_worker();
}
~mutation_cleaner_impl();
stop_iteration clear_gently() noexcept;
memory::reclaiming_result clear_some() noexcept;
void clear() noexcept;
void destroy_later(partition_version& v) noexcept;
void destroy_gently(partition_version& v) noexcept;
void merge(mutation_cleaner_impl& other) noexcept;
bool empty() const noexcept { return _versions.empty(); }
future<> drain();
void merge_and_destroy(partition_snapshot&) noexcept;
void set_scheduling_group(seastar::scheduling_group sg) {
_scheduling_group = sg;
_worker_state->cv.broadcast();
}
};
inline
void mutation_cleaner_impl::destroy_later(partition_version& v) noexcept {
_versions.push_back(v);
}
inline
void mutation_cleaner_impl::destroy_gently(partition_version& v) noexcept {
if (v.clear_gently(_tracker) == stop_iteration::no) {
destroy_later(v);
} else {
current_allocator().destroy(&v);
}
}
inline
void mutation_cleaner_impl::merge_and_destroy(partition_snapshot& ps) noexcept {
if (ps.slide_to_oldest() == stop_iteration::yes || merge_some(ps) == stop_iteration::yes) {
lw_shared_ptr<partition_snapshot>::dispose(&ps);
} else {
// The snapshot must not be reachable by partition_entry::read() after this,
// which is ensured by slide_to_oldest() == stop_iteration::no.
_worker_state->snapshots.push_front(ps);
_worker_state->cv.signal();
}
}
// Container for garbage partition_version objects, used for freeing them incrementally.
//
// Mutation cleaner extends the lifetime of mutation_partition without doing
@@ -36,57 +106,71 @@
// mutation_cleaner should not be thread local objects (or members of thread
// local objects).
class mutation_cleaner final {
logalloc::region& _region;
cache_tracker* _tracker;
partition_version_list _versions;
lw_shared_ptr<mutation_cleaner_impl> _impl;
public:
mutation_cleaner(logalloc::region& r, cache_tracker* t) : _region(r), _tracker(t) {}
~mutation_cleaner();
mutation_cleaner(logalloc::region& r, cache_tracker* t, seastar::scheduling_group sg = seastar::current_scheduling_group())
: _impl(make_lw_shared<mutation_cleaner_impl>(r, t, sg)) {
}
void set_scheduling_group(seastar::scheduling_group sg) {
_impl->set_scheduling_group(sg);
}
// Frees some of the data. Returns stop_iteration::yes iff all was freed.
// Must be invoked under owning allocator.
stop_iteration clear_gently() noexcept;
stop_iteration clear_gently() noexcept {
return _impl->clear_gently();
}
// Must be invoked under owning allocator.
memory::reclaiming_result clear_some() noexcept;
memory::reclaiming_result clear_some() noexcept {
return _impl->clear_some();
}
// Must be invoked under owning allocator.
void clear() noexcept;
void clear() noexcept {
_impl->clear();
}
// Enqueues v for destruction.
// The object must not be part of any list, and must not be accessed externally any more.
// In particular, it must not be attached, even indirectly, to any snapshot or partition_entry,
// and must not be evicted from.
// Must be invoked under owning allocator.
void destroy_later(partition_version& v) noexcept;
void destroy_later(partition_version& v) noexcept {
return _impl->destroy_later(v);
}
// Destroys v now or later.
// Same requirements as destroy_later().
// Must be invoked under owning allocator.
void destroy_gently(partition_version& v) noexcept;
void destroy_gently(partition_version& v) noexcept {
return _impl->destroy_gently(v);
}
// Transfers objects from other to this.
// This and other must belong to the same logalloc::region, and the same cache_tracker.
// After the call bool(other) is false.
void merge(mutation_cleaner& other) noexcept;
// After the call other will refer to this cleaner.
void merge(mutation_cleaner& other) noexcept {
_impl->merge(*other._impl);
other._impl = _impl;
}
// Returns true iff this cleaner contains no unfreed objects
bool empty() const noexcept { return _versions.empty(); }
bool empty() const noexcept {
return _impl->empty();
}
// Forces cleaning and returns a future which resolves when there is nothing to clean.
future<> drain();
};
inline
void mutation_cleaner::destroy_later(partition_version& v) noexcept {
_versions.push_back(v);
}
inline
void mutation_cleaner::destroy_gently(partition_version& v) noexcept {
if (v.clear_gently(_tracker) == stop_iteration::no) {
destroy_later(v);
} else {
current_allocator().destroy(&v);
future<> drain() {
return _impl->drain();
}
}
// Merges the given snapshot using partition_snapshot::merge_partition_versions() and then destroys it
// using destroy_from_this(), possibly deferring in between.
// This instance becomes the sole owner of the partition_snapshot object; the caller must not destroy
// nor access it after calling this.
void merge_and_destroy(partition_snapshot& ps) {
return _impl->merge_and_destroy(ps);
}
};


@@ -125,7 +125,7 @@ public:
return _ck.equal(s, other._ck)
&& _t == other._t
&& _marker == other._marker
&& _cells.equal(column_kind::regular_column, s, other._cells, s);
&& _cells.equal(column_kind::static_column, s, other._cells, s);
}
friend std::ostream& operator<<(std::ostream& os, const clustering_row& row);


@@ -144,14 +144,7 @@ mutation_partition::mutation_partition(const schema& s, const mutation_partition
, _static_row(s, column_kind::static_column, x._static_row)
, _static_row_continuous(x._static_row_continuous)
, _rows()
, _row_tombstones(x._row_tombstones)
#ifdef SEASTAR_DEBUG
, _schema_version(s.version())
#endif
{
#ifdef SEASTAR_DEBUG
assert(x._schema_version == _schema_version);
#endif
, _row_tombstones(x._row_tombstones) {
auto cloner = [&s] (const auto& x) {
return current_allocator().construct<rows_entry>(s, x);
};
@@ -164,14 +157,7 @@ mutation_partition::mutation_partition(const mutation_partition& x, const schema
, _static_row(schema, column_kind::static_column, x._static_row)
, _static_row_continuous(x._static_row_continuous)
, _rows()
, _row_tombstones(x._row_tombstones, range_tombstone_list::copy_comparator_only())
#ifdef SEASTAR_DEBUG
, _schema_version(schema.version())
#endif
{
#ifdef SEASTAR_DEBUG
assert(x._schema_version == _schema_version);
#endif
, _row_tombstones(x._row_tombstones, range_tombstone_list::copy_comparator_only()) {
try {
for(auto&& r : ck_ranges) {
for (const rows_entry& e : x.range(schema, r)) {
@@ -194,13 +180,7 @@ mutation_partition::mutation_partition(mutation_partition&& x, const schema& sch
, _static_row_continuous(x._static_row_continuous)
, _rows(std::move(x._rows))
, _row_tombstones(std::move(x._row_tombstones))
#ifdef SEASTAR_DEBUG
, _schema_version(schema.version())
#endif
{
#ifdef SEASTAR_DEBUG
assert(x._schema_version == _schema_version);
#endif
{
auto deleter = current_deleter<rows_entry>();
auto it = _rows.begin();
@@ -240,7 +220,6 @@ mutation_partition::operator=(mutation_partition&& x) noexcept {
}
void mutation_partition::ensure_last_dummy(const schema& s) {
check_schema(s);
if (_rows.empty() || !_rows.rbegin()->is_last_dummy()) {
_rows.insert_before(_rows.end(),
*current_allocator().construct<rows_entry>(s, rows_entry::last_dummy_tag(), is_continuous::yes));
@@ -297,21 +276,19 @@ void deletable_row::apply(const schema& s, clustering_row cr) {
void
mutation_partition::apply(const schema& s, const mutation_fragment& mf) {
check_schema(s);
mutation_fragment_applier applier{s, *this};
mf.visit(applier);
}
void mutation_partition::apply_monotonically(const schema& s, mutation_partition&& p, cache_tracker* tracker) {
#ifdef SEASTAR_DEBUG
assert(s.version() == _schema_version);
assert(p._schema_version == _schema_version);
#endif
stop_iteration mutation_partition::apply_monotonically(const schema& s, mutation_partition&& p, cache_tracker* tracker, is_preemptible preemptible) {
_tombstone.apply(p._tombstone);
_row_tombstones.apply_monotonically(s, std::move(p._row_tombstones));
_static_row.apply_monotonically(s, column_kind::static_column, std::move(p._static_row));
_static_row_continuous |= p._static_row_continuous;
if (_row_tombstones.apply_monotonically(s, std::move(p._row_tombstones), preemptible) == stop_iteration::no) {
return stop_iteration::no;
}
rows_entry::compare less(s);
auto del = current_deleter<rows_entry>();
auto p_i = p._rows.begin();
@@ -343,22 +320,34 @@ void mutation_partition::apply_monotonically(const schema& s, mutation_partition
// Newer evictable versions store complete rows
i->_row = std::move(src_e._row);
} else {
memory::on_alloc_point();
i->_row.apply_monotonically(s, std::move(src_e._row));
}
i->set_continuous(continuous);
i->set_dummy(dummy);
p_i = p._rows.erase_and_dispose(p_i, del);
}
if (preemptible && need_preempt() && p_i != p._rows.end()) {
// We cannot leave p with the clustering range up to p_i->position()
// marked as continuous because some of its sub-ranges may have originally been discontinuous.
// This would result in the sum of this and p having broader continuity after preemption,
// also possibly violating the invariant of non-overlapping continuity between MVCC versions,
// if that's what we're merging here.
// It's always safe to mark the range as discontinuous.
p_i->set_continuous(false);
return stop_iteration::no;
}
}
return stop_iteration::yes;
}
void mutation_partition::apply_monotonically(const schema& s, mutation_partition&& p, const schema& p_schema) {
stop_iteration mutation_partition::apply_monotonically(const schema& s, mutation_partition&& p, const schema& p_schema, is_preemptible preemptible) {
if (s.version() == p_schema.version()) {
apply_monotonically(s, std::move(p), no_cache_tracker);
return apply_monotonically(s, std::move(p), no_cache_tracker, preemptible);
} else {
mutation_partition p2(s, p);
p2.upgrade(p_schema, s);
apply_monotonically(s, std::move(p2), no_cache_tracker);
return apply_monotonically(s, std::move(p2), no_cache_tracker, is_preemptible::no); // FIXME: make preemptible
}
}
@@ -382,7 +371,6 @@ void mutation_partition::apply_weak(const schema& s, mutation_partition&& p) {
tombstone
mutation_partition::range_tombstone_for_row(const schema& schema, const clustering_key& key) const {
check_schema(schema);
tombstone t = _tombstone;
if (!_row_tombstones.empty()) {
auto found = _row_tombstones.search_tombstone_covering(schema, key);
@@ -393,7 +381,6 @@ mutation_partition::range_tombstone_for_row(const schema& schema, const clusteri
row_tombstone
mutation_partition::tombstone_for_row(const schema& schema, const clustering_key& key) const {
check_schema(schema);
row_tombstone t = row_tombstone(range_tombstone_for_row(schema, key));
auto j = _rows.find(key, rows_entry::compare(schema));
@@ -406,7 +393,6 @@ mutation_partition::tombstone_for_row(const schema& schema, const clustering_key
row_tombstone
mutation_partition::tombstone_for_row(const schema& schema, const rows_entry& e) const {
check_schema(schema);
row_tombstone t = e.row().deleted_at();
t.apply(range_tombstone_for_row(schema, e.key()));
return t;
@@ -414,7 +400,6 @@ mutation_partition::tombstone_for_row(const schema& schema, const rows_entry& e)
void
mutation_partition::apply_row_tombstone(const schema& schema, clustering_key_prefix prefix, tombstone t) {
check_schema(schema);
assert(!prefix.is_full(schema));
auto start = prefix;
_row_tombstones.apply(schema, {std::move(start), std::move(prefix), std::move(t)});
@@ -422,13 +407,11 @@ mutation_partition::apply_row_tombstone(const schema& schema, clustering_key_pre
void
mutation_partition::apply_row_tombstone(const schema& schema, range_tombstone rt) {
check_schema(schema);
_row_tombstones.apply(schema, std::move(rt));
}
void
mutation_partition::apply_delete(const schema& schema, const clustering_key_prefix& prefix, tombstone t) {
check_schema(schema);
if (prefix.is_empty(schema)) {
apply(t);
} else if (prefix.is_full(schema)) {
@@ -440,7 +423,6 @@ mutation_partition::apply_delete(const schema& schema, const clustering_key_pref
void
mutation_partition::apply_delete(const schema& schema, range_tombstone rt) {
check_schema(schema);
if (range_tombstone::is_single_clustering_row_tombstone(schema, rt.start, rt.start_kind, rt.end, rt.end_kind)) {
apply_delete(schema, std::move(rt.start), std::move(rt.tomb));
return;
@@ -450,7 +432,6 @@ mutation_partition::apply_delete(const schema& schema, range_tombstone rt) {
void
mutation_partition::apply_delete(const schema& schema, clustering_key&& prefix, tombstone t) {
check_schema(schema);
if (prefix.is_empty(schema)) {
apply(t);
} else if (prefix.is_full(schema)) {
@@ -462,7 +443,6 @@ mutation_partition::apply_delete(const schema& schema, clustering_key&& prefix,
void
mutation_partition::apply_delete(const schema& schema, clustering_key_prefix_view prefix, tombstone t) {
check_schema(schema);
if (prefix.is_empty(schema)) {
apply(t);
} else if (prefix.is_full(schema)) {
@@ -486,14 +466,12 @@ void mutation_partition::insert_row(const schema& s, const clustering_key& key,
}
void mutation_partition::insert_row(const schema& s, const clustering_key& key, const deletable_row& row) {
check_schema(s);
auto e = current_allocator().construct<rows_entry>(s, key, row);
_rows.insert(_rows.end(), *e, rows_entry::compare(s));
}
const row*
mutation_partition::find_row(const schema& s, const clustering_key& key) const {
check_schema(s);
auto i = _rows.find(key, rows_entry::compare(s));
if (i == _rows.end()) {
return nullptr;
@@ -503,7 +481,6 @@ mutation_partition::find_row(const schema& s, const clustering_key& key) const {
deletable_row&
mutation_partition::clustered_row(const schema& s, clustering_key&& key) {
check_schema(s);
auto i = _rows.find(key, rows_entry::compare(s));
if (i == _rows.end()) {
auto e = current_allocator().construct<rows_entry>(std::move(key));
@@ -515,7 +492,6 @@ mutation_partition::clustered_row(const schema& s, clustering_key&& key) {
deletable_row&
mutation_partition::clustered_row(const schema& s, const clustering_key& key) {
check_schema(s);
auto i = _rows.find(key, rows_entry::compare(s));
if (i == _rows.end()) {
auto e = current_allocator().construct<rows_entry>(key);
@@ -527,7 +503,6 @@ mutation_partition::clustered_row(const schema& s, const clustering_key& key) {
deletable_row&
mutation_partition::clustered_row(const schema& s, clustering_key_view key) {
check_schema(s);
auto i = _rows.find(key, rows_entry::compare(s));
if (i == _rows.end()) {
auto e = current_allocator().construct<rows_entry>(key);
@@ -539,7 +514,6 @@ mutation_partition::clustered_row(const schema& s, clustering_key_view key) {
deletable_row&
mutation_partition::clustered_row(const schema& s, position_in_partition_view pos, is_dummy dummy, is_continuous continuous) {
check_schema(s);
auto i = _rows.find(pos, rows_entry::compare(s));
if (i == _rows.end()) {
auto e = current_allocator().construct<rows_entry>(s, pos, dummy, continuous);
@@ -551,7 +525,6 @@ mutation_partition::clustered_row(const schema& s, position_in_partition_view po
mutation_partition::rows_type::const_iterator
mutation_partition::lower_bound(const schema& schema, const query::clustering_range& r) const {
check_schema(schema);
if (!r.start()) {
return std::cbegin(_rows);
}
@@ -560,7 +533,6 @@ mutation_partition::lower_bound(const schema& schema, const query::clustering_ra
mutation_partition::rows_type::const_iterator
mutation_partition::upper_bound(const schema& schema, const query::clustering_range& r) const {
check_schema(schema);
if (!r.end()) {
return std::cend(_rows);
}
@@ -569,7 +541,6 @@ mutation_partition::upper_bound(const schema& schema, const query::clustering_ra
boost::iterator_range<mutation_partition::rows_type::const_iterator>
mutation_partition::range(const schema& schema, const query::clustering_range& r) const {
check_schema(schema);
return boost::make_iterator_range(lower_bound(schema, r), upper_bound(schema, r));
}
@@ -606,7 +577,6 @@ mutation_partition::upper_bound(const schema& schema, const query::clustering_ra
template<typename Func>
void mutation_partition::for_each_row(const schema& schema, const query::clustering_range& row_range, bool reversed, Func&& func) const
{
check_schema(schema);
auto r = range(schema, row_range);
if (!reversed) {
for (const auto& e : r) {
@@ -823,7 +793,6 @@ bool has_any_live_data(const schema& s, column_kind kind, const row& cells, tomb
void
mutation_partition::query_compacted(query::result::partition_writer& pw, const schema& s, uint32_t limit) const {
check_schema(s);
const query::partition_slice& slice = pw.slice();
max_timestamp max_ts{pw.last_modified()};
@@ -1042,10 +1011,6 @@ bool mutation_partition::equal(const schema& s, const mutation_partition& p) con
}
bool mutation_partition::equal(const schema& this_schema, const mutation_partition& p, const schema& p_schema) const {
#ifdef SEASTAR_DEBUG
assert(_schema_version == this_schema.version());
assert(p._schema_version == p_schema.version());
#endif
if (_tombstone != p._tombstone) {
return false;
}
@@ -1145,7 +1110,7 @@ row::apply_monotonically(const column_definition& column, atomic_cell_or_collect
if (_type == storage_type::vector && id < max_vector_size) {
if (id >= _storage.vector.v.size()) {
_storage.vector.v.resize(id);
_storage.vector.v.emplace_back(std::move(value), std::move(hash));
_storage.vector.v.emplace_back(cell_and_hash{std::move(value), std::move(hash)});
_storage.vector.present.set(id);
_size++;
} else if (auto& cell_and_hash = _storage.vector.v[id]; !bool(cell_and_hash.cell)) {
@@ -1174,7 +1139,6 @@ row::apply_monotonically(const column_definition& column, atomic_cell_or_collect
void
row::append_cell(column_id id, atomic_cell_or_collection value) {
if (_type == storage_type::vector && id < max_vector_size) {
assert(_storage.vector.v.size() <= id);
_storage.vector.v.resize(id);
_storage.vector.v.emplace_back(cell_and_hash{std::move(value), cell_hash_opt()});
_storage.vector.present.set(id);
@@ -1213,7 +1177,7 @@ row::find_cell(column_id id) const {
size_t row::external_memory_usage(const schema& s, column_kind kind) const {
size_t mem = 0;
if (_type == storage_type::vector) {
mem += _storage.vector.v.used_space_external_memory_usage();
mem += _storage.vector.v.external_memory_usage();
column_id id = 0;
for (auto&& c_a_h : _storage.vector.v) {
auto& cdef = s.column_at(kind, id++);
@@ -1239,7 +1203,6 @@ size_t rows_entry::memory_usage(const schema& s) const {
}
size_t mutation_partition::external_memory_usage(const schema& s) const {
check_schema(s);
size_t sum = 0;
sum += static_row().external_memory_usage(s, column_kind::static_column);
for (auto& clr : clustered_rows()) {
@@ -1258,7 +1221,6 @@ void mutation_partition::trim_rows(const schema& s,
const std::vector<query::clustering_range>& row_ranges,
Func&& func)
{
check_schema(s);
static_assert(std::is_same<stop_iteration, std::result_of_t<Func(rows_entry&)>>::value, "Bad func signature");
stop_iteration stop = stop_iteration::no;
@@ -1303,7 +1265,6 @@ uint32_t mutation_partition::do_compact(const schema& s,
uint32_t row_limit,
can_gc_fn& can_gc)
{
check_schema(s);
assert(row_limit > 0);
auto gc_before = saturating_subtract(query_time, s.gc_grace_seconds());
@@ -1369,14 +1330,12 @@ mutation_partition::compact_for_query(
bool reverse,
uint32_t row_limit)
{
check_schema(s);
return do_compact(s, query_time, row_ranges, reverse, row_limit, always_gc);
}
void mutation_partition::compact_for_compaction(const schema& s,
can_gc_fn& can_gc, gc_clock::time_point compaction_time)
{
check_schema(s);
static const std::vector<query::clustering_range> all_rows = {
query::clustering_range::make_open_ended_both_sides()
};
@@ -1410,13 +1369,11 @@ row::is_live(const schema& s, column_kind kind, tombstone base_tombstone, gc_clo
bool
mutation_partition::is_static_row_live(const schema& s, gc_clock::time_point query_time) const {
check_schema(s);
return has_any_live_data(s, column_kind::static_column, static_row(), _tombstone, query_time);
}
size_t
mutation_partition::live_row_count(const schema& s, gc_clock::time_point query_time) const {
check_schema(s);
size_t count = 0;
for (const rows_entry& e : non_dummy_rows()) {
@@ -1454,17 +1411,12 @@ row::row(const schema& s, column_kind kind, const row& o)
if (_type == storage_type::vector) {
auto& other_vec = o._storage.vector;
auto& vec = *new (&_storage.vector) vector_storage;
try {
vec.present = other_vec.present;
vec.v.reserve(other_vec.v.size());
column_id id = 0;
for (auto& cell : other_vec.v) {
auto& cdef = s.column_at(kind, id++);
vec.v.emplace_back(cell_and_hash{cell.cell.copy(*cdef.type), cell.hash});
}
} catch (...) {
_storage.vector.~vector_storage();
throw;
vec.present = other_vec.present;
vec.v.reserve(other_vec.v.size());
column_id id = 0;
for (auto& cell : other_vec.v) {
auto& cdef = s.column_at(kind, id++);
vec.v.emplace_back(cell_and_hash { cell.cell.copy(*cdef.type), cell.hash });
}
} else {
auto cloner = [&] (const auto& x) {
@@ -1762,7 +1714,6 @@ row row::difference(const schema& s, column_kind kind, const row& other) const
mutation_partition mutation_partition::difference(schema_ptr s, const mutation_partition& other) const
{
check_schema(*s);
mutation_partition mp(s);
if (_tombstone > other._tombstone) {
mp.apply(_tombstone);
@@ -1793,7 +1744,6 @@ mutation_partition mutation_partition::difference(schema_ptr s, const mutation_p
}
void mutation_partition::accept(const schema& s, mutation_partition_visitor& v) const {
check_schema(s);
v.accept_partition_tombstone(_tombstone);
_static_row.for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
const column_definition& def = s.static_column_at(id);
@@ -1877,10 +1827,9 @@ void mutation_querier::query_static_row(const row& r, tombstone current_tombston
} else if (_short_reads_allowed) {
seastar::measuring_output_stream stream;
ser::qr_partition__static_row__cells<seastar::measuring_output_stream> out(stream, { });
auto start = stream.size();
get_compacted_row_slice(_schema, slice, column_kind::static_column,
r, slice.static_columns, out);
_memory_accounter.update(stream.size() - start);
r, slice.static_columns, _static_cells_wr);
_memory_accounter.update(stream.size());
}
if (_pw.requested_digest()) {
max_timestamp max_ts{_pw.last_modified()};
@@ -1941,9 +1890,8 @@ stop_iteration mutation_querier::consume(clustering_row&& cr, row_tombstone curr
} else if (_short_reads_allowed) {
seastar::measuring_output_stream stream;
ser::qr_partition__rows<seastar::measuring_output_stream> out(stream, { });
auto start = stream.size();
write_row(out);
stop = _memory_accounter.update_and_check(stream.size() - start);
stop = _memory_accounter.update_and_check(stream.size());
}
_live_clustering_rows++;
@@ -2227,9 +2175,6 @@ mutation_partition::mutation_partition(mutation_partition::incomplete_tag, const
, _static_row_continuous(!s.has_static_columns())
, _rows()
, _row_tombstones(s)
#ifdef SEASTAR_DEBUG
, _schema_version(s.version())
#endif
{
_rows.insert_before(_rows.end(),
*current_allocator().construct<rows_entry>(s, rows_entry::last_dummy_tag(), is_continuous::no));
@@ -2261,7 +2206,6 @@ void mutation_partition::make_fully_continuous() {
}
clustering_interval_set mutation_partition::get_continuity(const schema& s, is_continuous cont) const {
check_schema(s);
clustering_interval_set result;
auto i = _rows.begin();
auto prev_pos = position_in_partition::before_all_clustered_rows();
@@ -2311,7 +2255,6 @@ stop_iteration mutation_partition::clear_gently(cache_tracker* tracker) noexcept
bool
mutation_partition::check_continuity(const schema& s, const position_range& r, is_continuous cont) const {
check_schema(s);
auto less = rows_entry::compare(s);
auto i = _rows.lower_bound(r.start(), less);
auto end = _rows.lower_bound(r.end(), less);
@@ -2377,17 +2320,20 @@ future<mutation_opt> counter_write_query(schema_ptr s, const mutation_source& so
return f.finally([r_a_r = std::move(r_a_r)] { });
}
mutation_cleaner::~mutation_cleaner() {
mutation_cleaner_impl::~mutation_cleaner_impl() {
_worker_state->done = true;
_worker_state->cv.signal();
_worker_state->snapshots.clear_and_dispose(typename lw_shared_ptr<partition_snapshot>::disposer());
with_allocator(_region.allocator(), [this] {
clear();
});
}
void mutation_cleaner::clear() noexcept {
void mutation_cleaner_impl::clear() noexcept {
while (clear_gently() == stop_iteration::no) ;
}
stop_iteration mutation_cleaner::clear_gently() noexcept {
stop_iteration mutation_cleaner_impl::clear_gently() noexcept {
while (clear_some() == memory::reclaiming_result::reclaimed_something) {
if (need_preempt()) {
return stop_iteration::no;
@@ -2396,7 +2342,7 @@ stop_iteration mutation_cleaner::clear_gently() noexcept {
return stop_iteration::yes;
}
memory::reclaiming_result mutation_cleaner::clear_some() noexcept {
memory::reclaiming_result mutation_cleaner_impl::clear_some() noexcept {
if (_versions.empty()) {
return memory::reclaiming_result::reclaimed_nothing;
}
@@ -2409,14 +2355,81 @@ memory::reclaiming_result mutation_cleaner::clear_some() noexcept {
return memory::reclaiming_result::reclaimed_something;
}
void mutation_cleaner::merge(mutation_cleaner& r) noexcept {
void mutation_cleaner_impl::merge(mutation_cleaner_impl& r) noexcept {
_versions.splice(r._versions);
_worker_state->snapshots.splice(_worker_state->snapshots.end(), r._worker_state->snapshots);
if (!_worker_state->snapshots.empty()) {
_worker_state->cv.signal();
}
}
future<> mutation_cleaner::drain() {
return repeat([this] {
return with_allocator(_region.allocator(), [this] {
return clear_gently();
void mutation_cleaner_impl::start_worker() {
auto f = repeat([w = _worker_state, this] () mutable noexcept {
if (w->done) {
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
return with_scheduling_group(_scheduling_group, [w, this] {
return w->cv.wait([w] {
return w->done || !w->snapshots.empty();
}).then([this, w] () noexcept {
if (w->done) {
return stop_iteration::yes;
}
merge_some();
return stop_iteration::no;
});
});
});
if (f.failed()) {
f.get();
}
}
stop_iteration mutation_cleaner_impl::merge_some(partition_snapshot& snp) noexcept {
auto&& region = snp.region();
return with_allocator(region.allocator(), [&] {
return with_linearized_managed_bytes([&] {
// Allocating sections require the region to be reclaimable,
// which means that they cannot be nested.
// It is, however, possible that, if the snapshot is taken
// inside an allocating section and an exception is then thrown,
// this function will be called to clean up even though we
// are still in the context of that allocating section.
if (!region.reclaiming_enabled()) {
return stop_iteration::no;
}
try {
return _worker_state->alloc_section(region, [&] {
return snp.merge_partition_versions();
});
} catch (...) {
// Merging failed; give up, as there is no guarantee of forward progress.
return stop_iteration::yes;
}
});
});
}
stop_iteration mutation_cleaner_impl::merge_some() noexcept {
if (_worker_state->snapshots.empty()) {
return stop_iteration::yes;
}
partition_snapshot& snp = _worker_state->snapshots.front();
if (merge_some(snp) == stop_iteration::yes) {
_worker_state->snapshots.pop_front();
lw_shared_ptr<partition_snapshot>::dispose(&snp);
}
return stop_iteration::no;
}
future<> mutation_cleaner_impl::drain() {
return repeat([this] {
return merge_some();
}).then([this] {
return repeat([this] {
return with_allocator(_region.allocator(), [this] {
return clear_gently();
});
});
});
}


@@ -46,6 +46,7 @@
#include "clustering_key_filter.hh"
#include "intrusive_set_external_comparator.hh"
#include "utils/with_relational_operators.hh"
#include "utils/preempt.hh"
class mutation_fragment;
class clustering_row;
@@ -74,15 +75,6 @@ using cell_hash_opt = seastar::optimized_optional<cell_hash>;
struct cell_and_hash {
atomic_cell_or_collection cell;
mutable cell_hash_opt hash;
cell_and_hash() = default;
cell_and_hash(cell_and_hash&&) noexcept = default;
cell_and_hash& operator=(cell_and_hash&&) noexcept = default;
cell_and_hash(atomic_cell_or_collection&& cell, cell_hash_opt hash)
: cell(std::move(cell))
, hash(hash)
{ }
};
//
@@ -905,9 +897,6 @@ private:
// Contains only strict prefixes so that we don't have to lookup full keys
// in both _row_tombstones and _rows.
range_tombstone_list _row_tombstones;
#ifdef SEASTAR_DEBUG
table_schema_version _schema_version;
#endif
friend class mutation_partition_applier;
friend class converting_mutation_partition_applier;
@@ -922,16 +911,10 @@ public:
mutation_partition(schema_ptr s)
: _rows()
, _row_tombstones(*s)
#ifdef SEASTAR_DEBUG
, _schema_version(s->version())
#endif
{ }
mutation_partition(mutation_partition& other, copy_comparators_only)
: _rows()
, _row_tombstones(other._row_tombstones, range_tombstone_list::copy_comparator_only())
#ifdef SEASTAR_DEBUG
, _schema_version(other._schema_version)
#endif
{ }
mutation_partition(mutation_partition&&) = default;
mutation_partition(const schema& s, const mutation_partition&);
@@ -1005,8 +988,19 @@ public:
// This instance and p are governed by the same schema.
//
// Must be provided with a pointer to the cache_tracker, which owns both this and p.
void apply_monotonically(const schema& s, mutation_partition&& p, cache_tracker*);
void apply_monotonically(const schema& s, mutation_partition&& p, const schema& p_schema);
//
// Returns stop_iteration::no if the operation was preempted before it finished, and stop_iteration::yes otherwise.
// On preemption the sum of this and p stays the same (represents the same set of writes), and the state of this
// object contains at least all the writes it contained before the call (monotonicity). It may contain partial writes.
// Also, some progress is always guaranteed (liveness).
//
// The operation can be driven to completion like this:
//
//   while (apply_monotonically(..., is_preemptible::yes) == stop_iteration::no) { }
//
// If is_preemptible::no is passed as an argument then stop_iteration::no is never returned.
stop_iteration apply_monotonically(const schema& s, mutation_partition&& p, cache_tracker*, is_preemptible = is_preemptible::no);
stop_iteration apply_monotonically(const schema& s, mutation_partition&& p, const schema& p_schema, is_preemptible = is_preemptible::no);
// Weak exception guarantees.
// Assumes this and p are not owned by a cache_tracker.
@@ -1131,12 +1125,6 @@ private:
template<typename Func>
void for_each_row(const schema& schema, const query::clustering_range& row_range, bool reversed, Func&& func) const;
friend class counter_write_query_result_builder;
void check_schema(const schema& s) const {
#ifdef SEASTAR_DEBUG
assert(s.version() == _schema_version);
#endif
}
};
inline


@@ -29,6 +29,8 @@
#include "mutation_partition.hh"
#include "counters.hh"
#include "frozen_mutation.hh"
#include "partition_builder.hh"
#include "converting_mutation_partition_applier.hh"
#include "utils/UUID.hh"
#include "serializer.hh"
@@ -60,10 +62,10 @@ atomic_cell read_atomic_cell(const abstract_type& type, atomic_cell_variant cv,
explicit atomic_cell_visitor(const abstract_type& t, atomic_cell::collection_member cm)
: _type(t), _collection_member(cm) { }
atomic_cell operator()(ser::live_cell_view& lcv) const {
return atomic_cell::make_live(_type, lcv.created_at(), lcv.value(), _collection_member);
return atomic_cell::make_live(_type, lcv.created_at(), lcv.value().view(), _collection_member);
}
atomic_cell operator()(ser::expiring_cell_view& ecv) const {
return atomic_cell::make_live(_type, ecv.c().created_at(), ecv.c().value(), ecv.expiry(), ecv.ttl(), _collection_member);
return atomic_cell::make_live(_type, ecv.c().created_at(), ecv.c().value().view(), ecv.expiry(), ecv.ttl(), _collection_member);
}
atomic_cell operator()(ser::dead_cell_view& dcv) const {
return atomic_cell::make_dead(dcv.tomb().timestamp(), dcv.tomb().deletion_time());
@@ -129,20 +131,13 @@ void read_and_visit_row(ser::row_view rv, const column_mapping& cm, column_kind
: _visitor(v), _id(id), _col(col) { }
void operator()(atomic_cell_variant& acv) const {
if (!_col.type()->is_atomic()) {
if (!_col.is_atomic()) {
throw std::runtime_error("A collection expected, got an atomic cell");
}
// FIXME: Pass view to cell to avoid copy
auto&& outer = current_allocator();
with_allocator(standard_allocator(), [&] {
auto cell = read_atomic_cell(*_col.type(), acv);
with_allocator(outer, [&] {
_visitor.accept_atomic_cell(_id, cell);
});
});
_visitor.accept_atomic_cell(_id, read_atomic_cell(*_col.type(), acv));
}
void operator()(ser::collection_cell_view& ccv) const {
if (_col.type()->is_atomic()) {
if (_col.is_atomic()) {
throw std::runtime_error("An atomic cell expected, got a collection");
}
// FIXME: Pass view to cell to avoid copy
@@ -187,23 +182,19 @@ row_marker read_row_marker(boost::variant<ser::live_marker_view, ser::expiring_m
}
void
mutation_partition_view::accept(const schema& s, mutation_partition_visitor& visitor) const {
accept(s.get_column_mapping(), visitor);
}
void
mutation_partition_view::accept(const column_mapping& cm, mutation_partition_visitor& visitor) const {
template<typename Visitor>
GCC6_CONCEPT(requires MutationViewVisitor<Visitor>)
void mutation_partition_view::do_accept(const column_mapping& cm, Visitor& visitor) const {
auto in = _in;
auto mpv = ser::deserialize(in, boost::type<ser::mutation_partition_view>());
visitor.accept_partition_tombstone(mpv.tomb());
struct static_row_cell_visitor {
mutation_partition_visitor& _visitor;
Visitor& _visitor;
void accept_atomic_cell(column_id id, const atomic_cell& ac) const {
_visitor.accept_static_cell(id, ac);
void accept_atomic_cell(column_id id, atomic_cell ac) const {
_visitor.accept_static_cell(id, std::move(ac));
}
void accept_collection(column_id id, const collection_mutation& cm) const {
_visitor.accept_static_cell(id, cm);
@@ -217,13 +208,13 @@ mutation_partition_view::accept(const column_mapping& cm, mutation_partition_vis
for (auto&& cr : mpv.rows()) {
auto t = row_tombstone(cr.deleted_at(), shadowable_tombstone(cr.shadowable_deleted_at()));
visitor.accept_row(position_in_partition_view::for_key(cr.key()), t, read_row_marker(cr.marker()));
visitor.accept_row(position_in_partition_view::for_key(cr.key()), t, read_row_marker(cr.marker()), is_dummy::no, is_continuous::yes);
struct cell_visitor {
mutation_partition_visitor& _visitor;
Visitor& _visitor;
void accept_atomic_cell(column_id id, const atomic_cell& ac) const {
_visitor.accept_row_cell(id, ac);
void accept_atomic_cell(column_id id, atomic_cell ac) const {
_visitor.accept_row_cell(id, std::move(ac));
}
void accept_collection(column_id id, const collection_mutation& cm) const {
_visitor.accept_row_cell(id, cm);
@@ -233,6 +224,38 @@ mutation_partition_view::accept(const column_mapping& cm, mutation_partition_vis
}
}
void mutation_partition_view::accept(const schema& s, partition_builder& visitor) const
{
do_accept(s.get_column_mapping(), visitor);
}
void mutation_partition_view::accept(const column_mapping& cm, converting_mutation_partition_applier& visitor) const
{
do_accept(cm, visitor);
}
std::optional<clustering_key> mutation_partition_view::first_row_key() const
{
auto in = _in;
auto mpv = ser::deserialize(in, boost::type<ser::mutation_partition_view>());
auto rows = mpv.rows();
if (rows.empty()) {
return { };
}
return rows.front().key();
}
std::optional<clustering_key> mutation_partition_view::last_row_key() const
{
auto in = _in;
auto mpv = ser::deserialize(in, boost::type<ser::mutation_partition_view>());
auto rows = mpv.rows();
if (rows.empty()) {
return { };
}
return rows.back().key();
}
mutation_partition_view mutation_partition_view::from_view(ser::mutation_partition_view v)
{
return { v.v };
@@ -250,9 +273,8 @@ mutation_fragment frozen_mutation_fragment::unfreeze(const schema& s)
public:
clustering_row_builder(const schema& s, clustering_key key, row_tombstone t, row_marker m)
: _s(s), _mf(mutation_fragment::clustering_row_tag_t(), std::move(key), std::move(t), std::move(m), row()) { }
void accept_atomic_cell(column_id id, const atomic_cell& ac) {
auto& type = *_s.regular_column_at(id).type;
_mf.as_mutable_clustering_row().cells().append_cell(id, atomic_cell_or_collection(atomic_cell(type, ac)));
void accept_atomic_cell(column_id id, atomic_cell ac) {
_mf.as_mutable_clustering_row().cells().append_cell(id, atomic_cell_or_collection(std::move(ac)));
}
void accept_collection(column_id id, const collection_mutation& cm) {
auto& ctype = *static_pointer_cast<const collection_type_impl>(_s.regular_column_at(id).type);
@@ -273,9 +295,8 @@ mutation_fragment frozen_mutation_fragment::unfreeze(const schema& s)
mutation_fragment _mf;
public:
explicit static_row_builder(const schema& s) : _s(s), _mf(static_row()) { }
void accept_atomic_cell(column_id id, const atomic_cell& ac) {
auto& type = *_s.static_column_at(id).type;
_mf.as_mutable_static_row().cells().append_cell(id, atomic_cell_or_collection(atomic_cell(type, ac)));
void accept_atomic_cell(column_id id, atomic_cell ac) {
_mf.as_mutable_static_row().cells().append_cell(id, atomic_cell_or_collection(std::move(ac)));
}
void accept_collection(column_id id, const collection_mutation& cm) {
auto& ctype = *static_pointer_cast<const collection_type_impl>(_s.static_column_at(id).type);


@@ -29,6 +29,26 @@ namespace ser {
class mutation_partition_view;
}
class partition_builder;
class converting_mutation_partition_applier;
GCC6_CONCEPT(
template<typename T>
concept bool MutationViewVisitor = requires (T visitor, tombstone t, atomic_cell ac,
collection_mutation_view cmv, range_tombstone rt,
position_in_partition_view pipv, row_tombstone row_tomb,
row_marker rm) {
visitor.accept_partition_tombstone(t);
visitor.accept_static_cell(column_id(), std::move(ac));
visitor.accept_static_cell(column_id(), cmv);
visitor.accept_row_tombstone(rt);
visitor.accept_row(pipv, row_tomb, rm,
is_dummy::no, is_continuous::yes);
visitor.accept_row_cell(column_id(), std::move(ac));
visitor.accept_row_cell(column_id(), cmv);
};
)
// View on serialized mutation partition. See mutation_partition_serializer.
class mutation_partition_view {
utils::input_stream _in;
@@ -36,11 +56,18 @@ private:
mutation_partition_view(utils::input_stream v)
: _in(v)
{ }
template<typename Visitor>
GCC6_CONCEPT(requires MutationViewVisitor<Visitor>)
void do_accept(const column_mapping&, Visitor& visitor) const;
public:
static mutation_partition_view from_stream(utils::input_stream v) {
return { v };
}
static mutation_partition_view from_view(ser::mutation_partition_view v);
void accept(const schema& schema, mutation_partition_visitor& visitor) const;
void accept(const column_mapping&, mutation_partition_visitor& visitor) const;
void accept(const schema& schema, partition_builder& visitor) const;
void accept(const column_mapping&, converting_mutation_partition_applier& visitor) const;
std::optional<clustering_key> first_row_key() const;
std::optional<clustering_key> last_row_key() const;
};


@@ -184,13 +184,11 @@ private:
// end, a call to next_partition() or a call to
// fast_forward_to(dht::partition_range).
reader_and_last_fragment_kind _single_reader;
dht::decorated_key_opt _key;
const schema_ptr _schema;
streamed_mutation::forwarding _fwd_sm;
mutation_reader::forwarding _fwd_mr;
private:
const dht::token* current_position() const;
void maybe_add_readers(const dht::token* const t);
void maybe_add_readers(const std::optional<dht::ring_position_view>& pos);
void add_readers(std::vector<flat_mutation_reader> new_readers);
future<> prepare_next();
// Collect all forwardable readers into _next, and remove them from
@@ -236,7 +234,7 @@ class list_reader_selector : public reader_selector {
public:
explicit list_reader_selector(schema_ptr s, std::vector<flat_mutation_reader> readers)
: reader_selector(s, dht::ring_position::min())
: reader_selector(s, dht::ring_position_view::min())
, _readers(std::move(readers)) {
}
@@ -246,8 +244,8 @@ public:
list_reader_selector(list_reader_selector&&) = default;
list_reader_selector& operator=(list_reader_selector&&) = default;
virtual std::vector<flat_mutation_reader> create_new_readers(const dht::token* const) override {
_selector_position = dht::ring_position::max();
virtual std::vector<flat_mutation_reader> create_new_readers(const std::optional<dht::ring_position_view>&) override {
_selector_position = dht::ring_position_view::max();
return std::exchange(_readers, {});
}
@@ -256,12 +254,10 @@ public:
}
};
void mutation_reader_merger::maybe_add_readers(const dht::token* const t) {
if (!_selector->has_new_readers(t)) {
return;
void mutation_reader_merger::maybe_add_readers(const std::optional<dht::ring_position_view>& pos) {
if (_selector->has_new_readers(pos)) {
add_readers(_selector->create_new_readers(pos));
}
add_readers(_selector->create_new_readers(t));
}
void mutation_reader_merger::add_readers(std::vector<flat_mutation_reader> new_readers) {
@@ -272,14 +268,6 @@ void mutation_reader_merger::add_readers(std::vector<flat_mutation_reader> new_r
}
}
const dht::token* mutation_reader_merger::current_position() const {
if (!_key) {
return nullptr;
}
return &_key->token();
}
struct mutation_reader_merger::reader_heap_compare {
const schema& s;
@@ -338,12 +326,10 @@ future<> mutation_reader_merger::prepare_next() {
// waiting for a fast-forward so there is nothing to do.
if (_fragment_heap.empty() && _halted_readers.empty()) {
if (_reader_heap.empty()) {
_key = {};
maybe_add_readers(std::nullopt);
} else {
_key = _reader_heap.front().fragment.as_partition_start().key();
maybe_add_readers(_reader_heap.front().fragment.as_partition_start().key());
}
maybe_add_readers(current_position());
}
});
}
@@ -371,7 +357,7 @@ mutation_reader_merger::mutation_reader_merger(schema_ptr schema,
, _schema(std::move(schema))
, _fwd_sm(fwd_sm)
, _fwd_mr(fwd_mr) {
maybe_add_readers(nullptr);
maybe_add_readers(std::nullopt);
}
future<mutation_reader_merger::mutation_fragment_batch> mutation_reader_merger::operator()() {


@@ -50,19 +50,19 @@ namespace mutation_reader {
class reader_selector {
protected:
schema_ptr _s;
dht::ring_position _selector_position;
dht::ring_position_view _selector_position;
public:
reader_selector(schema_ptr s, dht::ring_position rp) noexcept : _s(std::move(s)), _selector_position(std::move(rp)) {}
reader_selector(schema_ptr s, dht::ring_position_view rpv) noexcept : _s(std::move(s)), _selector_position(std::move(rpv)) {}
virtual ~reader_selector() = default;
// Call only if has_new_readers() returned true.
virtual std::vector<flat_mutation_reader> create_new_readers(const dht::token* const t) = 0;
virtual std::vector<flat_mutation_reader> create_new_readers(const std::optional<dht::ring_position_view>& pos) = 0;
virtual std::vector<flat_mutation_reader> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) = 0;
// Can be false-positive but never false-negative!
bool has_new_readers(const dht::token* const t) const noexcept {
bool has_new_readers(const std::optional<dht::ring_position_view>& pos) const noexcept {
dht::ring_position_comparator cmp(*_s);
return !_selector_position.is_max() && (!t || cmp(dht::ring_position_view(*t), _selector_position) >= 0);
return !_selector_position.is_max() && (!pos || cmp(*pos, _selector_position) >= 0);
}
};

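The change above replaces the raw `dht::token*` with `std::optional<dht::ring_position_view>`, where a disengaged optional means "no position yet". The predicate's logic can be sketched with `int` positions standing in for ring positions (a hypothetical simplification, not the real comparator):

```cpp
#include <cassert>
#include <limits>
#include <optional>

// Hypothetical simplified selector. A disengaged optional means "no current
// position yet", in which case the selector must offer its initial readers.
struct selector {
    int selector_position;
    static constexpr int max_position = std::numeric_limits<int>::max();

    // Can be a false positive but never a false negative, as the diff notes:
    // once the merger's position reaches _selector_position, new readers exist.
    bool has_new_readers(const std::optional<int>& pos) const noexcept {
        return selector_position != max_position && (!pos || *pos >= selector_position);
    }
};
```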

@@ -25,7 +25,7 @@
#include "mutation_partition_view.hh"
// Partition visitor which builds mutation_partition corresponding to the data its fed with.
class partition_builder : public mutation_partition_visitor {
class partition_builder final : public mutation_partition_visitor {
private:
const schema& _schema;
mutation_partition& _partition;
@@ -43,9 +43,13 @@ public:
}
virtual void accept_static_cell(column_id id, atomic_cell_view cell) override {
row& r = _partition.static_row();
auto& cdef = _schema.static_column_at(id);
r.append_cell(id, atomic_cell_or_collection(*cdef.type, cell));
accept_static_cell(id, atomic_cell(*cdef.type, cell));
}
void accept_static_cell(column_id id, atomic_cell&& cell) {
row& r = _partition.static_row();
r.append_cell(id, atomic_cell_or_collection(std::move(cell)));
}
virtual void accept_static_cell(column_id id, collection_mutation_view collection) override {
@@ -66,9 +70,13 @@ public:
}
virtual void accept_row_cell(column_id id, atomic_cell_view cell) override {
row& r = _current_row->cells();
auto& cdef = _schema.regular_column_at(id);
r.append_cell(id, atomic_cell_or_collection(*cdef.type, cell));
accept_row_cell(id, atomic_cell(*cdef.type, cell));
}
void accept_row_cell(column_id id, atomic_cell&& cell) {
row& r = _current_row->cells();
r.append_cell(id, atomic_cell_or_collection(std::move(cell)));
}
virtual void accept_row_cell(column_id id, collection_mutation_view collection) override {


@@ -33,34 +33,6 @@ struct partition_snapshot_reader_dummy_accounter {
};
extern partition_snapshot_reader_dummy_accounter no_accounter;
inline void maybe_merge_versions(lw_shared_ptr<partition_snapshot>& snp,
logalloc::region& lsa_region,
logalloc::allocating_section& read_section) {
if (!snp.owned()) {
return;
}
// If no one else is using this particular snapshot try to merge partition
// versions.
with_allocator(lsa_region.allocator(), [&snp, &lsa_region, &read_section] {
return with_linearized_managed_bytes([&snp, &lsa_region, &read_section] {
try {
// Allocating sections require the region to be reclaimable
// which means that they cannot be nested.
// It is, however, possible, that if the snapshot is taken
// inside an allocating section and then an exception is thrown
// this function will be called to clean up even though we
// still will be in the context of the allocating section.
if (lsa_region.reclaiming_enabled()) {
read_section(lsa_region, [&snp] {
snp->merge_partition_versions();
});
}
} catch (...) { }
snp = {};
});
});
}
template <typename MemoryAccounter = partition_snapshot_reader_dummy_accounter>
class partition_snapshot_flat_reader : public flat_mutation_reader::impl, public MemoryAccounter {
struct rows_position {
@@ -87,7 +59,7 @@ class partition_snapshot_flat_reader : public flat_mutation_reader::impl, public
position_in_partition::equal_compare _eq;
heap_compare _heap_cmp;
lw_shared_ptr<partition_snapshot> _snapshot;
partition_snapshot_ptr _snapshot;
logalloc::region& _region;
logalloc::allocating_section& _read_section;
@@ -99,7 +71,7 @@ class partition_snapshot_flat_reader : public flat_mutation_reader::impl, public
private:
template<typename Function>
decltype(auto) in_alloc_section(Function&& fn) {
return _read_section.with_reclaiming_disabled(_region, [&] {
return _read_section.with_reclaiming_disabled(_region, [&] {
return with_linearized_managed_bytes([&] {
return fn();
});
@@ -155,7 +127,7 @@ class partition_snapshot_flat_reader : public flat_mutation_reader::impl, public
return !_clustering_rows.empty();
}
public:
explicit lsa_partition_reader(const schema& s, lw_shared_ptr<partition_snapshot> snp,
explicit lsa_partition_reader(const schema& s, partition_snapshot_ptr snp,
logalloc::region& region, logalloc::allocating_section& read_section,
bool digest_requested)
: _schema(s)
@@ -168,10 +140,6 @@ class partition_snapshot_flat_reader : public flat_mutation_reader::impl, public
, _digest_requested(digest_requested)
{ }
~lsa_partition_reader() {
maybe_merge_versions(_snapshot, _region, _read_section);
}
template<typename Function>
decltype(auto) with_reserve(Function&& fn) {
return _read_section.with_reserve(std::forward<Function>(fn));
@@ -187,7 +155,7 @@ class partition_snapshot_flat_reader : public flat_mutation_reader::impl, public
return _snapshot->static_row(_digest_requested);
});
}
// Returns next clustered row in the range.
// If the ck_range is the same as the one used previously last_row needs
// to be engaged and equal the position of the row returned last time.
@@ -298,7 +266,7 @@ private:
}
public:
template <typename... Args>
partition_snapshot_flat_reader(schema_ptr s, dht::decorated_key dk, lw_shared_ptr<partition_snapshot> snp,
partition_snapshot_flat_reader(schema_ptr s, dht::decorated_key dk, partition_snapshot_ptr snp,
query::clustering_key_filter_ranges crr, bool digest_requested,
logalloc::region& region, logalloc::allocating_section& read_section,
boost::any pointer_to_container, Args&&... args)
@@ -344,7 +312,7 @@ inline flat_mutation_reader
make_partition_snapshot_flat_reader(schema_ptr s,
dht::decorated_key dk,
query::clustering_key_filter_ranges crr,
lw_shared_ptr<partition_snapshot> snp,
partition_snapshot_ptr snp,
bool digest_requested,
logalloc::region& region,
logalloc::allocating_section& read_section,
@@ -365,7 +333,7 @@ inline flat_mutation_reader
make_partition_snapshot_flat_reader(schema_ptr s,
dht::decorated_key dk,
query::clustering_key_filter_ranges crr,
lw_shared_ptr<partition_snapshot> snp,
partition_snapshot_ptr snp,
bool digest_requested,
logalloc::region& region,
logalloc::allocating_section& read_section,


@@ -187,23 +187,49 @@ void merge_versions(const schema& s, mutation_partition& newer, mutation_partiti
newer = std::move(older);
}
void partition_snapshot::merge_partition_versions() {
stop_iteration partition_snapshot::merge_partition_versions() {
partition_version_ref& v = version();
if (!v.is_unique_owner()) {
auto first_used = &*v;
_version = { };
while (first_used->prev() && !first_used->is_referenced()) {
first_used = first_used->prev();
// Shift _version to the oldest unreferenced version and then keep merging the left-hand side into it.
// This is good for performance: if we were at the latest version, we leave it
// to incoming writes so they don't have to create a new one.
partition_version* current = &*v;
while (current->next() && !current->next()->is_referenced()) {
current = current->next();
_version = partition_version_ref(*current);
}
auto current = first_used->next();
while (current && !current->is_referenced()) {
auto next = current->next();
merge_versions(*_schema, first_used->partition(), std::move(current->partition()), _tracker);
current_allocator().destroy(current);
current = next;
while (auto prev = current->prev()) {
_region.allocator().invalidate_references();
if (current->partition().apply_monotonically(*schema(), std::move(prev->partition()), _tracker, is_preemptible::yes) == stop_iteration::no) {
return stop_iteration::no;
}
if (prev->is_referenced()) {
_version.release();
prev->back_reference() = partition_version_ref(*current, prev->back_reference().is_unique_owner());
current_allocator().destroy(prev);
return stop_iteration::yes;
}
current_allocator().destroy(prev);
}
}
return stop_iteration::yes;
}
stop_iteration partition_snapshot::slide_to_oldest() noexcept {
partition_version_ref& v = version();
if (v.is_unique_owner()) {
return stop_iteration::yes;
}
if (_entry) {
_entry->_snapshot = nullptr;
_entry = nullptr;
}
partition_version* current = &*v;
while (current->next() && !current->next()->is_referenced()) {
current = current->next();
_version = partition_version_ref(*current);
}
return current->prev() ? stop_iteration::no : stop_iteration::yes;
}
unsigned partition_snapshot::version_count()
@@ -312,7 +338,7 @@ partition_version& partition_entry::add_version(const schema& s, cache_tracker*
void partition_entry::apply(const schema& s, const mutation_partition& mp, const schema& mp_schema)
{
apply(s, mutation_partition(mp_schema, mp), mp_schema);
apply(s, mutation_partition(s, mp), mp_schema);
}
void partition_entry::apply(const schema& s, mutation_partition&& mp, const schema& mp_schema)
@@ -463,16 +489,13 @@ coroutine partition_entry::apply_to_incomplete(const schema& s,
bool can_move = !preemptible && !pe._snapshot;
auto src_snp = pe.read(reg, pe_cleaner, s.shared_from_this(), no_cache_tracker);
lw_shared_ptr<partition_snapshot> prev_snp;
partition_snapshot_ptr prev_snp;
if (preemptible) {
// Reads must see prev_snp until whole update completes so that writes
// are not partially visible.
prev_snp = read(reg, tracker.cleaner(), s.shared_from_this(), &tracker, phase - 1);
}
auto dst_snp = read(reg, tracker.cleaner(), s.shared_from_this(), &tracker, phase);
auto merge_dst_snp = defer([preemptible, dst_snp, &reg, &alloc] () mutable {
maybe_merge_versions(dst_snp, reg, alloc);
});
// Once we start updating the partition, we must keep all snapshots until the update completes,
// otherwise partial writes would be published. So the scope of snapshots must enclose the scope
@@ -480,7 +503,6 @@ coroutine partition_entry::apply_to_incomplete(const schema& s,
// give the caller a chance to store the coroutine object. The code inside coroutine below
// runs outside allocating section.
return coroutine([&tracker, &s, &alloc, &reg, &acc, can_move, preemptible,
merge_dst_snp = std::move(merge_dst_snp), // needs to go away last so that dst_snp is not owned by anyone else
cur = partition_snapshot_row_cursor(s, *dst_snp),
src_cur = partition_snapshot_row_cursor(s, *src_snp, can_move),
dst_snp = std::move(dst_snp),
@@ -587,7 +609,7 @@ void partition_entry::upgrade(schema_ptr from, schema_ptr to, mutation_cleaner&
remove_or_mark_as_unique_owner(old_version, &cleaner);
}
lw_shared_ptr<partition_snapshot> partition_entry::read(logalloc::region& r,
partition_snapshot_ptr partition_entry::read(logalloc::region& r,
mutation_cleaner& cleaner, schema_ptr entry_schema, cache_tracker* tracker, partition_snapshot::phase_type phase)
{
if (_snapshot) {
@@ -610,7 +632,7 @@ lw_shared_ptr<partition_snapshot> partition_entry::read(logalloc::region& r,
auto snp = make_lw_shared<partition_snapshot>(entry_schema, r, cleaner, this, tracker, phase);
_snapshot = snp.get();
return snp;
return partition_snapshot_ptr(std::move(snp));
}
std::vector<range_tombstone>
@@ -674,3 +696,13 @@ void partition_entry::evict(mutation_cleaner& cleaner) noexcept {
remove_or_mark_as_unique_owner(v, &cleaner);
}
}
partition_snapshot_ptr::~partition_snapshot_ptr() {
if (_snp) {
auto&& cleaner = _snp->cleaner();
auto snp = _snp.release();
if (snp) {
cleaner.merge_and_destroy(*snp.release());
}
}
}

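The rewritten merge loop above first slides the snapshot to the oldest version it may own, leaving the latest version free for incoming writes, then folds the newer versions into it one step at a time. With a deque of ints standing in for the version chain (a hypothetical simplification that ignores referenced versions and preemption), the shape of the algorithm is:

```cpp
#include <cassert>
#include <deque>

// chain.front() is the latest version, chain.back() the oldest, mirroring
// the prev()/next() orientation in the diff. Returns the merged value.
int merge_partition_versions(std::deque<int>& chain) {
    // Fold each newer neighbour into the survivor one step at a time, the
    // way the loop in the diff merges prev->partition() into current.
    while (chain.size() > 1) {
        int newer = chain[chain.size() - 2];
        chain.erase(chain.end() - 2);
        chain.back() += newer;
    }
    return chain.back();
}
```

In the real code each step may be preempted (`is_preemptible::yes`) and retried, which is why `merge_partition_versions()` now returns `stop_iteration` instead of `void`.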

@@ -28,6 +28,7 @@
#include "utils/coroutine.hh"
#include <boost/intrusive/parent_from_member.hpp>
#include <boost/intrusive/slist.hpp>
// This is MVCC implementation for mutation_partitions.
//
@@ -188,8 +189,9 @@ class partition_version_ref {
friend class partition_version;
public:
partition_version_ref() = default;
explicit partition_version_ref(partition_version& pv) noexcept
explicit partition_version_ref(partition_version& pv, bool unique_owner = false) noexcept
: _version(&pv)
, _unique_owner(unique_owner)
{
assert(!_version->_backref);
_version->_backref = this;
@@ -300,8 +302,9 @@ private:
logalloc::region& _region;
mutation_cleaner& _cleaner;
cache_tracker* _tracker;
boost::intrusive::slist_member_hook<> _cleaner_hook;
friend class partition_entry;
friend class mutation_cleaner_impl;
public:
explicit partition_snapshot(schema_ptr s,
logalloc::region& region,
@@ -329,10 +332,17 @@ public:
return container_of(v._backref);
}
// If possible merges the version pointed to by this snapshot with
// If possible, merges the version pointed to by this snapshot with
// adjacent partition versions. Leaves the snapshot in an unspecified state.
// Can be retried if previous merge attempt has failed.
void merge_partition_versions();
stop_iteration merge_partition_versions();
// Prepares the snapshot for cleaning by moving to the right-most unreferenced version.
// Returns stop_iteration::yes if there is nothing to merge with and the snapshot
// should be collected right away, and stop_iteration::no otherwise.
// When it returns stop_iteration::no, the snapshot is guaranteed not to be attached
// to the latest version.
stop_iteration slide_to_oldest() noexcept;
~partition_snapshot();
@@ -357,6 +367,7 @@ public:
const schema_ptr& schema() const { return _schema; }
logalloc::region& region() const { return _region; }
cache_tracker* tracker() const { return _tracker; }
mutation_cleaner& cleaner() { return _cleaner; }
tombstone partition_tombstone() const;
::static_row static_row(bool digest_requested) const;
@@ -368,6 +379,36 @@ public:
std::vector<range_tombstone> range_tombstones();
};
class partition_snapshot_ptr {
lw_shared_ptr<partition_snapshot> _snp;
public:
using value_type = partition_snapshot;
partition_snapshot_ptr() = default;
partition_snapshot_ptr(partition_snapshot_ptr&&) = default;
partition_snapshot_ptr(const partition_snapshot_ptr&) = default;
partition_snapshot_ptr(lw_shared_ptr<partition_snapshot> snp) : _snp(std::move(snp)) {}
~partition_snapshot_ptr();
partition_snapshot_ptr& operator=(partition_snapshot_ptr&& other) noexcept {
if (this != &other) {
this->~partition_snapshot_ptr();
new (this) partition_snapshot_ptr(std::move(other));
}
return *this;
}
partition_snapshot_ptr& operator=(const partition_snapshot_ptr& other) noexcept {
if (this != &other) {
this->~partition_snapshot_ptr();
new (this) partition_snapshot_ptr(other);
}
return *this;
}
partition_snapshot& operator*() { return *_snp; }
const partition_snapshot& operator*() const { return *_snp; }
partition_snapshot* operator->() { return &*_snp; }
const partition_snapshot* operator->() const { return &*_snp; }
explicit operator bool() const { return bool(_snp); }
};
class real_dirty_memory_accounter;
// Represents mutation_partition with snapshotting support a la MVCC.
@@ -523,7 +564,7 @@ public:
void upgrade(schema_ptr from, schema_ptr to, mutation_cleaner&, cache_tracker*);
// Snapshots with different values of phase will point to different partition_version objects.
lw_shared_ptr<partition_snapshot> read(logalloc::region& region,
partition_snapshot_ptr read(logalloc::region& region,
mutation_cleaner&,
schema_ptr entry_schema,
cache_tracker*,

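`partition_snapshot_ptr` above implements assignment by explicitly running the destructor and re-constructing in place, so the non-trivial release protocol (handing the snapshot to its `mutation_cleaner`) lives in exactly one place. A self-contained sketch of the same idiom, with a hypothetical `counted_handle` whose destructor stands in for `cleaner().merge_and_destroy()`:

```cpp
#include <cassert>
#include <new>
#include <utility>

// Hypothetical handle: its destructor performs the release protocol
// (here: bump a counter), like partition_snapshot_ptr handing its
// snapshot to the mutation_cleaner.
struct counted_handle {
    int* releases = nullptr;
    counted_handle() = default;
    explicit counted_handle(int* r) : releases(r) {}
    counted_handle(counted_handle&& o) noexcept
        : releases(std::exchange(o.releases, nullptr)) {}
    ~counted_handle() {
        if (releases) { ++*releases; }  // the one and only cleanup path
    }
    // Same idiom as partition_snapshot_ptr::operator=: destroy, then
    // placement-new from the source, instead of duplicating the cleanup.
    counted_handle& operator=(counted_handle&& other) noexcept {
        if (this != &other) {
            this->~counted_handle();
            new (this) counted_handle(std::move(other));
        }
        return *this;
    }
};
```

The self-assignment check is essential with this idiom: without it, the destructor would release the very state about to be moved from.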

@@ -121,7 +121,7 @@ public:
position_in_partition_view(const clustering_key_prefix& ck)
: _type(partition_region::clustered), _ck(&ck) { }
position_in_partition_view(range_tag_t, bound_view bv)
: _type(partition_region::clustered), _bound_weight(position_weight(bv.kind)), _ck(&bv.prefix) { }
: _type(partition_region::clustered), _bound_weight(position_weight(bv.kind())), _ck(&bv.prefix()) { }
static position_in_partition_view for_range_start(const query::clustering_range& r) {
return {position_in_partition_view::range_tag_t(), bound_view::from_range_start(r)};
@@ -214,7 +214,7 @@ public:
position_in_partition(before_clustering_row_tag_t, clustering_key_prefix ck)
: _type(partition_region::clustered), _bound_weight(-1), _ck(std::move(ck)) { }
position_in_partition(range_tag_t, bound_view bv)
: _type(partition_region::clustered), _bound_weight(position_weight(bv.kind)), _ck(bv.prefix) { }
: _type(partition_region::clustered), _bound_weight(position_weight(bv.kind())), _ck(bv.prefix()) { }
position_in_partition(after_static_row_tag_t) :
position_in_partition(range_tag_t(), bound_view::bottom()) { }
explicit position_in_partition(position_in_partition_view view)
@@ -273,11 +273,6 @@ public:
return is_partition_end() || (_ck && _ck->is_empty(s) && _bound_weight > 0);
}
bool is_before_all_clustered_rows(const schema& s) const {
return _type < partition_region::clustered
|| (_type == partition_region::clustered && _ck->is_empty(s) && _bound_weight < 0);
}
template<typename Hasher>
void feed_hash(Hasher& hasher, const schema& s) const {
::feed_hash(hasher, _bound_weight);

Some files were not shown because too many files have changed in this diff.