materialized_views: propagate "view virtual columns" between nodes

db::schema_tables::ALL and db::schema_tables::all_tables() are both supposed to list the same schema tables - the former is the list of their names, and the latter is the list of their schemas. This code duplication makes it easy to forget to update one of them, and indeed recently the new "view_virtual_columns" was added to all_tables() but not to ALL. What this patch does is to make ALL a function instead of constant vector. The newly named all_table_names() function uses all_tables() so the list of schema tables only appears once. So that nobody worries about the performance impact, all_table_names() caches the list in a per-thread vector that is only prepared once per thread. Because after this patch all_table_names() has the "view_virtual_columns" that was previously missing, this patch also fixes #4339, which was about virtual columns in materialized views not being propagated to other nodes. Unfortunately, to test the fix for #4339 we need a test with multiple nodes, so we cannot test it here in a unit test, and will instead use the dtest framework, in a separate patch. Fixes #4339 Branches: 3.0 Tests: all unit tests (release and debug mode), new dtest for #4339. The unit test mutation_reader_test failed in debug mode but not in release mode, but this probably has nothing to do with this patch (?). Signed-off-by: Nadav Har'El <nyh@scylladb.com> Message-Id: <20190320063437.32731-1-nyh@scylladb.com> (cherry picked from commit 7c874057f5)
cql: alter type: Format field name as text instead of hex
2020-01-06 00:37:59 +02:00 · 2020-01-05 18:55:40 +02:00 · 2020-01-05 18:50:27 +02:00 · 2019-12-24 18:42:33 +02:00 · 2019-12-24 17:44:40 +02:00 · 2019-12-24 17:44:40 +02:00
783 changed files with 30820 additions and 10523 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -6,9 +6,9 @@
 	path = swagger-ui
 	url = ../scylla-swagger-ui
 	ignore = dirty
-[submodule "dist/ami/files/scylla-ami"]
-	path = dist/ami/files/scylla-ami
-	url = ../scylla-ami
 [submodule "xxHash"]
 	path = xxHash
 	url = ../xxHash
+[submodule "libdeflate"]
+	path = libdeflate
+	url = ../libdeflate
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -138,4 +138,5 @@ target_include_directories(scylla PUBLIC
        ${SEASTAR_INCLUDE_DIRS}
        ${Boost_INCLUDE_DIRS}
        xxhash
+        libdeflate
        build/release/gen)
--- a/HACKING.md
+++ b/HACKING.md
@@ -20,7 +20,7 @@ $ git submodule update --init --recursive

 Scylla depends on the system package manager for its development dependencies.

-Running `./install_dependencies.sh` (as root) installs the appropriate packages based on your Linux distribution.
+Running `./install-dependencies.sh` (as root) installs the appropriate packages based on your Linux distribution.

 ### Build system

--- a/README.md
+++ b/README.md
@@ -50,12 +50,12 @@ Then, to build an RPM, run:
 ./dist/redhat/build_rpm.sh
 ```

-The built RPM is stored in ``/var/lib/mock/<configuration>/result`` directory.
+The built RPM is stored in the ``build/mock/<configuration>/result`` directory.
 For example, on Fedora 21 mock reports the following:

 ```
 INFO: Done(scylla-server-0.00-1.fc21.src.rpm) Config(default) 20 minutes 7 seconds
-INFO: Results and/or logs in: /var/lib/mock/fedora-21-x86_64/result
+INFO: Results and/or logs in: build/mock/fedora-21-x86_64/result
 ```

 ## Building Fedora-based Docker image
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 #!/bin/sh

-VERSION=2.3.6
+VERSION=3.0.11

 if test -f version
 then
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -78,15 +78,17 @@ void set_storage_service(http_context& ctx, routes& r) {
        });
    });

-    ss::get_tokens.set(r, [] (const_req req) {
-        auto tokens = service::get_local_storage_service().get_token_metadata().sorted_tokens();
-        return container_to_vec(tokens);
+    ss::get_tokens.set(r, [] (std::unique_ptr<request> req) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().get_token_metadata().sorted_tokens(), [](const dht::token& i) {
+           return boost::lexical_cast<std::string>(i);
+        }));
    });

-    ss::get_node_tokens.set(r, [] (const_req req) {
-        gms::inet_address addr(req.param["endpoint"]);
-        auto tokens = service::get_local_storage_service().get_token_metadata().get_tokens(addr);
-        return container_to_vec(tokens);
+    ss::get_node_tokens.set(r, [] (std::unique_ptr<request> req) {
+        gms::inet_address addr(req->param["endpoint"]);
+        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().get_token_metadata().get_tokens(addr), [](const dht::token& i) {
+           return boost::lexical_cast<std::string>(i);
+       }));
    });

    ss::get_commitlog.set(r, [&ctx](const_req req) {
@@ -107,11 +109,7 @@ void set_storage_service(http_context& ctx, routes& r) {
    });

    ss::get_moving_nodes.set(r, [](const_req req) {
-        auto points = service::get_local_storage_service().get_token_metadata().get_moving_endpoints();
        std::unordered_set<sstring> addr;
-        for (auto i: points) {
-            addr.insert(boost::lexical_cast<std::string>(i.second));
-        }
        return container_to_vec(addr);
    });

--- a/atomic_cell.cc
+++ b/atomic_cell.cc
@@ -47,6 +47,23 @@ atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_typ
    );
 }

+atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_type timestamp, ser::buffer_view<bytes_ostream::fragment_iterator> value, atomic_cell::collection_member cm) {
+    auto& imr_data = type.imr_state();
+    return atomic_cell(
+        imr_data.type_info(),
+        imr_object_type::make(data::cell::make_live(imr_data.type_info(), timestamp, value, bool(cm)), &imr_data.lsa_migrator())
+    );
+}
+
+atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_type timestamp, const fragmented_temporary_buffer::view& value, collection_member cm)
+{
+    auto& imr_data = type.imr_state();
+    return atomic_cell(
+        imr_data.type_info(),
+        imr_object_type::make(data::cell::make_live(imr_data.type_info(), timestamp, value, bool(cm)), &imr_data.lsa_migrator())
+    );
+}
+
 atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_type timestamp, bytes_view value,
                             gc_clock::time_point expiry, gc_clock::duration ttl, atomic_cell::collection_member cm) {
    auto& imr_data = type.imr_state();
@@ -56,6 +73,25 @@ atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_typ
    );
 }

+atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_type timestamp, ser::buffer_view<bytes_ostream::fragment_iterator> value,
+                             gc_clock::time_point expiry, gc_clock::duration ttl, atomic_cell::collection_member cm) {
+    auto& imr_data = type.imr_state();
+    return atomic_cell(
+        imr_data.type_info(),
+        imr_object_type::make(data::cell::make_live(imr_data.type_info(), timestamp, value, expiry, ttl, bool(cm)), &imr_data.lsa_migrator())
+    );
+}
+
+atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_type timestamp, const fragmented_temporary_buffer::view& value,
+                                   gc_clock::time_point expiry, gc_clock::duration ttl, collection_member cm)
+{
+    auto& imr_data = type.imr_state();
+    return atomic_cell(
+        imr_data.type_info(),
+        imr_object_type::make(data::cell::make_live(imr_data.type_info(), timestamp, value, expiry, ttl, bool(cm)), &imr_data.lsa_migrator())
+    );
+}
+
 atomic_cell atomic_cell::make_live_counter_update(api::timestamp_type timestamp, int64_t value) {
    auto& imr_data = no_type_imr_descriptor();
    return atomic_cell(
--- a/atomic_cell.hh
+++ b/atomic_cell.hh
@@ -33,6 +33,9 @@
 #include "data/cell.hh"
 #include "data/schema_info.hh"
 #include "imr/utils.hh"
+#include "utils/fragmented_temporary_buffer.hh"
+
+#include "serializer.hh"

 class abstract_type;
 class collection_type_impl;
@@ -186,6 +189,10 @@ public:
    static atomic_cell make_dead(api::timestamp_type timestamp, gc_clock::time_point deletion_time);
    static atomic_cell make_live(const abstract_type& type, api::timestamp_type timestamp, bytes_view value,
                                 collection_member = collection_member::no);
+    static atomic_cell make_live(const abstract_type& type, api::timestamp_type timestamp, ser::buffer_view<bytes_ostream::fragment_iterator> value,
+                                 collection_member = collection_member::no);
+    static atomic_cell make_live(const abstract_type& type, api::timestamp_type timestamp, const fragmented_temporary_buffer::view& value,
+                                 collection_member = collection_member::no);
    static atomic_cell make_live(const abstract_type& type, api::timestamp_type timestamp, const bytes& value,
                                 collection_member cm = collection_member::no) {
        return make_live(type, timestamp, bytes_view(value), cm);
@@ -193,6 +200,10 @@ public:
    static atomic_cell make_live_counter_update(api::timestamp_type timestamp, int64_t value);
    static atomic_cell make_live(const abstract_type&, api::timestamp_type timestamp, bytes_view value,
        gc_clock::time_point expiry, gc_clock::duration ttl, collection_member = collection_member::no);
+    static atomic_cell make_live(const abstract_type&, api::timestamp_type timestamp, ser::buffer_view<bytes_ostream::fragment_iterator> value,
+        gc_clock::time_point expiry, gc_clock::duration ttl, collection_member = collection_member::no);
+    static atomic_cell make_live(const abstract_type&, api::timestamp_type timestamp, const fragmented_temporary_buffer::view& value,
+        gc_clock::time_point expiry, gc_clock::duration ttl, collection_member = collection_member::no);
    static atomic_cell make_live(const abstract_type& type, api::timestamp_type timestamp, const bytes& value,
                                 gc_clock::time_point expiry, gc_clock::duration ttl, collection_member cm = collection_member::no)
    {
--- a/auth/common.cc
+++ b/auth/common.cc
@@ -87,11 +87,17 @@ future<> create_metadata_table_if_missing(
    return mm.announce_new_column_family(b.build(), false);
 }

-future<> wait_for_schema_agreement(::service::migration_manager& mm, const database& db) {
+future<> wait_for_schema_agreement(::service::migration_manager& mm, const database& db, seastar::abort_source& as) {
    static const auto pause = [] { return sleep(std::chrono::milliseconds(500)); };

-    return do_until([&db] { return db.get_version() != database::empty_version; }, pause).then([&mm] {
-        return do_until([&mm] { return mm.have_schema_agreement(); }, pause);
+    return do_until([&db, &as] {
+        as.check();
+        return db.get_version() != database::empty_version;
+    }, pause).then([&mm, &as] {
+        return do_until([&mm, &as] {
+            as.check();
+            return mm.have_schema_agreement();
+        }, pause);
    });
 }

--- a/auth/common.hh
+++ b/auth/common.hh
@@ -81,7 +81,7 @@ future<> create_metadata_table_if_missing(
        stdx::string_view cql,
        ::service::migration_manager&);

-future<> wait_for_schema_agreement(::service::migration_manager&, const database&);
+future<> wait_for_schema_agreement(::service::migration_manager&, const database&, seastar::abort_source&);

 ///
 /// Time-outs for internal, non-local CQL queries.
--- a/auth/default_authorizer.cc
+++ b/auth/default_authorizer.cc
@@ -160,7 +160,7 @@ future<> default_authorizer::start() {
                _migration_manager).then([this] {
            _finished = do_after_system_ready(_as, [this] {
                return async([this] {
-                    wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
+                    wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();

                    if (legacy_metadata_exists()) {
                        if (!any_granted().get0()) {
@@ -178,7 +178,7 @@ future<> default_authorizer::start() {

 future<> default_authorizer::stop() {
    _as.request_abort();
-    return _finished.handle_exception_type([](const sleep_aborted&) {});
+    return _finished.handle_exception_type([](const sleep_aborted&) {}).handle_exception_type([](const abort_requested_exception&) {});
 }

 future<permission_set>
--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -41,11 +41,6 @@

 #include "auth/password_authenticator.hh"

-extern "C" {
-#include <crypt.h>
-#include <unistd.h>
-}
-
 #include <algorithm>
 #include <chrono>
 #include <random>
@@ -55,6 +50,7 @@ extern "C" {

 #include "auth/authenticated_user.hh"
 #include "auth/common.hh"
+#include "auth/passwords.hh"
 #include "auth/roles-metadata.hh"
 #include "cql3/untyped_result_set.hh"
 #include "log.hh"
@@ -82,6 +78,8 @@ static const class_registrator<
        cql3::query_processor&,
        ::service::migration_manager&> password_auth_reg("org.apache.cassandra.auth.PasswordAuthenticator");

+static thread_local auto rng_for_salt = std::default_random_engine(std::random_device{}());
+
 password_authenticator::~password_authenticator() {
 }

@@ -91,78 +89,6 @@ password_authenticator::password_authenticator(cql3::query_processor& qp, ::serv
    , _stopped(make_ready_future<>()) {
 }

-// TODO: blowfish
-// Origin uses Java bcrypt library, i.e. blowfish salt
-// generation and hashing, which is arguably a "better"
-// password hash than sha/md5 versions usually available in
-// crypt_r. Otoh, glibc 2.7+ uses a modified sha512 algo
-// which should be the same order of safe, so the only
-// real issue should be salted hash compatibility with
-// origin if importing system tables from there.
-//
-// Since bcrypt/blowfish is _not_ (afaict) not available
-// as a dev package/lib on most linux distros, we'd have to
-// copy and compile for example OWL  crypto
-// (http://cvsweb.openwall.com/cgi/cvsweb.cgi/Owl/packages/glibc/crypt_blowfish/)
-// to be fully bit-compatible.
-//
-// Until we decide this is needed, let's just use crypt_r,
-// and some old-fashioned random salt generation.
-
-static constexpr size_t rand_bytes = 16;
-static thread_local crypt_data tlcrypt = { 0, };
-
-static sstring hashpw(const sstring& pass, const sstring& salt) {
-    auto res = crypt_r(pass.c_str(), salt.c_str(), &tlcrypt);
-    if (res == nullptr) {
-        throw std::system_error(errno, std::system_category());
-    }
-    return res;
-}
-
-static bool checkpw(const sstring& pass, const sstring& salted_hash) {
-    auto tmp = hashpw(pass, salted_hash);
-    return tmp == salted_hash;
-}
-
-static sstring gensalt() {
-    static sstring prefix;
-
-    std::random_device rd;
-    std::default_random_engine e1(rd());
-    std::uniform_int_distribution<char> dist;
-
-    sstring valid_salt = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789./";
-    sstring input(rand_bytes, 0);
-
-    for (char&c : input) {
-        c = valid_salt[dist(e1) % valid_salt.size()];
-    }
-
-    sstring salt;
-
-    if (!prefix.empty()) {
-        return prefix + input;
-    }
-
-    // Try in order:
-    // blowfish 2011 fix, blowfish, sha512, sha256, md5
-    for (sstring pfx : { "$2y$", "$2a$", "$6$", "$5$", "$1$" }) {
-        salt = pfx + input;
-        const char* e = crypt_r("fisk", salt.c_str(), &tlcrypt);
-
-        if (e && (e[0] != '*')) {
-            prefix = pfx;
-            return salt;
-        }
-    }
-    throw std::runtime_error("Could not initialize hashing algorithm");
-}
-
-static sstring hashpw(const sstring& pass) {
-    return hashpw(pass, gensalt());
-}
-
 static bool has_salted_hash(const cql3::untyped_result_set_row& row) {
    return !row.get_or<sstring>(SALTED_HASH, "").empty();
 }
@@ -212,7 +138,7 @@ future<> password_authenticator::create_default_if_missing() const {
                    update_row_query,
                    db::consistency_level::QUORUM,
                    internal_distributed_timeout_config(),
-                    {hashpw(DEFAULT_USER_PASSWORD), DEFAULT_USER_NAME}).then([](auto&&) {
+                    {passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt), DEFAULT_USER_NAME}).then([](auto&&) {
                plogger.info("Created default superuser authentication record.");
            });
        }
@@ -223,8 +149,6 @@ future<> password_authenticator::create_default_if_missing() const {

 future<> password_authenticator::start() {
     return once_among_shards([this] {
-         gensalt(); // do this once to determine usable hashing
-
         auto f = create_metadata_table_if_missing(
                 meta::roles_table::name,
                 _qp,
@@ -233,7 +157,7 @@ future<> password_authenticator::start() {

         _stopped = do_after_system_ready(_as, [this] {
             return async([this] {
-                 wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
+                 wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();

                 if (any_nondefault_role_row_satisfies(_qp, &has_salted_hash).get0()) {
                     if (legacy_metadata_exists()) {
@@ -258,7 +182,7 @@ future<> password_authenticator::start() {

 future<> password_authenticator::stop() {
    _as.request_abort();
-    return _stopped.handle_exception_type([] (const sleep_aborted&) { });
+    return _stopped.handle_exception_type([] (const sleep_aborted&) { }).handle_exception_type([](const abort_requested_exception&) {});
 }

 db::consistency_level password_authenticator::consistency_for_user(stdx::string_view role_name) {
@@ -321,7 +245,7 @@ future<authenticated_user> password_authenticator::authenticate(
            if (!res->empty()) {
                salted_hash = res->one().get_opt<sstring>(SALTED_HASH);
            }
-            if (!salted_hash || !checkpw(password, *salted_hash)) {
+            if (!salted_hash || !passwords::check(password, *salted_hash)) {
                throw exceptions::authentication_exception("Username and/or password are incorrect");
            }
            return make_ready_future<authenticated_user>(username);
@@ -344,7 +268,7 @@ future<> password_authenticator::create(stdx::string_view role_name, const authe
            update_row_query,
            consistency_for_user(role_name),
            internal_distributed_timeout_config(),
-            {hashpw(*options.password), sstring(role_name)}).discard_result();
+            {passwords::hash(*options.password, rng_for_salt), sstring(role_name)}).discard_result();
 }

 future<> password_authenticator::alter(stdx::string_view role_name, const authentication_options& options) const {
@@ -362,7 +286,7 @@ future<> password_authenticator::alter(stdx::string_view role_name, const authen
            query,
            consistency_for_user(role_name),
            internal_distributed_timeout_config(),
-            {hashpw(*options.password), sstring(role_name)}).discard_result();
+            {passwords::hash(*options.password, rng_for_salt), sstring(role_name)}).discard_result();
 }

 future<> password_authenticator::drop(stdx::string_view name) const {
--- a/auth/passwords.cc
+++ b/auth/passwords.cc
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2018 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "auth/passwords.hh"
+
+#include <cerrno>
+#include <optional>
+
+extern "C" {
+#include <crypt.h>
+#include <unistd.h>
+}
+
+namespace auth::passwords {
+
+static thread_local crypt_data tlcrypt = { 0, };
+
+namespace detail {
+
+scheme identify_best_supported_scheme() {
+    const auto all_schemes = { scheme::bcrypt_y, scheme::bcrypt_a, scheme::sha_512, scheme::sha_256, scheme::md5 };
+    // "Random", for testing schemes.
+    const sstring random_part_of_salt = "aaaabbbbccccdddd";
+
+    for (scheme c : all_schemes) {
+        const sstring salt = sstring(prefix_for_scheme(c)) + random_part_of_salt;
+        const char* e = crypt_r("fisk", salt.c_str(), &tlcrypt);
+
+        if (e && (e[0] != '*')) {
+            return c;
+        }
+    }
+
+    throw no_supported_schemes();
+}
+
+sstring hash_with_salt(const sstring& pass, const sstring& salt) {
+    auto res = crypt_r(pass.c_str(), salt.c_str(), &tlcrypt);
+    if (!res || (res[0] == '*')) {
+        throw std::system_error(errno, std::system_category());
+    }
+    return res;
+}
+
+const char* prefix_for_scheme(scheme c) noexcept {
+    switch (c) {
+    case scheme::bcrypt_y: return "$2y$";
+    case scheme::bcrypt_a: return "$2a$";
+    case scheme::sha_512: return "$6$";
+    case scheme::sha_256: return "$5$";
+    case scheme::md5: return "$1$";
+    default: return nullptr;
+    }
+}
+
+} // namespace detail
+
+no_supported_schemes::no_supported_schemes()
+        : std::runtime_error("No allowed hashing schemes are supported on this system") {
+}
+
+bool check(const sstring& pass, const sstring& salted_hash) {
+    return detail::hash_with_salt(pass, salted_hash) == salted_hash;
+}
+
+} // namespace auth::passwords
--- a/auth/passwords.hh
+++ b/auth/passwords.hh
@@ -0,0 +1,125 @@
+/*
+ * Copyright (C) 2018 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <random>
+#include <stdexcept>
+
+#include <seastar/core/sstring.hh>
+
+#include "seastarx.hh"
+
+namespace auth::passwords {
+
+class no_supported_schemes : public std::runtime_error {
+public:
+    no_supported_schemes();
+};
+
+///
+/// Apache Cassandra uses a library to provide the bcrypt scheme. Many Linux implementations do not support bcrypt, so
+/// we support alternatives. The cost is loss of direct compatibility with Apache Cassandra system tables.
+///
+enum class scheme {
+    bcrypt_y,
+    bcrypt_a,
+    sha_512,
+    sha_256,
+    md5
+};
+
+namespace detail {
+
+template <typename RandomNumberEngine>
+sstring generate_random_salt_bytes(RandomNumberEngine& g) {
+    static const sstring valid_bytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789./";
+    static constexpr std::size_t num_bytes = 16;
+    std::uniform_int_distribution<std::size_t> dist(0, valid_bytes.size() - 1);
+    sstring result(num_bytes, 0);
+
+    for (char& c : result) {
+        c = valid_bytes[dist(g)];
+    }
+
+    return result;
+}
+
+///
+/// Test each allowed hashing scheme and report the best supported one on the current system.
+///
+/// \throws \ref no_supported_schemes when none of the known schemes is supported.
+///
+scheme identify_best_supported_scheme();
+
+const char* prefix_for_scheme(scheme) noexcept;
+
+///
+/// Generate an implementation-specific salt string for hashing passwords.
+///
+/// The `RandomNumberEngine` is used to generate the string, which has an implementation-specific length.
+///
+/// \throws \ref no_supported_schemes when no known hashing schemes are supported on the system.
+///
+template <typename RandomNumberEngine>
+sstring generate_salt(RandomNumberEngine& g) {
+    static const scheme scheme = identify_best_supported_scheme();
+    static const sstring prefix = sstring(prefix_for_scheme(scheme));
+    return prefix + generate_random_salt_bytes(g);
+}
+
+///
+/// Hash a password combined with an implementation-specific salt string.
+///
+/// \throws \ref std::system_error when an unexpected implementation-specific error occurs.
+///
+sstring hash_with_salt(const sstring& pass, const sstring& salt);
+
+} // namespace detail
+
+///
+/// Run a one-way hashing function on cleartext to produce a salted hash.
+///
+/// Prior to applying the hashing function, a random salt is combined with the cleartext. The random salt bytes are
+/// generated using the random number engine `g`.
+///
+/// The result contains both the hashed password and the salt used, in an implementation-specific format.
+///
+/// \throws \ref std::system_error when the underlying implementation fails to hash the cleartext.
+///
+template <typename RandomNumberEngine>
+sstring hash(const sstring& pass, RandomNumberEngine& g) {
+    return detail::hash_with_salt(pass, detail::generate_salt(g));
+}
+
+///
+/// Check that cleartext matches previously hashed cleartext with salt.
+///
+/// \ref salted_hash is the result of invoking \ref hash, which is the implementation-specific combination of the hashed
+/// password and the salt that was generated for it.
+///
+/// \returns `true` if the cleartext matches the salted hash.
+///
+/// \throws \ref std::system_error when an unexpected implementation-specific error occurs.
+///
+bool check(const sstring& pass, const sstring& salted_hash);
+
+} // namespace auth::passwords
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -184,7 +184,9 @@ future<> service::start() {
    return once_among_shards([this] {
        return create_keyspace_if_missing();
    }).then([this] {
-        return when_all_succeed(_role_manager->start(), _authorizer->start(), _authenticator->start());
+        return _role_manager->start().then([this] {
+            return when_all_succeed(_authorizer->start(), _authenticator->start());
+        });
    }).then([this] {
        _permissions_cache = std::make_unique<permissions_cache>(_permissions_cache_config, *this, log);
    }).then([this] {
--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -227,7 +227,7 @@ future<> standard_role_manager::start() {
        return this->create_metadata_tables_if_missing().then([this] {
            _stopped = auth::do_after_system_ready(_as, [this] {
                return seastar::async([this] {
-                    wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
+                    wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();

                    if (any_nondefault_role_row_satisfies(_qp, &has_can_login).get0()) {
                        if (this->legacy_metadata_exists()) {
@@ -251,7 +251,7 @@ future<> standard_role_manager::start() {

 future<> standard_role_manager::stop() {
    _as.request_abort();
-    return _stopped.handle_exception_type([] (const sleep_aborted&) { });
+    return _stopped.handle_exception_type([] (const sleep_aborted&) { }).handle_exception_type([](const abort_requested_exception&) {});;
 }

 future<> standard_role_manager::create_or_replace(stdx::string_view role_name, const role_config& c) const {
--- a/backlog_controller.hh
+++ b/backlog_controller.hh
@@ -77,7 +77,7 @@ protected:
        , _io_priority(iop)
        , _interval(interval)
        , _update_timer([this] { adjust(); })
-        , _control_points({{0,0}})
+        , _control_points()
        , _current_backlog(std::move(backlog))
        , _inflight_update(make_ready_future<>())
    {
@@ -125,7 +125,7 @@ public:
    flush_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, float static_shares) : backlog_controller(sg, iop, static_shares) {}
    flush_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, std::chrono::milliseconds interval, float soft_limit, std::function<float()> current_dirty)
        : backlog_controller(sg, iop, std::move(interval),
-          std::vector<backlog_controller::control_point>({{soft_limit, 10}, {soft_limit + (hard_dirty_limit - soft_limit) / 2, 200} , {hard_dirty_limit, 1000}}),
+          std::vector<backlog_controller::control_point>({{0.0, 0.0}, {soft_limit, 10}, {soft_limit + (hard_dirty_limit - soft_limit) / 2, 200} , {hard_dirty_limit, 1000}}),
          std::move(current_dirty)
        )
    {}
@@ -139,7 +139,7 @@ public:
    compaction_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, float static_shares) : backlog_controller(sg, iop, static_shares) {}
    compaction_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, std::chrono::milliseconds interval, std::function<float()> current_backlog)
        : backlog_controller(sg, iop, std::move(interval),
-          std::vector<backlog_controller::control_point>({{0.5, 10}, {1.5, 100} , {normalization_factor, 1000}}),
+          std::vector<backlog_controller::control_point>({{0.0, 50}, {1.5, 100} , {normalization_factor, 1000}}),
          std::move(current_backlog)
        )
    {}
--- a/bytes.hh
+++ b/bytes.hh
@@ -35,6 +35,10 @@ using bytes_mutable_view = basic_mutable_view<bytes_view::value_type>;
 using bytes_opt = std::experimental::optional<bytes>;
 using sstring_view = std::experimental::string_view;

+inline sstring_view to_sstring_view(bytes_view view) {
+    return {reinterpret_cast<const char*>(view.data()), view.size()};
+}
+
 namespace std {

 template <>
--- a/bytes_ostream.hh
+++ b/bytes_ostream.hh
@@ -38,7 +38,7 @@ class bytes_ostream {
 public:
    using size_type = bytes::size_type;
    using value_type = bytes::value_type;
-    static constexpr size_type max_chunk_size() { return 16 * 1024; }
+    static constexpr size_type max_chunk_size() { return 128 * 1024; }
 private:
    static_assert(sizeof(value_type) == 1, "value_type is assumed to be one byte long");
    struct chunk {
@@ -57,12 +57,12 @@ private:
        value_type data[0];
        void operator delete(void* ptr) { free(ptr); }
    };
-    // FIXME: consider increasing chunk size as the buffer grows
-    static constexpr size_type chunk_size{512};
+    static constexpr size_type default_chunk_size{512};
 private:
    std::unique_ptr<chunk> _begin;
    chunk* _current;
    size_type _size;
+    size_type _initial_chunk_size = default_chunk_size;
 public:
    class fragment_iterator : public std::iterator<std::input_iterator_tag, bytes_view> {
        chunk* _current = nullptr;
@@ -102,13 +102,13 @@ private:
    }
    // Figure out next chunk size.
    //   - must be enough for data_size
-    //   - must be at least chunk_size
+    //   - must be at least _initial_chunk_size
    //   - try to double each time to prevent too many allocations
    //   - do not exceed max_chunk_size
    size_type next_alloc_size(size_t data_size) const {
        auto next_size = _current
                ? _current->size * 2
-                : chunk_size;
+                : _initial_chunk_size;
        next_size = std::min(next_size, max_chunk_size());
        // FIXME: check for overflow?
        return std::max<size_type>(next_size, data_size + sizeof(chunk));
@@ -116,13 +116,19 @@ private:
    // Makes room for a contiguous region of given size.
    // The region is accounted for as already written.
    // size must not be zero.
+    [[gnu::always_inline]]
    value_type* alloc(size_type size) {
-        if (size <= current_space_left()) {
+        if (__builtin_expect(size <= current_space_left(), true)) {
            auto ret = _current->data + _current->offset;
            _current->offset += size;
            _size += size;
            return ret;
        } else {
+            return alloc_new(size);
+        }
+    }
+    [[gnu::noinline]]
+    value_type* alloc_new(size_type size) {
            auto alloc_size = next_alloc_size(size);
            auto space = malloc(alloc_size);
            if (!space) {
@@ -140,19 +146,22 @@ private:
            }
            _size += size;
            return _current->data;
-        };
    }
 public:
-    bytes_ostream() noexcept
+    explicit bytes_ostream(size_t initial_chunk_size) noexcept
        : _begin()
        , _current(nullptr)
        , _size(0)
+        , _initial_chunk_size(initial_chunk_size)
    { }

+    bytes_ostream() noexcept : bytes_ostream(default_chunk_size) {}
+
    bytes_ostream(bytes_ostream&& o) noexcept
        : _begin(std::move(o._begin))
        , _current(o._current)
        , _size(o._size)
+        , _initial_chunk_size(o._initial_chunk_size)
    {
        o._current = nullptr;
        o._size = 0;
@@ -162,6 +171,7 @@ public:
        : _begin()
        , _current(nullptr)
        , _size(0)
+        , _initial_chunk_size(o._initial_chunk_size)
    {
        append(o);
    }
@@ -199,18 +209,20 @@ public:
        return place_holder<T>{alloc(sizeof(T))};
    }

+    [[gnu::always_inline]]
    value_type* write_place_holder(size_type size) {
        return alloc(size);
    }

    // Writes given sequence of bytes
+    [[gnu::always_inline]]
    inline void write(bytes_view v) {
        if (v.empty()) {
            return;
        }

        auto this_size = std::min(v.size(), size_t(current_space_left()));
-        if (this_size) {
+        if (__builtin_expect(this_size, true)) {
            memcpy(_current->data + _current->offset, v.begin(), this_size);
            _current->offset += this_size;
            _size += this_size;
@@ -219,11 +231,12 @@ public:

        while (!v.empty()) {
            auto this_size = std::min(v.size(), size_t(max_chunk_size()));
-            std::copy_n(v.begin(), this_size, alloc(this_size));
+            std::copy_n(v.begin(), this_size, alloc_new(this_size));
            v.remove_prefix(this_size);
        }
    }

+    [[gnu::always_inline]]
    void write(const char* ptr, size_t size) {
        write(bytes_view(reinterpret_cast<const signed char*>(ptr), size));
    }
@@ -393,6 +406,21 @@ public:
    bool operator!=(const bytes_ostream& other) const {
        return !(*this == other);
    }
+
+    // Makes this instance empty.
+    //
+    // The first buffer is not deallocated, so callers may rely on the
+    // fact that if they write less than the initial chunk size between
+    // clear() calls, the writes will not involve any memory allocations,
+    // except for the first write made on this instance.
+    void clear() {
+        if (_begin) {
+            _begin->offset = 0;
+            _size = 0;
+            _current = _begin.get();
+            _begin->next.reset();
+        }
+    }
 };

 template<>
--- a/cache_flat_mutation_reader.hh
+++ b/cache_flat_mutation_reader.hh
@@ -61,11 +61,12 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
        // - _last_row points at a direct predecessor of the next row which is going to be read.
        //   Used for populating continuity.
        // - _population_range_starts_before_all_rows is set accordingly
+        // - _underlying is engaged and fast-forwarded
        reading_from_underlying,

        end_of_stream
    };
-    lw_shared_ptr<partition_snapshot> _snp;
+    partition_snapshot_ptr _snp;
    position_in_partition::tri_compare _position_cmp;

    query::clustering_key_filter_ranges _ck_ranges;
@@ -99,7 +100,13 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
    // forward progress is not guaranteed in case iterators are getting constantly invalidated.
    bool _lower_bound_changed = false;

+    // Points to the underlying reader conforming to _schema,
+    // either to *_underlying_holder or _read_context->underlying().underlying().
+    flat_mutation_reader* _underlying = nullptr;
+    std::optional<flat_mutation_reader> _underlying_holder;
+
    future<> do_fill_buffer(db::timeout_clock::time_point);
+    future<> ensure_underlying(db::timeout_clock::time_point);
    void copy_from_cache_to_buffer();
    future<> process_static_row(db::timeout_clock::time_point);
    void move_to_end();
@@ -137,7 +144,7 @@ public:
                               dht::decorated_key dk,
                               query::clustering_key_filter_ranges&& crr,
                               lw_shared_ptr<read_context> ctx,
-                               lw_shared_ptr<partition_snapshot> snp,
+                               partition_snapshot_ptr snp,
                               row_cache& cache)
        : flat_mutation_reader::impl(std::move(s))
        , _snp(std::move(snp))
@@ -157,9 +164,6 @@ public:
    cache_flat_mutation_reader(const cache_flat_mutation_reader&) = delete;
    cache_flat_mutation_reader(cache_flat_mutation_reader&&) = delete;
    virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override;
-    virtual ~cache_flat_mutation_reader() {
-        maybe_merge_versions(_snp, _lsa_manager.region(), _lsa_manager.read_section());
-    }
    virtual void next_partition() override {
        clear_buffer_to_next_partition();
        if (is_buffer_empty()) {
@@ -189,23 +193,22 @@ future<> cache_flat_mutation_reader::process_static_row(db::timeout_clock::time_
        return make_ready_future<>();
    } else {
        _read_context->cache().on_row_miss();
-        return _read_context->get_next_fragment(timeout).then([this] (mutation_fragment_opt&& sr) {
-            if (sr) {
-                assert(sr->is_static_row());
-                maybe_add_to_cache(sr->as_static_row());
-                push_mutation_fragment(std::move(*sr));
-            }
-            maybe_set_static_row_continuous();
+        return ensure_underlying(timeout).then([this, timeout] {
+            return (*_underlying)(timeout).then([this] (mutation_fragment_opt&& sr) {
+                if (sr) {
+                    assert(sr->is_static_row());
+                    maybe_add_to_cache(sr->as_static_row());
+                    push_mutation_fragment(std::move(*sr));
+                }
+                maybe_set_static_row_continuous();
+            });
        });
    }
 }

 inline
 void cache_flat_mutation_reader::touch_partition() {
-    if (_snp->at_latest_version()) {
-        rows_entry& last_dummy = *_snp->version()->partition().clustered_rows().rbegin();
-        _snp->tracker()->touch(last_dummy);
-    }
+    _snp->touch();
 }

 inline
@@ -235,14 +238,36 @@ future<> cache_flat_mutation_reader::fill_buffer(db::timeout_clock::time_point t
    });
 }

+inline
+future<> cache_flat_mutation_reader::ensure_underlying(db::timeout_clock::time_point timeout) {
+    if (_underlying) {
+        return make_ready_future<>();
+    }
+    return _read_context->ensure_underlying(timeout).then([this, timeout] {
+        flat_mutation_reader& ctx_underlying = _read_context->underlying().underlying();
+        if (ctx_underlying.schema() != _schema) {
+            _underlying_holder = make_delegating_reader(ctx_underlying);
+            _underlying_holder->upgrade_schema(_schema);
+            _underlying = &*_underlying_holder;
+        } else {
+            _underlying = &ctx_underlying;
+        }
+    });
+}
+
 inline
 future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_point timeout) {
    if (_state == state::move_to_underlying) {
+        if (!_underlying) {
+            return ensure_underlying(timeout).then([this, timeout] {
+                return do_fill_buffer(timeout);
+            });
+        }
        _state = state::reading_from_underlying;
        _population_range_starts_before_all_rows = _lower_bound.is_before_all_clustered_rows(*_schema);
        auto end = _next_row_in_range ? position_in_partition(_next_row.position())
                                      : position_in_partition(_upper_bound);
-        return _read_context->fast_forward_to(position_range{_lower_bound, std::move(end)}, timeout).then([this, timeout] {
+        return _underlying->fast_forward_to(position_range{_lower_bound, std::move(end)}, timeout).then([this, timeout] {
            return read_from_underlying(timeout);
        });
    }
@@ -283,7 +308,7 @@ future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_poin

 inline
 future<> cache_flat_mutation_reader::read_from_underlying(db::timeout_clock::time_point timeout) {
-    return consume_mutation_fragments_until(_read_context->underlying().underlying(),
+    return consume_mutation_fragments_until(*_underlying,
        [this] { return _state != state::reading_from_underlying || is_buffer_full(); },
        [this] (mutation_fragment mf) {
            _read_context->cache().on_row_miss();
@@ -678,7 +703,7 @@ inline flat_mutation_reader make_cache_flat_mutation_reader(schema_ptr s,
                                                            query::clustering_key_filter_ranges crr,
                                                            row_cache& cache,
                                                            lw_shared_ptr<cache::read_context> ctx,
-                                                            lw_shared_ptr<partition_snapshot> snp)
+                                                            partition_snapshot_ptr snp)
 {
    return make_flat_mutation_reader<cache::cache_flat_mutation_reader>(
        std::move(s), std::move(dk), std::move(crr), std::move(ctx), std::move(snp), cache);
--- a/cell_locking.hh
+++ b/cell_locking.hh
@@ -24,9 +24,9 @@
 #include <boost/intrusive/unordered_set.hpp>

 #include "utils/small_vector.hh"
-#include "fnv1a_hasher.hh"
 #include "mutation_fragment.hh"
 #include "mutation_partition.hh"
+#include "xx_hasher.hh"

 #include "db/timeout_clock.hh"

@@ -194,10 +194,10 @@ private:
            explicit hasher(const schema& s) : _schema(&s) { }

            size_t operator()(const cell_address& ca) const {
-                fnv1a_hasher hasher;
+                xx_hasher hasher;
                ca.position.feed_hash(hasher, *_schema);
                ::feed_hash(hasher, ca.id);
-                return hasher.finalize();
+                return static_cast<size_t>(hasher.finalize_uint64());
            }
            size_t operator()(const cell_entry& ce) const {
                return operator()(ce._address);
--- a/clustering_bounds_comparator.hh
+++ b/clustering_bounds_comparator.hh
@@ -22,6 +22,7 @@

 #pragma once

+#include <functional>
 #include "keys.hh"
 #include "schema.hh"
 #include "range.hh"
@@ -43,22 +44,20 @@ bound_kind invert_kind(bound_kind k);
 int32_t weight(bound_kind k);

 class bound_view {
+    const static thread_local clustering_key _empty_prefix;
+    std::reference_wrapper<const clustering_key_prefix> _prefix;
+    bound_kind _kind;
 public:
-    const static thread_local clustering_key empty_prefix;
-    const clustering_key_prefix& prefix;
-    bound_kind kind;
    bound_view(const clustering_key_prefix& prefix, bound_kind kind)
-        : prefix(prefix)
-        , kind(kind)
+        : _prefix(prefix)
+        , _kind(kind)
    { }
    bound_view(const bound_view& other) noexcept = default;
-    bound_view& operator=(const bound_view& other) noexcept {
-        if (this != &other) {
-            this->~bound_view();
-            new (this) bound_view(other);
-        }
-        return *this;
-    }
+    bound_view& operator=(const bound_view& other) noexcept = default;
+
+    bound_kind kind() const { return _kind; }
+    const clustering_key_prefix& prefix() const { return _prefix; }
+
    struct tri_compare {
        // To make it assignable and to avoid taking a schema_ptr, we
        // wrap the schema reference.
@@ -82,13 +81,13 @@ public:
            return d1 < d2 ? w1 - (w1 <= 0) : -(w2 - (w2 <= 0));
        }
        int operator()(const bound_view b, const clustering_key_prefix& p) const {
-            return operator()(b.prefix, weight(b.kind), p, 0);
+            return operator()(b._prefix, weight(b._kind), p, 0);
        }
        int operator()(const clustering_key_prefix& p, const bound_view b) const {
-            return operator()(p, 0, b.prefix, weight(b.kind));
+            return operator()(p, 0, b._prefix, weight(b._kind));
        }
        int operator()(const bound_view b1, const bound_view b2) const {
-            return operator()(b1.prefix, weight(b1.kind), b2.prefix, weight(b2.kind));
+            return operator()(b1._prefix, weight(b1._kind), b2._prefix, weight(b2._kind));
        }
    };
    struct compare {
@@ -101,26 +100,26 @@ public:
            return _cmp(p1, w1, p2, w2) < 0;
        }
        bool operator()(const bound_view b, const clustering_key_prefix& p) const {
-            return operator()(b.prefix, weight(b.kind), p, 0);
+            return operator()(b._prefix, weight(b._kind), p, 0);
        }
        bool operator()(const clustering_key_prefix& p, const bound_view b) const {
-            return operator()(p, 0, b.prefix, weight(b.kind));
+            return operator()(p, 0, b._prefix, weight(b._kind));
        }
        bool operator()(const bound_view b1, const bound_view b2) const {
-            return operator()(b1.prefix, weight(b1.kind), b2.prefix, weight(b2.kind));
+            return operator()(b1._prefix, weight(b1._kind), b2._prefix, weight(b2._kind));
        }
    };
    bool equal(const schema& s, const bound_view other) const {
-        return kind == other.kind && prefix.equal(s, other.prefix);
+        return _kind == other._kind && _prefix.get().equal(s, other._prefix.get());
    }
    bool adjacent(const schema& s, const bound_view other) const {
-        return invert_kind(other.kind) == kind && prefix.equal(s, other.prefix);
+        return invert_kind(other._kind) == _kind && _prefix.get().equal(s, other._prefix.get());
    }
    static bound_view bottom() {
-        return {empty_prefix, bound_kind::incl_start};
+        return {_empty_prefix, bound_kind::incl_start};
    }
    static bound_view top() {
-        return {empty_prefix, bound_kind::incl_end};
+        return {_empty_prefix, bound_kind::incl_end};
    }
    template<template<typename> typename R>
    GCC6_CONCEPT( requires Range<R, clustering_key_prefix_view> )
@@ -144,13 +143,13 @@ public:
    template<template<typename> typename R>
    GCC6_CONCEPT( requires Range<R, clustering_key_prefix_view> )
    static stdx::optional<typename R<clustering_key_prefix_view>::bound> to_range_bound(const bound_view& bv) {
-        if (&bv.prefix == &empty_prefix) {
+        if (&bv._prefix.get() == &_empty_prefix) {
            return {};
        }
-        bool inclusive = bv.kind != bound_kind::excl_end && bv.kind != bound_kind::excl_start;
-        return {typename R<clustering_key_prefix_view>::bound(bv.prefix.view(), inclusive)};
+        bool inclusive = bv._kind != bound_kind::excl_end && bv._kind != bound_kind::excl_start;
+        return {typename R<clustering_key_prefix_view>::bound(bv._prefix.get().view(), inclusive)};
    }
    friend std::ostream& operator<<(std::ostream& out, const bound_view& b) {
-        return out << "{bound: prefix=" << b.prefix << ", kind=" << b.kind << "}";
+        return out << "{bound: prefix=" << b._prefix.get() << ", kind=" << b._kind << "}";
    }
 };
--- a/clustering_key_filter.hh
+++ b/clustering_key_filter.hh
@@ -30,7 +30,7 @@ namespace query {

 class clustering_key_filter_ranges {
    clustering_row_ranges _storage;
-    const clustering_row_ranges& _ref;
+    std::reference_wrapper<const clustering_row_ranges> _ref;
 public:
    clustering_key_filter_ranges(const clustering_row_ranges& ranges) : _ref(ranges) { }
    struct reversed { };
@@ -39,21 +39,21 @@ public:

    clustering_key_filter_ranges(clustering_key_filter_ranges&& other) noexcept
        : _storage(std::move(other._storage))
-        , _ref(&other._ref == &other._storage ? _storage : other._ref)
+        , _ref(&other._ref.get() == &other._storage ? _storage : other._ref.get())
    { }

    clustering_key_filter_ranges& operator=(clustering_key_filter_ranges&& other) noexcept {
        if (this != &other) {
-            this->~clustering_key_filter_ranges();
-            new (this) clustering_key_filter_ranges(std::move(other));
+            _storage = std::move(other._storage);
+            _ref = (&other._ref.get() == &other._storage) ? _storage : other._ref.get();
        }
        return *this;
    }

-    auto begin() const { return _ref.begin(); }
-    auto end() const { return _ref.end(); }
-    bool empty() const { return _ref.empty(); }
-    size_t size() const { return _ref.size(); }
+    auto begin() const { return _ref.get().begin(); }
+    auto end() const { return _ref.get().end(); }
+    bool empty() const { return _ref.get().empty(); }
+    size_t size() const { return _ref.get().size(); }
    const clustering_row_ranges& ranges() const { return _ref; }

    static clustering_key_filter_ranges get_ranges(const schema& schema, const query::partition_slice& slice, const partition_key& key) {
--- a/clustering_ranges_walker.hh
+++ b/clustering_ranges_walker.hh
@@ -31,72 +31,61 @@
 class clustering_ranges_walker {
    const schema& _schema;
    const query::clustering_row_ranges& _ranges;
-    query::clustering_row_ranges::const_iterator _current;
-    query::clustering_row_ranges::const_iterator _end;
+    boost::iterator_range<query::clustering_row_ranges::const_iterator> _current_range;
    bool _in_current; // next position is known to be >= _current_start
    bool _with_static_row;
    position_in_partition_view _current_start;
    position_in_partition_view _current_end;
-    stdx::optional<position_in_partition> _trim;
+    std::optional<position_in_partition> _trim;
    size_t _change_counter = 1;
 private:
    bool advance_to_next_range() {
        _in_current = false;
        if (!_current_start.is_static_row()) {
-            if (_current == _end) {
+            if (!_current_range) {
                return false;
            }
-            ++_current;
+            _current_range.advance_begin(1);
        }
        ++_change_counter;
-        if (_current == _end) {
+        if (!_current_range) {
            _current_end = _current_start = position_in_partition_view::after_all_clustered_rows();
            return false;
        }
-        _current_start = position_in_partition_view::for_range_start(*_current);
-        _current_end = position_in_partition_view::for_range_end(*_current);
+        _current_start = position_in_partition_view::for_range_start(_current_range.front());
+        _current_end = position_in_partition_view::for_range_end(_current_range.front());
        return true;
    }
-public:
-    clustering_ranges_walker(const schema& s, const query::clustering_row_ranges& ranges, bool with_static_row = true)
-        : _schema(s)
-        , _ranges(ranges)
-        , _current(ranges.begin())
-        , _end(ranges.end())
-        , _in_current(with_static_row)
-        , _with_static_row(with_static_row)
-        , _current_start(position_in_partition_view::for_static_row())
-        , _current_end(position_in_partition_view::before_all_clustered_rows())
-    {
-        if (!with_static_row) {
-            if (_current == _end) {
+
+    void set_current_positions() {
+         if (!_with_static_row) {
+            if (!_current_range) {
                _current_start = position_in_partition_view::before_all_clustered_rows();
            } else {
-                _current_start = position_in_partition_view::for_range_start(*_current);
-                _current_end = position_in_partition_view::for_range_end(*_current);
+                _current_start = position_in_partition_view::for_range_start(_current_range.front());
+                _current_end = position_in_partition_view::for_range_end(_current_range.front());
            }
        }
    }
-    clustering_ranges_walker(clustering_ranges_walker&& o) noexcept
-        : _schema(o._schema)
-        , _ranges(o._ranges)
-        , _current(o._current)
-        , _end(o._end)
-        , _in_current(o._in_current)
-        , _with_static_row(o._with_static_row)
-        , _current_start(o._current_start)
-        , _current_end(o._current_end)
-        , _trim(std::move(o._trim))
-        , _change_counter(o._change_counter)
-    { }
-    clustering_ranges_walker& operator=(clustering_ranges_walker&& o) {
-        if (this != &o) {
-            this->~clustering_ranges_walker();
-            new (this) clustering_ranges_walker(std::move(o));
-        }
-        return *this;
+
+public:
+    clustering_ranges_walker(const schema& s, const query::clustering_row_ranges& ranges, bool with_static_row = true)
+            : _schema(s)
+            , _ranges(ranges)
+            , _current_range(ranges)
+            , _in_current(with_static_row)
+            , _with_static_row(with_static_row)
+            , _current_start(position_in_partition_view::for_static_row())
+            , _current_end(position_in_partition_view::before_all_clustered_rows()) {
+        set_current_positions();
    }

+    clustering_ranges_walker(const clustering_ranges_walker&) = delete;
+    clustering_ranges_walker(clustering_ranges_walker&&) = delete;
+
+    clustering_ranges_walker& operator=(const clustering_ranges_walker&) = delete;
+    clustering_ranges_walker& operator=(clustering_ranges_walker&&) = delete;
+
    // Excludes positions smaller than pos from the ranges.
    // pos should be monotonic.
    // No constraints between pos and positions passed to advance_to().
@@ -173,17 +162,15 @@ public:
            return false;
        }

-        auto i = _current;
-        while (i != _end) {
-            auto range_start = position_in_partition_view::for_range_start(*i);
+        for (const auto& rng : _current_range) {
+            auto range_start = position_in_partition_view::for_range_start(rng);
            if (!less(range_start, end)) {
                return false;
            }
-            auto range_end = position_in_partition_view::for_range_end(*i);
+            auto range_end = position_in_partition_view::for_range_end(rng);
            if (less(start, range_end)) {
                return true;
            }
-            ++i;
        }

        return false;
@@ -191,18 +178,20 @@ public:

    // Returns true if advanced past all contained positions. Any later advance_to() until reset() will return false.
    bool out_of_range() const {
-        return !_in_current && _current == _end;
+        return !_in_current && !_current_range;
    }

    // Resets the state of the walker so that advance_to() can be now called for new sequence of positions.
    // Any range trimmings still hold after this.
    void reset() {
-        auto trim = std::move(_trim);
-        auto ctr = _change_counter;
-        *this = clustering_ranges_walker(_schema, _ranges, _with_static_row);
-        _change_counter = ctr + 1;
-        if (trim) {
-            trim_front(std::move(*trim));
+        _current_range = _ranges;
+        _in_current = _with_static_row;
+        _current_start = position_in_partition_view::for_static_row();
+        _current_end = position_in_partition_view::before_all_clustered_rows();
+        set_current_positions();
+        ++_change_counter;
+        if (_trim) {
+            trim_front(*std::exchange(_trim, {}));
        }
    }

@@ -211,6 +200,11 @@ public:
        return _current_start;
    }

+    // Returns the upper bound of the last range in the provided set of ranges.
+    position_in_partition_view uppermost_bound() const {
+        return position_in_partition_view::for_range_end(_ranges.back());
+    }
+
    // When lower_bound() changes, this also does
    // Always > 0.
    size_t lower_bound_change_counter() const {
--- a/compatible_ring_position.hh
+++ b/compatible_ring_position.hh
@@ -1,67 +0,0 @@
-/*
- * Copyright (C) 2016 ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-#pragma once
-
-#include "query-request.hh"
-#include <experimental/optional>
-
-// Wraps ring_position so it is compatible with old-style C++: default constructor,
-// stateless comparators, yada yada
-class compatible_ring_position {
-    const schema* _schema = nullptr;
-    // optional to supply a default constructor, no more
-    std::experimental::optional<dht::ring_position> _rp;
-public:
-    compatible_ring_position() noexcept = default;
-    compatible_ring_position(const schema& s, const dht::ring_position& rp)
-            : _schema(&s), _rp(rp) {
-    }
-    compatible_ring_position(const schema& s, dht::ring_position&& rp)
-            : _schema(&s), _rp(std::move(rp)) {
-    }
-    const dht::token& token() const {
-        return _rp->token();
-    }
-    friend int tri_compare(const compatible_ring_position& x, const compatible_ring_position& y) {
-        return x._rp->tri_compare(*x._schema, *y._rp);
-    }
-    friend bool operator<(const compatible_ring_position& x, const compatible_ring_position& y) {
-        return tri_compare(x, y) < 0;
-    }
-    friend bool operator<=(const compatible_ring_position& x, const compatible_ring_position& y) {
-        return tri_compare(x, y) <= 0;
-    }
-    friend bool operator>(const compatible_ring_position& x, const compatible_ring_position& y) {
-        return tri_compare(x, y) > 0;
-    }
-    friend bool operator>=(const compatible_ring_position& x, const compatible_ring_position& y) {
-        return tri_compare(x, y) >= 0;
-    }
-    friend bool operator==(const compatible_ring_position& x, const compatible_ring_position& y) {
-        return tri_compare(x, y) == 0;
-    }
-    friend bool operator!=(const compatible_ring_position& x, const compatible_ring_position& y) {
-        return tri_compare(x, y) != 0;
-    }
-};
-
--- a/compatible_ring_position_view.hh
+++ b/compatible_ring_position_view.hh
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2016 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#pragma once
+
+#include "query-request.hh"
+#include <optional>
+
+// Wraps ring_position_view so it is compatible with old-style C++: default
+// constructor, stateless comparators, yada yada.
+class compatible_ring_position_view {
+    const schema* _schema = nullptr;
+    // Optional to supply a default constructor, no more.
+    std::optional<dht::ring_position_view> _rpv;
+public:
+    constexpr compatible_ring_position_view() = default;
+    compatible_ring_position_view(const schema& s, dht::ring_position_view rpv)
+        : _schema(&s), _rpv(rpv) {
+    }
+    const dht::ring_position_view& position() const {
+        return *_rpv;
+    }
+    friend int tri_compare(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
+        return dht::ring_position_tri_compare(*x._schema, *x._rpv, *y._rpv);
+    }
+    friend bool operator<(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
+        return tri_compare(x, y) < 0;
+    }
+    friend bool operator<=(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
+        return tri_compare(x, y) <= 0;
+    }
+    friend bool operator>(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
+        return tri_compare(x, y) > 0;
+    }
+    friend bool operator>=(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
+        return tri_compare(x, y) >= 0;
+    }
+    friend bool operator==(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
+        return tri_compare(x, y) == 0;
+    }
+    friend bool operator!=(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
+        return tri_compare(x, y) != 0;
+    }
+};
+
--- a/compress.cc
+++ b/compress.cc
@@ -112,7 +112,7 @@ const sstring compression_parameters::CHUNK_LENGTH_KB = "chunk_length_kb";
 const sstring compression_parameters::CRC_CHECK_CHANCE = "crc_check_chance";

 compression_parameters::compression_parameters()
-    : compression_parameters(nullptr)
+    : compression_parameters(compressor::lz4)
 {}

 compression_parameters::~compression_parameters()
--- a/compress.hh
+++ b/compress.hh
@@ -118,6 +118,10 @@ public:
    std::map<sstring, sstring> get_options() const;
    bool operator==(const compression_parameters& other) const;
    bool operator!=(const compression_parameters& other) const;
+
+    static compression_parameters no_compression() {
+        return compression_parameters(nullptr);
+    }
 private:
    void validate_options(const std::map<sstring, sstring>&);
 };
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -242,6 +242,9 @@ batch_size_fail_threshold_in_kb: 50

 # The directory where hints files are stored if hinted handoff is enabled.
 # hints_directory: /var/lib/scylla/hints
+
+# The directory where hints files for materialized-view updates are stored.
+# view_hints_directory: /var/lib/scylla/view_hints

 # See http://wiki.apache.org/cassandra/HintedHandoff
 # May either be "true" or "false" to enable globally, or contain a list
--- a/configure.py
+++ b/configure.py
--- a/converting_mutation_partition_applier.hh
+++ b/converting_mutation_partition_applier.hh
@@ -38,44 +38,44 @@ private:
    static bool is_compatible(const column_definition& new_def, const data_type& old_type, column_kind kind) {
        return ::is_compatible(new_def.kind, kind) && new_def.type->is_value_compatible_with(*old_type);
    }
+    static atomic_cell upgrade_cell(const abstract_type& new_type, const abstract_type& old_type, atomic_cell_view cell,
+                                    atomic_cell::collection_member cm = atomic_cell::collection_member::no) {
+        if (cell.is_live() && !old_type.is_counter()) {
+            if (cell.is_live_and_has_ttl()) {
+                return atomic_cell::make_live(new_type, cell.timestamp(), cell.value().linearize(), cell.expiry(), cell.ttl(), cm);
+            }
+            return atomic_cell::make_live(new_type, cell.timestamp(), cell.value().linearize(), cm);
+        } else {
+            return atomic_cell(new_type, cell);
+        }
+    }
    static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, atomic_cell_view cell) {
        if (!is_compatible(new_def, old_type, kind) || cell.timestamp() <= new_def.dropped_at()) {
            return;
        }
-        auto new_cell = [&] {
-            if (cell.is_live() && !old_type->is_counter()) {
-                if (cell.is_live_and_has_ttl()) {
-                    return atomic_cell_or_collection(
-                        atomic_cell::make_live(*new_def.type, cell.timestamp(), cell.value().linearize(), cell.expiry(), cell.ttl())
-                    );
-                }
-                return atomic_cell_or_collection(
-                    atomic_cell::make_live(*new_def.type, cell.timestamp(), cell.value().linearize())
-                );
-            } else {
-                return atomic_cell_or_collection(*new_def.type, cell);
-            }
-        }();
-        dst.apply(new_def, std::move(new_cell));
+        dst.apply(new_def, upgrade_cell(*new_def.type, *old_type, cell));
    }
    static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, collection_mutation_view cell) {
        if (!is_compatible(new_def, old_type, kind)) {
            return;
        }
      cell.data.with_linearized([&] (bytes_view cell_bv) {
-        auto&& ctype = static_pointer_cast<const collection_type_impl>(old_type);
-        auto old_view = ctype->deserialize_mutation_form(cell_bv);
+        auto new_ctype = static_pointer_cast<const collection_type_impl>(new_def.type);
+        auto old_ctype = static_pointer_cast<const collection_type_impl>(old_type);
+        auto old_view = old_ctype->deserialize_mutation_form(cell_bv);

-        collection_type_impl::mutation_view new_view;
+        collection_type_impl::mutation new_view;
        if (old_view.tomb.timestamp > new_def.dropped_at()) {
            new_view.tomb = old_view.tomb;
        }
        for (auto& c : old_view.cells) {
            if (c.second.timestamp() > new_def.dropped_at()) {
-                new_view.cells.emplace_back(std::move(c));
+                new_view.cells.emplace_back(c.first, upgrade_cell(*new_ctype->value_comparator(), *old_ctype->value_comparator(), c.second, atomic_cell::collection_member::yes));
            }
        }
-        dst.apply(new_def, ctype->serialize_mutation_form(std::move(new_view)));
+        if (new_view.tomb || !new_view.cells.empty()) {
+            dst.apply(new_def, new_ctype->serialize_mutation_form(std::move(new_view)));
+        }
      });
    }
 public:
@@ -92,6 +92,10 @@ public:
        _p.apply(t);
    }

+    void accept_static_cell(column_id id, atomic_cell cell) {
+        return accept_static_cell(id, atomic_cell_view(cell));
+    }
+
    virtual void accept_static_cell(column_id id, atomic_cell_view cell) override {
        const column_mapping_entry& col = _visited_column_mapping.static_column_at(id);
        const column_definition* def = _p_schema.get_column_definition(col.name());
@@ -119,6 +123,10 @@ public:
        _current_row = &r;
    }

+    void accept_row_cell(column_id id, atomic_cell cell) {
+        return accept_row_cell(id, atomic_cell_view(cell));
+    }
+
    virtual void accept_row_cell(column_id id, atomic_cell_view cell) override {
        const column_mapping_entry& col = _visited_column_mapping.regular_column_at(id);
        const column_definition* def = _p_schema.get_column_definition(col.name());
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -470,6 +470,7 @@ insertStatement returns [::shared_ptr<raw::modification_statement> expr]
        std::vector<::shared_ptr<cql3::column_identifier::raw>> column_names;
        std::vector<::shared_ptr<cql3::term::raw>> values;
        bool if_not_exists = false;
+        bool default_unset = false;
        ::shared_ptr<cql3::term::raw> json_value;
    }
    : K_INSERT K_INTO cf=columnFamilyName
@@ -487,13 +488,15 @@ insertStatement returns [::shared_ptr<raw::modification_statement> expr]
              }
        | K_JSON
          json_token=jsonValue { json_value = $json_token.value; }
+            ( K_DEFAULT K_UNSET { default_unset = true; } | K_DEFAULT K_NULL )?
            ( K_IF K_NOT K_EXISTS { if_not_exists = true; } )?
            ( usingClause[attrs] )?
              {
              $expr = ::make_shared<raw::insert_json_statement>(std::move(cf),
                                                       std::move(attrs),
                                                       std::move(json_value),
-                                                       if_not_exists);
+                                                       if_not_exists,
+                                                       default_unset);
              }
        )
    ;
@@ -1531,12 +1534,22 @@ inMarkerForTuple returns [shared_ptr<cql3::tuples::in_raw> marker]
    | ':' name=ident { $marker = new_tuple_in_bind_variables(name); }
    ;

-comparatorType returns [shared_ptr<cql3_type::raw> t]
-    : n=native_type     { $t = cql3_type::raw::from(n); }
-    | c=collection_type { $t = c; }
-    | tt=tuple_type     { $t = tt; }
+// The comparator_type rule is used for users' queries (internal=false)
+// and for internal calls from db::cql_type_parser::parse() (internal=true).
+// The latter is used for reading schemas stored in the system tables, and
+// may support additional column types that cannot be created through CQL,
+// but only internally through code. Today the only such type is "empty":
+// Scylla code internally creates columns with type "empty" or collections
+// of "empty" to represent unselected columns in materialized views.
+// If a user (internal=false) tries to use "empty" as a type, it is treated -
+// like all unknown types - as an attempt to use a user-defined type, and
+// we report that this name is reserved (as for _reserved_type_names()).
+comparator_type [bool internal] returns [shared_ptr<cql3_type::raw> t]
+    : n=native_or_internal_type[internal]     { $t = cql3_type::raw::from(n); }
+    | c=collection_type[internal]   { $t = c; }
+    | tt=tuple_type[internal]       { $t = tt; }
    | id=userTypeName   { $t = cql3::cql3_type::raw::user_type(id); }
-    | K_FROZEN '<' f=comparatorType '>'
+    | K_FROZEN '<' f=comparator_type[internal] '>'
      {
        try {
            $t = cql3::cql3_type::raw::frozen(f);
@@ -1558,6 +1571,22 @@ comparatorType returns [shared_ptr<cql3_type::raw> t]
 #endif
    ;

+native_or_internal_type [bool internal] returns [shared_ptr<cql3_type> t]
+    : n=native_type     { $t = n; }
+    // The "internal" types, only supported when internal==true:
+    | K_EMPTY   {
+        if (internal) {
+            $t = cql3_type::empty;
+        } else {
+            add_recognition_error("Invalid (reserved) user type name empty");
+        }
+      }
+    ;
+
+comparatorType returns [shared_ptr<cql3_type::raw> t]
+    : tt=comparator_type[false]    { $t = tt; }
+    ;
+
 native_type returns [shared_ptr<cql3_type> t]
    : K_ASCII     { $t = cql3_type::ascii; }
    | K_BIGINT    { $t = cql3_type::bigint; }
@@ -1582,24 +1611,24 @@ native_type returns [shared_ptr<cql3_type> t]
    | K_TIME      { $t = cql3_type::time; }
    ;

-collection_type returns [shared_ptr<cql3::cql3_type::raw> pt]
-    : K_MAP  '<' t1=comparatorType ',' t2=comparatorType '>'
+collection_type [bool internal] returns [shared_ptr<cql3::cql3_type::raw> pt]
+    : K_MAP  '<' t1=comparator_type[internal] ',' t2=comparator_type[internal] '>'
        {
            // if we can't parse either t1 or t2, antlr will "recover" and we may have t1 or t2 null.
            if (t1 && t2) {
                $pt = cql3::cql3_type::raw::map(t1, t2);
            }
        }
-    | K_LIST '<' t=comparatorType '>'
+    | K_LIST '<' t=comparator_type[internal] '>'
        { if (t) { $pt = cql3::cql3_type::raw::list(t); } }
-    | K_SET  '<' t=comparatorType '>'
+    | K_SET  '<' t=comparator_type[internal] '>'
        { if (t) { $pt = cql3::cql3_type::raw::set(t); } }
    ;

-tuple_type returns [shared_ptr<cql3::cql3_type::raw> t]
+tuple_type [bool internal] returns [shared_ptr<cql3::cql3_type::raw> t]
        @init{ std::vector<shared_ptr<cql3::cql3_type::raw>> types; }
    : K_TUPLE '<'
-         t1=comparatorType { types.push_back(t1); } (',' tn=comparatorType { types.push_back(tn); })*
+         t1=comparator_type[internal] { types.push_back(t1); } (',' tn=comparator_type[internal] { types.push_back(tn); })*
      '>' { $t = cql3::cql3_type::raw::tuple(std::move(types)); }
    ;

@@ -1625,7 +1654,7 @@ unreserved_keyword returns [sstring str]

 unreserved_function_keyword returns [sstring str]
    : u=basic_unreserved_keyword { $str = u; }
-    | t=native_type              { $str = t->to_string(); }
+    | t=native_or_internal_type[true]   { $str = t->to_string(); }
    ;

 basic_unreserved_keyword returns [sstring str]
@@ -1809,6 +1838,10 @@ K_OR:          O R;
 K_REPLACE:     R E P L A C E;
 K_DETERMINISTIC: D E T E R M I N I S T I C;
 K_JSON:        J S O N;
+K_DEFAULT:     D E F A U L T;
+K_UNSET:       U N S E T;
+
+K_EMPTY:       E M P T Y;

 K_SCYLLA_TIMEUUID_LIST_INDEX: S C Y L L A '_' T I M E U U I D '_' L I S T '_' I N D E X;
 K_SCYLLA_COUNTER_SHARD_LIST: S C Y L L A '_' C O U N T E R '_' S H A R D '_' L I S T; 
--- a/cql3/attributes.cc
+++ b/cql3/attributes.cc
@@ -77,12 +77,14 @@ int64_t attributes::get_timestamp(int64_t now, const query_options& options) {
    if (tval.is_unset_value()) {
        return now;
    }
+  return with_linearized(*tval, [] (bytes_view val) {
    try {
-        data_type_for<int64_t>()->validate(*tval);
+        data_type_for<int64_t>()->validate(val);
    } catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception("Invalid timestamp value");
    }
-    return value_cast<int64_t>(data_type_for<int64_t>()->deserialize(*tval));
+    return value_cast<int64_t>(data_type_for<int64_t>()->deserialize(val));
+  });
 }

 int32_t attributes::get_time_to_live(const query_options& options) {
@@ -96,14 +98,16 @@ int32_t attributes::get_time_to_live(const query_options& options) {
    if (tval.is_unset_value()) {
        return 0;
    }
+  auto ttl = with_linearized(*tval, [] (bytes_view val) {
    try {
-        data_type_for<int32_t>()->validate(*tval);
+        data_type_for<int32_t>()->validate(val);
    }
    catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception("Invalid TTL value");
    }

-    auto ttl = value_cast<int32_t>(data_type_for<int32_t>()->deserialize(*tval));
+    return value_cast<int32_t>(data_type_for<int32_t>()->deserialize(val));
+  });
    if (ttl < 0) {
        throw exceptions::invalid_request_exception("A TTL must be greater or equal to 0");
    }
--- a/cql3/column_identifier.cc
+++ b/cql3/column_identifier.cc
@@ -127,7 +127,11 @@ column_identifier::new_selector_factory(database& db, schema_ptr schema, std::ve
    if (!def) {
        throw exceptions::invalid_request_exception(sprint("Undefined name %s in selection clause", _text));
    }
-
+    // Do not allow explicitly selecting hidden columns. We also skip them on
+    // "SELECT *" (see selection::wildcard()).
+    if (def->is_view_virtual()) {
+        throw exceptions::invalid_request_exception(sprint("Undefined name %s in selection clause", _text));
+    }
    return selection::simple_selector::new_factory(def->name_as_text(), add_and_get_index(*def, defs), def->type);
 }

--- a/cql3/constants.hh
+++ b/cql3/constants.hh
@@ -225,7 +225,9 @@ public:
            } else if (value.is_unset_value()) {
                return;
            }
-            auto increment = value_cast<int64_t>(long_type->deserialize_value(*value));
+            auto increment = with_linearized(*value, [] (bytes_view value_view) {
+                return value_cast<int64_t>(long_type->deserialize_value(value_view));
+            });
            m.set_cell(prefix, column, make_counter_update_cell(increment, params));
        }
    };
@@ -240,7 +242,9 @@ public:
            } else if (value.is_unset_value()) {
                return;
            }
-            auto increment = value_cast<int64_t>(long_type->deserialize_value(*value));
+            auto increment = with_linearized(*value, [] (bytes_view value_view) {
+                return value_cast<int64_t>(long_type->deserialize_value(value_view));
+            });
            if (increment == std::numeric_limits<int64_t>::min()) {
                throw exceptions::invalid_request_exception(sprint("The negation of %d overflows supported counter precision (signed 8 bytes integer)", increment));
            }
--- a/cql3/functions/functions.cc
+++ b/cql3/functions/functions.cc
@@ -461,9 +461,9 @@ function_call::make_terminal(shared_ptr<function> fun, cql3::raw_value result, c
    }

    auto ctype = static_pointer_cast<const collection_type_impl>(fun->return_type());
-    bytes_view res;
+    fragmented_temporary_buffer::view res;
    if (result) {
-        res = *result;
+        res = fragmented_temporary_buffer::view(bytes_view(*result));
    }
    if (&ctype->_kind == &collection_type_impl::kind::list) {
        return make_shared(lists::value::from_serialized(std::move(res), static_pointer_cast<const list_type_impl>(ctype), sf));
--- a/cql3/lists.cc
+++ b/cql3/lists.cc
@@ -115,11 +115,12 @@ lists::literal::to_string() const {
 }

 lists::value
-lists::value::from_serialized(bytes_view v, list_type type, cql_serialization_format sf) {
+lists::value::from_serialized(const fragmented_temporary_buffer::view& val, list_type type, cql_serialization_format sf) {
    try {
        // Collections have this small hack that validate cannot be called on a serialized object,
        // but compose does the validation (so we're fine).
        // FIXME: deserializeForNativeProtocol()?!
+      return with_linearized(val, [&] (bytes_view v) {
        auto l = value_cast<list_type_impl::native_type>(type->deserialize(v, sf));
        std::vector<bytes_opt> elements;
        elements.reserve(l.size());
@@ -128,6 +129,7 @@ lists::value::from_serialized(bytes_view v, list_type type, cql_serialization_fo
            elements.push_back(element.is_null() ? bytes_opt() : bytes_opt(type->get_elements_type()->decompose(element)));
        }
        return value(std::move(elements));
+      });
    } catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception(e.what());
    }
@@ -285,7 +287,9 @@ lists::setter_by_index::execute(mutation& m, const clustering_key_prefix& prefix
        return;
    }

-    auto idx = net::ntoh(int32_t(*unaligned_cast<int32_t>(index->begin())));
+    auto idx = with_linearized(*index, [] (bytes_view v) {
+        return value_cast<int32_t>(data_type_for<int32_t>()->deserialize(v));
+    });
    auto&& existing_list_opt = params.get_prefetched_list(m.key().view(), prefix.view(), column);
    if (!existing_list_opt) {
        throw exceptions::invalid_request_exception("Attempted to set an element on a list which is null");
--- a/cql3/lists.hh
+++ b/cql3/lists.hh
@@ -79,7 +79,7 @@ public:
        explicit value(std::vector<bytes_opt> elements)
            : _elements(std::move(elements)) {
        }
-        static value from_serialized(bytes_view v, list_type type, cql_serialization_format sf);
+        static value from_serialized(const fragmented_temporary_buffer::view& v, list_type type, cql_serialization_format sf);
        virtual cql3::raw_value get(const query_options& options) override;
        virtual bytes get_with_protocol_version(cql_serialization_format sf) override;
        bool equals(shared_ptr<list_type_impl> lt, const value& v);
--- a/cql3/maps.cc
+++ b/cql3/maps.cc
@@ -152,18 +152,20 @@ maps::literal::to_string() const {
 }

 maps::value
-maps::value::from_serialized(bytes_view value, map_type type, cql_serialization_format sf) {
+maps::value::from_serialized(const fragmented_temporary_buffer::view& fragmented_value, map_type type, cql_serialization_format sf) {
    try {
        // Collections have this small hack that validate cannot be called on a serialized object,
        // but compose does the validation (so we're fine).
        // FIXME: deserialize_for_native_protocol?!
+      return with_linearized(fragmented_value, [&] (bytes_view value) {
        auto m = value_cast<map_type_impl::native_type>(type->deserialize(value, sf));
        std::map<bytes, bytes, serialized_compare> map(type->get_keys_type()->as_less_comparator());
        for (auto&& e : m) {
            map.emplace(type->get_keys_type()->decompose(e.first),
                        type->get_values_type()->decompose(e.second));
        }
-        return { std::move(map) };
+        return maps::value { std::move(map) };
+      });
    } catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception(e.what());
    }
@@ -233,10 +235,10 @@ maps::delayed_value::bind(const query_options& options) {
        if (key_bytes.is_unset_value()) {
            throw exceptions::invalid_request_exception("unset value is not supported inside collections");
        }
-        if (key_bytes->size() > std::numeric_limits<uint16_t>::max()) {
+        if (key_bytes->size_bytes() > std::numeric_limits<uint16_t>::max()) {
            throw exceptions::invalid_request_exception(sprint("Map key is too long. Map keys are limited to %d bytes but %d bytes keys provided",
                                                   std::numeric_limits<uint16_t>::max(),
-                                                   key_bytes->size()));
+                                                   key_bytes->size_bytes()));
        }
        auto value_bytes = value->bind_and_get(options);
        if (value_bytes.is_null()) {
@@ -331,7 +333,7 @@ maps::do_put(mutation& m, const clustering_key_prefix& prefix, const update_para

        auto ctype = static_pointer_cast<const map_type_impl>(column.type);
        for (auto&& e : map_value->map) {
-            mut.cells.emplace_back(e.first, params.make_cell(*ctype->get_values_type(), e.second, atomic_cell::collection_member::yes));
+            mut.cells.emplace_back(e.first, params.make_cell(*ctype->get_values_type(), fragmented_temporary_buffer::view(e.second), atomic_cell::collection_member::yes));
        }
        auto col_mut = ctype->serialize_mutation_form(std::move(mut));
        m.set_cell(prefix, column, std::move(col_mut));
@@ -342,7 +344,7 @@ maps::do_put(mutation& m, const clustering_key_prefix& prefix, const update_para
        } else {
            auto v = map_type_impl::serialize_partially_deserialized_form({map_value->map.begin(), map_value->map.end()},
                    cql_serialization_format::internal());
-            m.set_cell(prefix, column, params.make_cell(*column.type, std::move(v)));
+            m.set_cell(prefix, column, params.make_cell(*column.type, fragmented_temporary_buffer::view(std::move(v))));
        }
    }
 }
--- a/cql3/maps.hh
+++ b/cql3/maps.hh
@@ -81,7 +81,7 @@ public:
        value(std::map<bytes, bytes, serialized_compare> map)
            : map(std::move(map)) {
        }
-        static value from_serialized(bytes_view value, map_type type, cql_serialization_format sf);
+        static value from_serialized(const fragmented_temporary_buffer::view& value, map_type type, cql_serialization_format sf);
        virtual cql3::raw_value get(const query_options& options) override;
        virtual bytes get_with_protocol_version(cql_serialization_format sf);
        bool equals(map_type mt, const value& v);
--- a/cql3/operation.hh
+++ b/cql3/operation.hh
@@ -92,6 +92,10 @@ public:
    }

    static atomic_cell make_cell(const abstract_type& type, bytes_view value, const update_parameters& params) {
+        return params.make_cell(type, fragmented_temporary_buffer::view(value));
+    }
+
+    static atomic_cell make_cell(const abstract_type& type, const fragmented_temporary_buffer::view& value, const update_parameters& params) {
        return params.make_cell(type, value);
    }

--- a/cql3/query_options.cc
+++ b/cql3/query_options.cc
@@ -130,84 +130,49 @@ query_options::query_options(std::unique_ptr<query_options> qo, ::shared_ptr<ser

 }

+query_options::query_options(std::unique_ptr<query_options> qo, ::shared_ptr<service::pager::paging_state> paging_state, int32_t page_size)
+        : query_options(qo->_consistency,
+        qo->get_timeout_config(),
+        std::move(qo->_names),
+        std::move(qo->_values),
+        std::move(qo->_value_views),
+        qo->_skip_metadata,
+        std::move(query_options::specific_options{page_size, paging_state, qo->_options.serial_consistency, qo->_options.timestamp}),
+        qo->_cql_serialization_format) {
+
+}
+
 query_options::query_options(std::vector<cql3::raw_value> values)
    : query_options(
          db::consistency_level::ONE, infinite_timeout_config, std::move(values))
 {}

-db::consistency_level query_options::get_consistency() const
-{
-    return _consistency;
-}
-
-cql3::raw_value_view query_options::get_value_at(size_t idx) const
-{
-    return _value_views.at(idx);
-}
-
-size_t query_options::get_values_count() const
-{
-    return _value_views.size();
-}
-
 cql3::raw_value_view query_options::make_temporary(cql3::raw_value value) const
 {
    if (value) {
-        _temporaries.emplace_back(value->begin(), value->end());
-        auto& temporary = _temporaries.back();
-        return cql3::raw_value_view::make_value(bytes_view{temporary.data(), temporary.size()});
+        auto value_view = *value;
+        auto ptr = _temporaries.write_place_holder(value_view.size());
+        std::copy_n(value_view.data(), value_view.size(), ptr);
+        return cql3::raw_value_view::make_value(fragmented_temporary_buffer::view(bytes_view{ptr, value_view.size()}));
    }
    return cql3::raw_value_view::make_null();
 }

-bool query_options::skip_metadata() const
+bytes_view query_options::linearize(fragmented_temporary_buffer::view view) const
 {
-    return _skip_metadata;
-}
-
-int32_t query_options::get_page_size() const
-{
-    return get_specific_options().page_size;
-}
-
-::shared_ptr<service::pager::paging_state> query_options::get_paging_state() const
-{
-    return get_specific_options().state;
-}
-
-std::experimental::optional<db::consistency_level> query_options::get_serial_consistency() const
-{
-    return get_specific_options().serial_consistency;
-}
-
-api::timestamp_type query_options::get_timestamp(service::query_state& state) const
-{
-    auto tstamp = get_specific_options().timestamp;
-    return tstamp != api::missing_timestamp ? tstamp : state.get_timestamp();
-}
-
-int query_options::get_protocol_version() const
-{
-    return _cql_serialization_format.protocol_version();
-}
-
-cql_serialization_format query_options::get_cql_serialization_format() const
-{
-    return _cql_serialization_format;
-}
-
-const query_options::specific_options& query_options::get_specific_options() const
-{
-    return _options;
-}
-
-const query_options& query_options::for_statement(size_t i) const
-{
-    if (!_batch_options) {
-        // No per-statement options supplied, so use the "global" options
-        return *this;
+    if (view.empty()) {
+        return { };
+    } else if (std::next(view.begin()) == view.end()) {
+        return *view.begin();
+    } else {
+        auto ptr = _temporaries.write_place_holder(view.size_bytes());
+        auto dst = ptr;
+        using boost::range::for_each;
+        for_each(view, [&] (bytes_view bv) {
+            dst = std::copy(bv.begin(), bv.end(), dst);
+        });
+        return bytes_view(ptr, view.size_bytes());
    }
-    return _batch_options->at(i);
 }

 void query_options::prepare(const std::vector<::shared_ptr<column_specification>>& specs)
@@ -234,11 +199,7 @@ void query_options::prepare(const std::vector<::shared_ptr<column_specification>
 void query_options::fill_value_views()
 {
    for (auto&& value : _values) {
-        if (value) {
-            _value_views.emplace_back(cql3::raw_value_view::make_value(bytes_view{*value}));
-        } else {
-            _value_views.emplace_back(cql3::raw_value_view::make_null());
-        }
+        _value_views.emplace_back(value.to_view());
    }
 }

--- a/cql3/query_options.hh
+++ b/cql3/query_options.hh
@@ -75,7 +75,7 @@ private:
    const std::experimental::optional<std::vector<sstring_view>> _names;
    std::vector<cql3::raw_value> _values;
    std::vector<cql3::raw_value_view> _value_views;
-    mutable std::vector<std::vector<int8_t>> _temporaries;
+    mutable bytes_ostream _temporaries;
    const bool _skip_metadata;
    const specific_options _options;
    cql_serialization_format _cql_serialization_format;
@@ -102,7 +102,7 @@ private:

 public:
    query_options(query_options&&) = default;
-    query_options(const query_options&) = delete;
+    explicit query_options(const query_options&) = default;

    explicit query_options(db::consistency_level consistency,
                           const timeout_config& timeouts,
@@ -155,34 +155,78 @@ public:
    explicit query_options(db::consistency_level, const timeout_config& timeouts,
            std::vector<cql3::raw_value> values, specific_options options = specific_options::DEFAULT);
    explicit query_options(std::unique_ptr<query_options>, ::shared_ptr<service::pager::paging_state> paging_state);
+    explicit query_options(std::unique_ptr<query_options>, ::shared_ptr<service::pager::paging_state> paging_state, int32_t page_size);

-    db::consistency_level get_consistency() const;
    const timeout_config& get_timeout_config() const { return _timeout_config; }
-    cql3::raw_value_view get_value_at(size_t idx) const;
+
+    db::consistency_level get_consistency() const {
+        return _consistency;
+    }
+
+    cql3::raw_value_view get_value_at(size_t idx) const {
+        return _value_views.at(idx);
+    }
+
+    size_t get_values_count() const {
+        return _value_views.size();
+    }
+
    cql3::raw_value_view make_temporary(cql3::raw_value value) const;
-    size_t get_values_count() const;
-    bool skip_metadata() const;
-    /**  The pageSize for this query. Will be <= 0 if not relevant for the query.  */
-    int32_t get_page_size() const;
+    bytes_view linearize(fragmented_temporary_buffer::view) const;
+
+    bool skip_metadata() const {
+        return _skip_metadata;
+    }
+
+    int32_t get_page_size() const {
+        return get_specific_options().page_size;
+    }
+
    /** The paging state for this query, or null if not relevant. */
-    ::shared_ptr<service::pager::paging_state> get_paging_state() const;
+    ::shared_ptr<service::pager::paging_state> get_paging_state() const {
+        return get_specific_options().state;
+    }
+
    /**  Serial consistency for conditional updates. */
-    std::experimental::optional<db::consistency_level> get_serial_consistency() const;
+    std::experimental::optional<db::consistency_level> get_serial_consistency() const {
+        return get_specific_options().serial_consistency;
+    }
+
+    api::timestamp_type get_timestamp(service::query_state& state) const {
+        auto tstamp = get_specific_options().timestamp;
+        return tstamp != api::missing_timestamp ? tstamp : state.get_timestamp();
+    }
+
+    /**
+     * The protocol version for the query. Will be 3 if the object doesn't come from
+     * a native protocol request (i.e. it's been allocated locally or by CQL-over-thrift).
+     */
+    int get_protocol_version() const {
+        return _cql_serialization_format.protocol_version();
+    }
+
+    cql_serialization_format get_cql_serialization_format() const {
+        return _cql_serialization_format;
+    }
+
+    const query_options::specific_options& get_specific_options() const {
+        return _options;
+    }
+
+    // Mainly for the sake of BatchQueryOptions
+    const query_options& for_statement(size_t i) const {
+        if (!_batch_options) {
+            // No per-statement options supplied, so use the "global" options
+            return *this;
+        }
+        return _batch_options->at(i);
+    }
+

    const std::experimental::optional<std::vector<sstring_view>>& get_names() const noexcept {
        return _names;
    }

-    api::timestamp_type get_timestamp(service::query_state& state) const;
-    /**
-     * The protocol version for the query. Will be 3 if the object don't come from
-     * a native protocol request (i.e. it's been allocated locally or by CQL-over-thrift).
-     */
-    int get_protocol_version() const;
-    cql_serialization_format get_cql_serialization_format() const;
-    // Mainly for the sake of BatchQueryOptions
-    const specific_options& get_specific_options() const;
-    const query_options& for_statement(size_t i) const;
    void prepare(const std::vector<::shared_ptr<column_specification>>& specs);
 private:
    void fill_value_views();
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -206,6 +206,30 @@ query_processor::query_processor(service::storage_proxy& proxy, distributed<data
                            _cql_stats.secondary_index_rows_read,
                            sm::description("Counts a total number of rows read during CQL requests performed using secondary indexes.")),

+                    // read requests that required ALLOW FILTERING
+                    sm::make_derive(
+                            "filtered_read_requests",
+                            _cql_stats.filtered_reads,
+                            sm::description("Counts a total number of CQL read requests that required ALLOW FILTERING. See filtered_rows_read_total to compare how many rows needed to be filtered.")),
+
+                    // rows read with filtering enabled (because ALLOW FILTERING was required)
+                    sm::make_derive(
+                            "filtered_rows_read_total",
+                            _cql_stats.filtered_rows_read_total,
+                            sm::description("Counts a total number of rows read during CQL requests that required ALLOW FILTERING. See filtered_rows_matched_total and filtered_rows_dropped_total for information how accurate filtering queries are.")),
+
+                    // rows read with filtering enabled and accepted by the filter
+                    sm::make_derive(
+                            "filtered_rows_matched_total",
+                            _cql_stats.filtered_rows_matched_total,
+                            sm::description("Counts a number of rows read during CQL requests that required ALLOW FILTERING and accepted by the filter. Number similar to filtered_rows_read_total indicates that filtering is accurate.")),
+
+                    // rows read with filtering enabled and rejected by the filter
+                    sm::make_derive(
+                            "filtered_rows_dropped_total",
+                            [this]() {return _cql_stats.filtered_rows_read_total - _cql_stats.filtered_rows_matched_total;},
+                            sm::description("Counts a number of rows read during CQL requests that required ALLOW FILTERING and dropped by the filter. Number similar to filtered_rows_read_total indicates that filtering is not accurate and might cause performance degradation.")),
+
                    sm::make_derive(
                            "authorized_prepared_statements_cache_evictions",
                            [] { return authorized_prepared_statements_cache::shard_stats().authorized_prepared_statements_cache_evictions; },
@@ -219,7 +243,17 @@ query_processor::query_processor(service::storage_proxy& proxy, distributed<data
                    sm::make_gauge(
                            "user_prepared_auth_cache_footprint",
                            [this] { return _authorized_prepared_cache.memory_footprint(); },
-                            sm::description("Size (in bytes) of the authenticated prepared statements cache."))
+                            sm::description("Size (in bytes) of the authenticated prepared statements cache.")),
+
+                    sm::make_counter(
+                            "reverse_queries",
+                            _cql_stats.reverse_queries,
+                            sm::description("Counts number of CQL SELECT requests with ORDER BY DESC.")),
+
+                    sm::make_counter(
+                            "unpaged_select_queries",
+                            _cql_stats.unpaged_select_queries,
+                            sm::description("Counts number of unpaged CQL SELECT requests.")),

            });

--- a/cql3/restrictions/multi_column_restriction.hh
+++ b/cql3/restrictions/multi_column_restriction.hh
@@ -45,12 +45,16 @@
 #include "cql3/statements/request_validations.hh"
 #include "cql3/restrictions/primary_key_restrictions.hh"
 #include "cql3/statements/request_validations.hh"
+#include "cql3/restrictions/single_column_primary_key_restrictions.hh"

 namespace cql3 {

 namespace restrictions {

 class multi_column_restriction : public primary_key_restrictions<clustering_key_prefix> {
+private:
+    bool _has_only_asc_columns;
+    bool _has_only_desc_columns;
 protected:
    schema_ptr _schema;
    std::vector<const column_definition*> _column_defs;
@@ -58,7 +62,9 @@ public:
    multi_column_restriction(schema_ptr schema, std::vector<const column_definition*>&& defs)
        : _schema(schema)
        , _column_defs(std::move(defs))
-    { }
+    {
+        update_asc_desc_existence();
+    }

    virtual bool is_multi_column() const override {
        return true;
@@ -84,6 +90,7 @@ public:
            "Mixing single column relations and multi column relations on clustering columns is not allowed");
        auto as_pkr = static_pointer_cast<primary_key_restrictions<clustering_key_prefix>>(other);
        do_merge_with(as_pkr);
+        update_asc_desc_existence();
    }

    bool is_satisfied_by(const schema& schema,
@@ -140,6 +147,40 @@ protected:

    virtual bool is_supported_by(const secondary_index::index& index) const = 0;

+    /**
+     * @return true if the restricted columns are neither all ascending nor
+     * all descending (i.e. both orderings are present), false otherwise.
+     */
+    bool is_mixed_order() const {
+        return !(is_desc_order() || is_asc_order());
+    }
+
+    /**
+     * @return true if all the restricted columns are ordered in descending
+     * order, false otherwise.
+     */
+    bool is_desc_order() const {
+        return _has_only_desc_columns;
+    }
+
+    /**
+     * @return true if all the restricted columns are ordered in ascending
+     * order, false otherwise.
+     */
+    bool is_asc_order() const {
+        return _has_only_asc_columns;
+    }
+
+private:
+    /**
+     * Recomputes _has_only_asc_columns and _has_only_desc_columns from _column_defs.
+     */
+    void update_asc_desc_existence() {
+        const auto is_desc = [] (const column_definition* cd) { return cd->type->is_reversed(); };
+        const std::size_t desc_count = std::count_if(_column_defs.begin(), _column_defs.end(), is_desc);
+        _has_only_desc_columns = desc_count == _column_defs.size();
+        _has_only_asc_columns = desc_count == 0;
+    }
 #if 0
    /**
     * Check if this type of restriction is supported for the specified column by the specified index.
@@ -385,6 +426,7 @@ protected:
 };

 class multi_column_restriction::slice final : public multi_column_restriction {
+    using restriction_shared_ptr = ::shared_ptr<primary_key_restrictions<clustering_key_prefix>>;
 private:
    term_slice _slice;

@@ -422,24 +464,11 @@ public:
    }

    virtual std::vector<bounds_range_type> bounds_ranges(const query_options& options) const override {
-        // FIXME: doesn't work properly with mixed CLUSTERING ORDER (CASSANDRA-7281)
-        auto read_bound = [&] (statements::bound b) -> std::experimental::optional<bounds_range_type::bound> {
-            if (!has_bound(b)) {
-                return {};
-            }
-            auto vals = component_bounds(b, options);
-            for (unsigned i = 0; i < vals.size(); i++) {
-                statements::request_validations::check_not_null(vals[i], "Invalid null value in condition for column %s", _column_defs.at(i)->name_as_text());
-            }
-            auto prefix = clustering_key_prefix::from_optional_exploded(*_schema, vals);
-            return bounds_range_type::bound(prefix, is_inclusive(b));
-        };
-        auto range = wrapping_range<clustering_key_prefix>(read_bound(statements::bound::START), read_bound(statements::bound::END));
-        auto bounds = bound_view::from_range(range);
-        if (bound_view::compare(*_schema)(bounds.second, bounds.first)) {
-            return { };
+        if (!is_mixed_order()) {
+            return bounds_ranges_unified_order(options);
+        } else {
+            return bounds_ranges_mixed_order(options);
        }
-        return { bounds_range_type(std::move(range)) };
    }
 #if 0
        @Override
@@ -514,6 +543,221 @@ private:
        auto value = static_pointer_cast<tuples::value>(_slice.bound(b)->bind(options));
        return value->get_elements();
    }
+
+    std::vector<bytes_opt> read_bound_components(const query_options& options, statements::bound b) const {
+        if (!has_bound(b)) {
+            return {}; // no such bound: there are no components to read
+        }
+        auto components = component_bounds(b, options);
+        for (unsigned idx = 0; idx < components.size(); ++idx) {
+            statements::request_validations::check_not_null(components[idx], "Invalid null value in condition for column %s", _column_defs.at(idx)->name_as_text());
+        }
+        return components;
+    }
+
+    /**
+     * Retrieve the bounds for the case that all clustering columns have the same order.
+     * Having the same order implies we can do a prefix search on the data.
+     * For ascending columns the range runs from the START bound to the END bound;
+     * otherwise the two bounds are swapped.
+     * @param options the query options
+     * @return the vector of ranges for the restriction; it is empty when the end of
+     * the built range compares before its start.
+     */
+    std::vector<bounds_range_type> bounds_ranges_unified_order(const query_options& options) const {
+        auto lower_prefix = clustering_key_prefix::from_optional_exploded(*_schema, read_bound_components(options, statements::bound::START));
+        auto lower = bounds_range_type::bound(std::move(lower_prefix), is_inclusive(statements::bound::START));
+        auto upper_prefix = clustering_key_prefix::from_optional_exploded(*_schema, read_bound_components(options, statements::bound::END));
+        auto upper = bounds_range_type::bound(std::move(upper_prefix), is_inclusive(statements::bound::END));
+        // When the columns are not ascending, the START and END bounds trade places.
+        auto range = is_asc_order()
+                ? bounds_range_type::make(lower, upper)
+                : bounds_range_type::make(upper, lower);
+        auto range_ends = bound_view::from_range(range);
+        // An end that compares before the start yields no ranges at all.
+        if (bound_view::compare(*_schema)(range_ends.second, range_ends.first)) {
+            return { };
+        }
+        return { std::move(range) };
+    }
+
+    /**
+     * Retrieve the bounds when clustering columns are mixed order
+     * (contains ASC and DESC together).
+     * Having mixed order implies that a prefix search can't take place,
+     * instead, the bounds have to be broken down to separate prefix searchable
+     * ranges such that their combination is equivalent to the original range.
+     * @param options the query options
+     * @return the vector of ranges for the restriction
+     */
+    std::vector<bounds_range_type> bounds_ranges_mixed_order(const query_options& options) const {
+        std::vector<bounds_range_type> ret_ranges;
+        auto mixed_order_restrictions = build_mixed_order_restriction_set(options);
+        ret_ranges.reserve(mixed_order_restrictions.size());
+        for (const auto& r : mixed_order_restrictions) { // by reference: no shared pointer copy per iteration
+            for (auto&& range : r->bounds_ranges(options)) {
+                ret_ranges.emplace_back(std::move(range));
+            }
+        }
+        return ret_ranges;
+    }
+
+    /**
+     * Returns the index of the first component at which the two bound tuples differ.
+     * That component is the one that turns into a single column slice restriction
+     * when the tuple relation is broken down.
+     * For example: (a, b, c) > (0, 1, 2) and (a, b, c) < (0, 1, 5) will be
+     * broken into one single column restriction set of the form:
+     * a = 0 and b = 1 and c > 2 and c < 5 , c is the first element that has
+     * inequality so for this case the function will return 2.
+     * @param start_components - the components of the start tuple of the range.
+     * @param end_components - the components of the end tuple of the range.
+     * @return the index of the first differing component (the common length when one
+     * tuple is a proper prefix of the other), or an empty value if the tuples are equal.
+     */
+    std::optional<std::size_t> find_first_neq_component(std::vector<bytes_opt>& start_components,
+                                                        std::vector<bytes_opt>& end_components) const {
+        const size_t shorter = std::min(start_components.size(), end_components.size());
+        const size_t longer = std::max(start_components.size(), end_components.size());
+        for (size_t idx = 0; idx != shorter; ++idx) {
+            if (start_components[idx].value() != end_components[idx].value()) {
+                return idx;
+            }
+        }
+
+        // All shared positions are equal: the tuples differ only if one is longer.
+        if (shorter == longer) {
+            return std::nullopt;
+        }
+        return shorter;
+    }
+
+    /**
+     * Creates a single column restriction which is either a slice or an equality.
+     * @param bound - if empty, an equality is created; if it is either START or END,
+     *        the corresponding slice restriction is created.
+     * @param inclusive - whether the slice is inclusive (ignored for equality).
+     * @param column_pos - the position (in _column_defs) of the column to restrict.
+     * @param value - the value to restrict the column with.
+     * @return a shared pointer to the newly created restriction.
+     */
+    ::shared_ptr<restriction> make_single_column_restriction(std::optional<cql3::statements::bound> bound, bool inclusive,
+                                                             std::size_t column_pos,const bytes_opt& value) const {
+        const column_definition& target_column = *_column_defs[column_pos];
+        ::shared_ptr<cql3::term> term = ::make_shared<cql3::constants::value>(cql3::raw_value::make_value(value));
+        if (bound) {
+            return ::make_shared<cql3::restrictions::single_column_restriction::slice>(target_column, *bound, inclusive, term);
+        }
+        return ::make_shared<cql3::restrictions::single_column_restriction::EQ>(target_column, term);
+    }
+
+    /**
+     * A helper function that creates a set of single column restrictions from a tuple relation on
+     * clustering keys.
+     * e.g. (a,b,c) >= (0,1,2) will become:
+     *      1. a > 0
+     *      2. a = 0 and b > 1
+     *      3. a = 0 and b = 1 and c >= 2
+     * @param bound - determines if the operator is '>' (START) or '<' (END)
+     * @param bound_inclusive - determines whether equality is appended to the operator, i.e. whether > becomes >=
+     * @param bound_values - the tuple values for the restriction
+     * @param first_neq_component - the first component that will have an inequality.
+     *        For the example above, if this parameter is 1, only restrictions 2 and 3 will be created.
+     *        This parameter helps with breaking down more complex relations, for example when
+     *        a second condition limits the other side of the bound,
+     *        e.g. (a,b,c) >= (0,1,2) and (a,b,c) < (5,6,7); each bound then has to use the parameter.
+     * @return the single column restriction set built according to the above parameters.
+     */
+    std::vector<restriction_shared_ptr> make_single_bound_restrictions(statements::bound bound, bool bound_inclusive,
+                                                                       std::vector<bytes_opt>& bound_values,
+                                                                       std::size_t first_neq_component) const{
+        std::vector<restriction_shared_ptr> result;
+        result.reserve(bound_values.size() - first_neq_component);
+        for (std::size_t slice_pos = first_neq_component; slice_pos < bound_values.size(); ++slice_pos) {
+            restriction_shared_ptr current = ::make_shared<cql3::restrictions::single_column_primary_key_restrictions<clustering_key>>(_schema, false);
+            for (std::size_t eq_pos = 0; eq_pos < slice_pos; ++eq_pos) {
+                current->merge_with(make_single_column_restriction(std::nullopt, false, eq_pos, bound_values[eq_pos]));
+            }
+            // Only the restriction slicing the last tuple component keeps the bound's inclusiveness.
+            const bool inclusive = bound_inclusive && (slice_pos + 1 == bound_values.size());
+            current->merge_with(make_single_column_restriction(bound, inclusive, slice_pos, bound_values[slice_pos]));
+            result.emplace_back(std::move(current));
+        }
+        return result;
+    }
+
+    /**
+     * Builds and returns a set of restrictions such that the union of their ranges (the restrictions OR-ed together)
+     * is logically identical to this restriction, with the additional property that it can execute
+     * correctly when the clustering columns have "mixed order" - i.e. contain both ASC and DESC orderings.
+     * For more information: https://github.com/scylladb/scylla/issues/2050
+     * @param options - the query options
+     * @return set of restrictions whose ranges' union is logically identical to this restriction.
+     */
+    std::vector<restriction_shared_ptr>
+    build_mixed_order_restriction_set(const query_options& options) const {
+        std::vector<restriction_shared_ptr> ret;
+        auto start_components = read_bound_components(options, statements::bound::START);
+        auto end_components = read_bound_components(options, statements::bound::END);
+        bool start_inclusive = is_inclusive(statements::bound::START);
+        bool end_inclusive = is_inclusive(statements::bound::END);
+        std::optional<std::size_t> first_neq_component = std::nullopt;
+
+        // find the index of the first component that is not equal between the tuples.
+        if (start_components.empty() || end_components.empty()) {
+            first_neq_component = 0;
+        } else {
+            auto tuple_mismatch = std::mismatch(start_components.begin(), start_components.end(),
+                    end_components.begin(), end_components.end());
+            if ((tuple_mismatch.first != start_components.end()) ||
+                (tuple_mismatch.second != end_components.end())) {
+                first_neq_component = std::distance(start_components.begin(), tuple_mismatch.first);
+            }
+        }
+
+        // no differing component: this is either a simple equality or a never fulfilled restriction
+        if (!first_neq_component && start_inclusive && end_inclusive) {
+            // This is a simple equality case
+            shared_ptr<cql3::term> term = ::make_shared<cql3::tuples::value>(start_components);
+            ret.emplace_back(::make_shared<cql3::restrictions::multi_column_restriction::EQ>(_schema, _column_defs, term));
+            return ret;
+        } else if (!first_neq_component) {
+            // This is a contradiction case
+            return {};
+        } else if ((*first_neq_component == end_components.size() && !end_inclusive ) ||
+                   (*first_neq_component == start_components.size() && !start_inclusive )) {
+            // This is a case where one bound is a prefix of the other. If this prefix bound
+            // is not inclusive the result will be an empty set.
+            return {};
+        }
+
+        bool start_components_exists = start_components.size() > first_neq_component.value(); // no unsigned subtraction
+        bool end_components_exists = end_components.size() > first_neq_component.value();
+        bool both_components_exists = start_components_exists && end_components_exists;
+        if (start_components_exists) {
+            auto restrictions =
+                    make_single_bound_restrictions(statements::bound::START, start_inclusive, start_components, first_neq_component.value());
+            for (auto&& r : restrictions) {
+                ret.emplace_back(std::move(r));
+            }
+        }
+
+        if (end_components_exists) {
+            auto restrictions = // with both bounds present, the first differing component is handled by the merge below
+                    make_single_bound_restrictions(statements::bound::END, end_inclusive,
+                            end_components, first_neq_component.value() + (both_components_exists ? 1 : 0));
+            for (auto&& r : restrictions) {
+                ret.emplace_back(std::move(r));
+            }
+        }
+
+        if (both_components_exists) { // ret[0] is the START restriction slicing the first differing component
+            bool inclusive = end_inclusive && ((end_components.size() - first_neq_component.value()) == 1);
+            ret[0]->merge_with(make_single_column_restriction(statements::bound::END, inclusive, first_neq_component.value(),
+                    end_components[first_neq_component.value()]));
+        }
+        return ret;
+    }
 };

 }
--- a/cql3/restrictions/primary_key_restrictions.hh
+++ b/cql3/restrictions/primary_key_restrictions.hh
@@ -88,6 +88,7 @@ public:

    using restrictions::uses_function;
    using restrictions::has_supporting_index;
+    using restrictions::values;

    bool empty() const override {
        return get_column_defs().empty();
@@ -95,7 +96,72 @@ public:
    uint32_t size() const override {
        return uint32_t(get_column_defs().size());
    }
+
+    bool has_unrestricted_components(const schema& schema) const;
+
+    virtual bool needs_filtering(const schema& schema) const;
+
+    // How long a prefix of the restrictions could have resulted in
+    // needs_filtering() == false. These restrictions do not need to be
+    // applied during filtering.
+    // For example, if we have the filter "c1 < 3 and c2 > 3", c1 does
+    // not need filtering (just a read stopping at c1=3) but c2 does,
+    // so num_prefix_columns_that_need_not_be_filtered() will be 1.
+    virtual unsigned int num_prefix_columns_that_need_not_be_filtered() const {
+        return 0;
+    }
+
+    virtual bool is_all_eq() const {
+        return false;
+    }
+    virtual size_t prefix_size() const {
+        return 0;
+    }
+
+    size_t prefix_size(const schema_ptr schema) const {
+        return 0;
+    }
+
 };

+template<>
+inline bool primary_key_restrictions<partition_key>::has_unrestricted_components(const schema& schema) const {
+    return size() < schema.partition_key_size();
+}
+
+template<>
+inline bool primary_key_restrictions<clustering_key>::has_unrestricted_components(const schema& schema) const {
+    return size() < schema.clustering_key_size();
+}
+
+template<>
+inline bool primary_key_restrictions<partition_key>::needs_filtering(const schema& schema) const  {
+    return !empty() && !is_on_token() && (has_unrestricted_components(schema) || is_contains() || is_slice());
+}
+
+template<>
+inline bool primary_key_restrictions<clustering_key>::needs_filtering(const schema& schema) const  {
+    // Currently only overloaded single_column_primary_key_restrictions will require ALLOW FILTERING
+    return false;
+}
+
+template<>
+inline size_t primary_key_restrictions<clustering_key>::prefix_size(const schema_ptr schema) const {
+    if (schema->clustering_key_columns().empty()) {
+        return 0;
+    }
+    const auto restricted_columns = get_column_defs();
+    column_id next_expected_id = schema->clustering_key_columns().begin()->id;
+    size_t matched = 0;
+    for (auto&& cdef : restricted_columns) {
+        if (schema->position(*cdef) != next_expected_id) {
+            break; // first column that is out of sequence ends the prefix
+        }
+        ++next_expected_id;
+        ++matched;
+    }
+    return matched;
+}
+
 }
 }
--- a/cql3/restrictions/restrictions.hh
+++ b/cql3/restrictions/restrictions.hh
@@ -68,6 +68,10 @@ public:

    virtual std::vector<bytes_opt> values(const query_options& options) const = 0;

+    virtual bytes_opt value_for(const column_definition& cdef, const query_options& options) const {
+        throw exceptions::invalid_request_exception("Single value can be obtained from single-column restrictions only");
+    }
+
    /**
     * Returns <code>true</code> if one of the restrictions use the specified function.
     *
--- a/cql3/restrictions/single_column_primary_key_restrictions.hh
+++ b/cql3/restrictions/single_column_primary_key_restrictions.hh
@@ -49,6 +49,7 @@
 #include <boost/algorithm/cxx11/all_of.hpp>
 #include <boost/range/adaptor/transformed.hpp>
 #include <boost/range/adaptor/filtered.hpp>
+#include <boost/range/adaptor/map.hpp>

 namespace cql3 {

@@ -62,6 +63,8 @@ class single_column_primary_key_restrictions : public primary_key_restrictions<V
    using range_type = query::range<ValueType>;
    using range_bound = typename range_type::bound;
    using bounds_range_type = typename primary_key_restrictions<ValueType>::bounds_range_type;
+    template<typename OtherValueType>
+    friend class single_column_primary_key_restrictions;
 private:
    schema_ptr _schema;
    bool _allow_filtering;
@@ -79,6 +82,27 @@ public:
        , _in(false)
    { }

+    // Convert another primary key restrictions type into this type, possibly using different schema.
+    // Every restricted column of `other` is looked up by name in `schema`.
+    template<typename OtherValueType>
+    explicit single_column_primary_key_restrictions(schema_ptr schema, const single_column_primary_key_restrictions<OtherValueType>& other)
+        : _schema(schema)
+        , _allow_filtering(other._allow_filtering)
+        , _restrictions(::make_shared<single_column_restrictions>(schema))
+        , _slice(other._slice)
+        , _contains(other._contains)
+        , _in(other._in)
+    {
+        for (const auto& [other_cdef, restriction] : other._restrictions->restrictions()) {
+            const column_definition* this_cdef = _schema->get_column_definition(other_cdef->name());
+            if (!this_cdef) {
+                throw exceptions::invalid_request_exception(sprint("Base column %s not found in view index schema", other_cdef->name_as_text()));
+            }
+            // Re-create the restriction against this schema's definition of the column.
+            _restrictions->add_restriction(restriction->apply_to(*this_cdef));
+        }
+    }
+
    virtual bool is_on_token() const override {
        return false;
    }
@@ -99,6 +123,10 @@ public:
        return _in;
    }

+    virtual bool is_all_eq() const override {
+        return _restrictions->is_all_eq();
+    }
+
    virtual bool has_bound(statements::bound b) const override {
        return boost::algorithm::all_of(_restrictions->restrictions(), [b] (auto&& r) { return r.second->has_bound(b); });
    }
@@ -137,6 +165,25 @@ public:
        _restrictions->add_restriction(restriction);
    }

+    virtual size_t prefix_size() const override {
+        return primary_key_restrictions<ValueType>::prefix_size(_schema);
+    }
+
+    ::shared_ptr<single_column_primary_key_restrictions<clustering_key>> get_longest_prefix_restrictions() {
+        static_assert(std::is_same_v<ValueType, clustering_key>, "Only clustering key can produce longest prefix restrictions");
+        const size_t prefix_len = prefix_size();
+        if (prefix_len == _restrictions->restrictions().size()) {
+            // All restrictions are part of the prefix: return this object itself.
+            return dynamic_pointer_cast<single_column_primary_key_restrictions<clustering_key>>(this->shared_from_this());
+        }
+        auto prefix_only = ::make_shared<single_column_primary_key_restrictions<clustering_key>>(_schema, _allow_filtering);
+        auto next_restriction = _restrictions->restrictions().begin();
+        for (size_t taken = 0; taken < prefix_len; ++taken, ++next_restriction) {
+            prefix_only->merge_with(next_restriction->second);
+        }
+        return prefix_only;
+    }
+
    virtual void merge_with(::shared_ptr<restriction> restriction) override {
        if (restriction->is_multi_column()) {
            throw exceptions::invalid_request_exception(
@@ -309,11 +356,20 @@ public:
        }
        return res;
    }
+
+    virtual bytes_opt value_for(const column_definition& cdef, const query_options& options) const override {
+        return _restrictions->value_for(cdef, options);
+    }
+
    std::vector<bytes_opt> bounds(statements::bound b, const query_options& options) const override {
        // TODO: if this proved to be required.
        fail(unimplemented::cause::LEGACY_COMPOSITE_KEYS); // not 100% correct...
    }

+    const single_column_restrictions::restrictions_map& restrictions() const {
+        return _restrictions->restrictions();
+    }
+
    virtual bool has_supporting_index(const secondary_index::secondary_index_manager& index_manager) const override {
        return _restrictions->has_supporting_index(index_manager);
    }
@@ -349,10 +405,13 @@ public:
            _restrictions->restrictions() | boost::adaptors::map_values,
            [&] (auto&& r) { return r->is_satisfied_by(schema, key, ckey, cells, options, now); });
    }
+
+    virtual bool needs_filtering(const schema& schema) const override;
+    virtual unsigned int num_prefix_columns_that_need_not_be_filtered() const override;
 };

 template<>
-dht::partition_range_vector
+inline dht::partition_range_vector
 single_column_primary_key_restrictions<partition_key>::bounds_ranges(const query_options& options) const {
    dht::partition_range_vector ranges;
    ranges.reserve(size());
@@ -370,7 +429,7 @@ single_column_primary_key_restrictions<partition_key>::bounds_ranges(const query
 }

 template<>
-std::vector<query::clustering_range>
+inline std::vector<query::clustering_range>
 single_column_primary_key_restrictions<clustering_key_prefix>::bounds_ranges(const query_options& options) const {
    auto wrapping_bounds = compute_bounds(options);
    auto bounds = boost::copy_range<query::clustering_row_ranges>(wrapping_bounds
@@ -406,6 +465,62 @@ single_column_primary_key_restrictions<clustering_key_prefix>::bounds_ranges(con
    return bounds;
 }

+template<>
+inline bool single_column_primary_key_restrictions<partition_key>::needs_filtering(const schema& schema) const {
+    return primary_key_restrictions<partition_key>::needs_filtering(schema);
+}
+
+template<>
+inline bool single_column_primary_key_restrictions<clustering_key>::needs_filtering(const schema& schema) const {
+    // Restrictions currently need filtering in three cases:
+    // 1. any of them is a CONTAINS restriction
+    // 2. restrictions do not form a contiguous prefix (i.e. there are gaps in it)
+    // 3. a SLICE restriction is not the last restriction
+    column_id position = 0;
+    for (const auto& restriction : _restrictions->restrictions() | boost::adaptors::map_values) {
+        if (restriction->is_contains() || position != restriction->get_column_def().id) {
+            return true;
+        }
+        if (!restriction->is_slice()) {
+            position = restriction->get_column_def().id + 1;
+        }
+    }
+    return false;
+}
+
+// How many of the restrictions (in column order) do not need filtering
+// because they are implemented as a slice (potentially, a contiguous disk
+// read). For example, if we have the filter "c1 < 3 and c2 > 3", c1 does not
+// need filtering but c2 does so num_prefix_columns_that_need_not_be_filtered
+// will be 1.
+// The implementation of num_prefix_columns_that_need_not_be_filtered() is
+// closely tied to that of needs_filtering() above - basically, if only the
+// first num_prefix_columns_that_need_not_be_filtered() restrictions existed,
+// then needs_filtering() would have returned false.
+template<>
+inline unsigned single_column_primary_key_restrictions<clustering_key>::num_prefix_columns_that_need_not_be_filtered() const {
+    column_id position = 0;
+    unsigned int count = 0;
+    for (const auto& restriction : _restrictions->restrictions() | boost::adaptors::map_values) {
+        if (restriction->is_contains() || position != restriction->get_column_def().id) {
+            return count;
+        }
+        if (!restriction->is_slice()) {
+            position = restriction->get_column_def().id + 1;
+        }
+        count++;
+    }
+    return count;
+}
+
+template<>
+inline unsigned single_column_primary_key_restrictions<partition_key>::num_prefix_columns_that_need_not_be_filtered() const {
+    // skip_filtering() is currently called only for clustering key
+    // restrictions, so it doesn't matter what we return here.
+    return 0;
+}
+
+
 }
 }

--- a/cql3/restrictions/single_column_restriction.hh
+++ b/cql3/restrictions/single_column_restriction.hh
@@ -93,6 +93,9 @@ public:
    }

    virtual bool is_supported_by(const secondary_index::index& index) const = 0;
+    using abstract_restriction::is_satisfied_by;
+    virtual bool is_satisfied_by(bytes_view data, const query_options& options) const = 0;
+    virtual ::shared_ptr<single_column_restriction> apply_to(const column_definition& cdef) = 0;
 #if 0
    /**
     * Check if this type of restriction is supported by the specified index.
@@ -166,6 +169,10 @@ public:
                                 const row& cells,
                                 const query_options& options,
                                 gc_clock::time_point now) const override;
+    virtual bool is_satisfied_by(bytes_view data, const query_options& options) const override;
+    virtual ::shared_ptr<single_column_restriction> apply_to(const column_definition& cdef) override {
+        return ::make_shared<EQ>(cdef, _value);
+    }

 #if 0
        @Override
@@ -201,6 +208,10 @@ public:
                                 const row& cells,
                                 const query_options& options,
                                 gc_clock::time_point now) const override;
+    virtual bool is_satisfied_by(bytes_view data, const query_options& options) const override;
+    virtual ::shared_ptr<single_column_restriction> apply_to(const column_definition& cdef) override {
+        throw std::logic_error("IN superclass should never be cloned directly");
+    }

    virtual std::vector<bytes_opt> values_raw(const query_options& options) const = 0;

@@ -243,6 +254,10 @@ public:
    virtual sstring to_string() const override {
        return sprint("IN(%s)", std::to_string(_values));
    }
+
+    virtual ::shared_ptr<single_column_restriction> apply_to(const column_definition& cdef) override {
+        return ::make_shared<IN_with_values>(cdef, _values);
+    }
 };

 class single_column_restriction::IN_with_marker : public IN {
@@ -268,6 +283,10 @@ public:
    virtual sstring to_string() const override {
        return "IN ?";
    }
+
+    virtual ::shared_ptr<single_column_restriction> apply_to(const column_definition& cdef) override {
+        return ::make_shared<IN_with_marker>(cdef, _marker);
+    }
 };

 class single_column_restriction::slice : public single_column_restriction {
@@ -279,6 +298,11 @@ public:
        , _slice(term_slice::new_instance(bound, inclusive, std::move(term)))
    { }

+    slice(const column_definition& column_def, term_slice slice)
+        : single_column_restriction(column_def)
+        , _slice(slice)
+    { }
+
    virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override {
        return (_slice.has_bound(statements::bound::START) && abstract_restriction::term_uses_function(_slice.bound(statements::bound::START), ks_name, function_name))
                || (_slice.has_bound(statements::bound::END) && abstract_restriction::term_uses_function(_slice.bound(statements::bound::END), ks_name, function_name));
@@ -364,6 +388,10 @@ public:
                                 const row& cells,
                                 const query_options& options,
                                 gc_clock::time_point now) const override;
+    virtual bool is_satisfied_by(bytes_view data, const query_options& options) const override;
+    virtual ::shared_ptr<single_column_restriction> apply_to(const column_definition& cdef) override { // re-targets this restriction at cdef
+        return ::make_shared<slice>(cdef, _slice); // new slice restriction built from cdef and this restriction's _slice
+    }
 };

 // This holds CONTAINS, CONTAINS_KEY, and map[key] = value restrictions because we might want to have any combination of them.
@@ -485,6 +513,10 @@ public:
                                 const row& cells,
                                 const query_options& options,
                                 gc_clock::time_point now) const override;
+    virtual bool is_satisfied_by(bytes_view data, const query_options& options) const override;
+    virtual ::shared_ptr<single_column_restriction> apply_to(const column_definition& cdef) override { // always throws std::logic_error
+        throw std::logic_error("Cloning 'contains' restriction is not implemented."); // no clone is ever produced for this restriction kind
+    }

 #if 0
        private List<ByteBuffer> keys(const query_options& options) {
--- a/cql3/restrictions/single_column_restrictions.hh
+++ b/cql3/restrictions/single_column_restrictions.hh
@@ -111,6 +111,11 @@ public:
        return r;
    }

+    virtual bytes_opt value_for(const column_definition& cdef, const query_options& options) const override { // value of the restriction stored for cdef, if any
+        auto it = _restrictions.find(std::addressof(cdef)); // the lookup key is the column definition's address
+        return (it != _restrictions.end()) ? it->second->value(options) : bytes_opt{}; // empty bytes_opt when cdef has no entry in _restrictions
+    }
+
    /**
     * Returns the restriction associated to the specified column.
     *
--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -23,6 +23,7 @@
 #include <boost/range/algorithm/transform.hpp>
 #include <boost/range/algorithm.hpp>
 #include <boost/range/adaptors.hpp>
+#include <boost/algorithm/cxx11/any_of.hpp>

 #include "statement_restrictions.hh"
 #include "single_column_primary_key_restrictions.hh"
@@ -36,6 +37,8 @@
 namespace cql3 {
 namespace restrictions {

+static logging::logger rlogger("restrictions");
+
 using boost::adaptors::filtered;
 using boost::adaptors::transformed;

@@ -69,6 +72,9 @@ public:
        // throw? should not reach?
        return {};
    }
+    bytes_opt value_for(const column_definition& cdef, const query_options& options) const override { // both arguments are ignored
+        return {}; // always an empty bytes_opt
+    }
    std::vector<T> values_as_keys(const query_options& options) const override {
        // throw? should not reach?
        return {};
@@ -202,23 +208,22 @@ statement_restrictions::statement_restrictions(database& db,
                    throw exceptions::invalid_request_exception(sprint("restriction '%s' is only supported in materialized view creation", relation->to_string()));
                }
            } else {
-                add_restriction(relation->to_restriction(db, schema, bound_names));
+                add_restriction(relation->to_restriction(db, schema, bound_names), for_view, allow_filtering);
            }
        }
    }
    auto& cf = db.find_column_family(schema);
    auto& sim = cf.get_index_manager();
-    bool has_queriable_clustering_column_index = _clustering_columns_restrictions->has_supporting_index(sim);
-    bool has_queriable_index = has_queriable_clustering_column_index
-            || _partition_key_restrictions->has_supporting_index(sim)
-            || _nonprimary_key_restrictions->has_supporting_index(sim);
+    const bool has_queriable_clustering_column_index = _clustering_columns_restrictions->has_supporting_index(sim);
+    const bool has_queriable_pk_index = _partition_key_restrictions->has_supporting_index(sim);
+    const bool has_queriable_regular_index = _nonprimary_key_restrictions->has_supporting_index(sim);

    // At this point, the select statement if fully constructed, but we still have a few things to validate
-    process_partition_key_restrictions(has_queriable_index, for_view);
+    process_partition_key_restrictions(has_queriable_pk_index, for_view, allow_filtering);

    // Some but not all of the partition key columns have been specified;
    // hence we need turn these restrictions into index expressions.
-    if (_uses_secondary_indexing) {
+    if (_uses_secondary_indexing || _partition_key_restrictions->needs_filtering(*_schema)) {
        _index_restrictions.push_back(_partition_key_restrictions);
    }

@@ -234,13 +239,14 @@ statement_restrictions::statement_restrictions(database& db,
        }
    }

-    process_clustering_columns_restrictions(has_queriable_index, select_a_collection, for_view);
+    process_clustering_columns_restrictions(has_queriable_clustering_column_index, select_a_collection, for_view, allow_filtering);

    // Covers indexes on the first clustering column (among others).
-    if (_is_key_range && has_queriable_clustering_column_index)
-    _uses_secondary_indexing = true;
+    if (_is_key_range && has_queriable_clustering_column_index) {
+        _uses_secondary_indexing = true;
+    }

-    if (_uses_secondary_indexing) {
+    if (_uses_secondary_indexing || _clustering_columns_restrictions->needs_filtering(*_schema)) {
        _index_restrictions.push_back(_clustering_columns_restrictions);
    } else if (_clustering_columns_restrictions->is_contains()) {
        fail(unimplemented::cause::INDEXES);
@@ -269,31 +275,48 @@ statement_restrictions::statement_restrictions(database& db,
        uses_secondary_indexing = true;
 #endif
    }
-    // Even if uses_secondary_indexing is false at this point, we'll still have to use one if
-    // there is restrictions not covered by the PK.
+
    if (!_nonprimary_key_restrictions->empty()) {
-        _uses_secondary_indexing = true;
+        if (has_queriable_regular_index) {
+            _uses_secondary_indexing = true;
+        } else if (!allow_filtering) {
+            throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
+                "thus may have unpredictable performance. If you want to execute "
+                "this query despite the performance unpredictability, use ALLOW FILTERING");
+        }
        _index_restrictions.push_back(_nonprimary_key_restrictions);
    }

-    if (_uses_secondary_indexing && !for_view) {
+    if (_uses_secondary_indexing && !(for_view || allow_filtering)) {
        validate_secondary_index_selections(selects_only_static_columns);
    }
 }

-void statement_restrictions::add_restriction(::shared_ptr<restriction> restriction) {
+void statement_restrictions::add_restriction(::shared_ptr<restriction> restriction, bool for_view, bool allow_filtering) {
    if (restriction->is_multi_column()) {
        _clustering_columns_restrictions = _clustering_columns_restrictions->merge_to(_schema, restriction);
    } else if (restriction->is_on_token()) {
        _partition_key_restrictions = _partition_key_restrictions->merge_to(_schema, restriction);
    } else {
-        add_single_column_restriction(::static_pointer_cast<single_column_restriction>(restriction));
+        add_single_column_restriction(::static_pointer_cast<single_column_restriction>(restriction), for_view, allow_filtering);
    }
 }

-void statement_restrictions::add_single_column_restriction(::shared_ptr<single_column_restriction> restriction) {
+void statement_restrictions::add_single_column_restriction(::shared_ptr<single_column_restriction> restriction, bool for_view, bool allow_filtering) {
    auto& def = restriction->get_column_def();
    if (def.is_partition_key()) {
+        // A SELECT query may not request a slice (range) of partition keys
+        // without using token(). This is because there is no way to do this
+        // query efficiently: murmur3 turns a contiguous range of partition
+        // keys into tokens all over the token space.
+        // However, in a SELECT statement used to define a materialized view,
+        // such a slice is fine - it is used to check whether individual
+        // partitions match, and does not present a performance problem.
+        assert(!restriction->is_on_token());
+        if (restriction->is_slice() && !for_view && !allow_filtering) {
+            throw exceptions::invalid_request_exception(
+                    "Only EQ and IN relation are supported on the partition key (unless you use the token() function or allow filtering)");
+        }
        _partition_key_restrictions = _partition_key_restrictions->merge_to(_schema, restriction);
    } else if (def.is_clustering_key()) {
        _clustering_columns_restrictions = _clustering_columns_restrictions->merge_to(_schema, restriction);
@@ -312,7 +335,54 @@ const std::vector<::shared_ptr<restrictions>>& statement_restrictions::index_res
    return _index_restrictions;
 }

-void statement_restrictions::process_partition_key_restrictions(bool has_queriable_index, bool for_view) {
+// Returns the first index (walking restrictions, then their columns, then the index list) that depends on a restricted column.
+std::optional<secondary_index::index> statement_restrictions::find_idx(secondary_index::secondary_index_manager& sim) const {
+    auto indexes = sim.list_indexes(); // listed once up front instead of once per restricted column
+    for (const auto& restriction : index_restrictions()) { // by reference: no shared_ptr copy per iteration
+        for (const auto& cdef : restriction->get_column_defs()) {
+            for (auto& index : indexes) { // by reference: only the index that is returned leaves the local list
+                if (index.depends_on(*cdef)) { return std::make_optional<secondary_index::index>(std::move(index)); }
+            }
+        }
+    }
+    return std::nullopt; // no listed index depends on any column of the index restrictions
+}
+
+std::vector<const column_definition*> statement_restrictions::get_column_defs_for_filtering(database& db) const { // collects the restricted columns that must be checked by filtering
+    std::vector<const column_definition*> column_defs_for_filtering;
+    if (need_filtering()) { // when no filtering is needed the result stays empty
+        auto& sim = db.find_column_family(_schema).get_index_manager();
+        std::optional<secondary_index::index> opt_idx = find_idx(sim);
+        auto column_uses_indexing = [&opt_idx] (const column_definition* cdef) { // true when an index was found and it depends on cdef
+            return opt_idx && opt_idx->depends_on(*cdef);
+        };
+        if (_partition_key_restrictions->needs_filtering(*_schema)) { // partition key columns not covered by the chosen index
+            for (auto&& cdef : _partition_key_restrictions->get_column_defs()) {
+                if (!column_uses_indexing(cdef)) {
+                    column_defs_for_filtering.emplace_back(cdef);
+                }
+            }
+        }
+        const bool pk_has_unrestricted_components = _partition_key_restrictions->has_unrestricted_components(*_schema);
+        if (pk_has_unrestricted_components || _clustering_columns_restrictions->needs_filtering(*_schema)) {
+            column_id first_filtering_id = pk_has_unrestricted_components ? 0 : _schema->clustering_key_columns().begin()->id +
+                    _clustering_columns_restrictions->num_prefix_columns_that_need_not_be_filtered(); // '+' binds tighter than '?:': 0, or (first clustering column id + skippable prefix length)
+            for (auto&& cdef : _clustering_columns_restrictions->get_column_defs()) {
+                if (cdef->id >= first_filtering_id && !column_uses_indexing(cdef)) { // skip columns below the threshold and columns served by the index
+                    column_defs_for_filtering.emplace_back(cdef);
+                }
+            }
+        }
+        for (auto&& cdef : _nonprimary_key_restrictions->get_column_defs()) { // non-primary-key columns: everything the chosen index does not depend on
+            if (!column_uses_indexing(cdef)) {
+                column_defs_for_filtering.emplace_back(cdef);
+            }
+        }
+    }
+    return column_defs_for_filtering;
+}
+
+void statement_restrictions::process_partition_key_restrictions(bool has_queriable_index, bool for_view, bool allow_filtering) {
    // If there is a queriable index, no special condition are required on the other restrictions.
    // But we still need to know 2 things:
    // - If we don't have a queriable index, is the query ok
@@ -321,39 +391,32 @@ void statement_restrictions::process_partition_key_restrictions(bool has_queriab
    // components must have a EQ. Only the last partition key component can be in IN relation.
    if (_partition_key_restrictions->is_on_token()) {
        _is_key_range = true;
-    } else if (has_partition_key_unrestricted_components()) {
-        if (!_partition_key_restrictions->empty() && !for_view) {
-            if (!has_queriable_index) {
-                throw exceptions::invalid_request_exception(sprint("Partition key parts: %s must be restricted as other parts are",
-                    join(", ", get_partition_key_unrestricted_components())));
-            }
-        }
-
+    } else if (_partition_key_restrictions->has_unrestricted_components(*_schema)) {
        _is_key_range = true;
        _uses_secondary_indexing = has_queriable_index;
    }
-    if (_partition_key_restrictions->is_slice() && !_partition_key_restrictions->is_on_token() && !for_view) {
-        // A SELECT query may not request a slice (range) of partition keys
-        // without using token(). This is because there is no way to do this
-        // query efficiently: mumur3 turns a contiguous range of partition
-        // keys into tokens all over the token space.
-        // However, in a SELECT statement used to define a materialized view,
-        // such a slice is fine - it is used to check whether individual
-        // partitions, match, and does not present a performance problem.
-        throw exceptions::invalid_request_exception(
-                "Only EQ and IN relation are supported on the partition key (unless you use the token() function)");
+
+    if (_partition_key_restrictions->needs_filtering(*_schema)) {
+        if (!allow_filtering && !for_view && !has_queriable_index) {
+            throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
+                "thus may have unpredictable performance. If you want to execute "
+                "this query despite the performance unpredictability, use ALLOW FILTERING");
+        }
+        _is_key_range = true;
+        _uses_secondary_indexing = has_queriable_index;
    }
+
 }

 bool statement_restrictions::has_partition_key_unrestricted_components() const {
-    return _partition_key_restrictions->size() < _schema->partition_key_size();
+    return _partition_key_restrictions->has_unrestricted_components(*_schema);
 }

 bool statement_restrictions::has_unrestricted_clustering_columns() const {
-    return _clustering_columns_restrictions->size() < _schema->clustering_key_size();
+    return _clustering_columns_restrictions->has_unrestricted_components(*_schema);
 }

-void statement_restrictions::process_clustering_columns_restrictions(bool has_queriable_index, bool select_a_collection, bool for_view) {
+void statement_restrictions::process_clustering_columns_restrictions(bool has_queriable_index, bool select_a_collection, bool for_view, bool allow_filtering) {
    if (!has_clustering_columns_restriction()) {
        return;
    }
@@ -362,38 +425,36 @@ void statement_restrictions::process_clustering_columns_restrictions(bool has_qu
        throw exceptions::invalid_request_exception(
            "Cannot restrict clustering columns by IN relations when a collection is selected by the query");
    }
-    if (_clustering_columns_restrictions->is_contains() && !has_queriable_index) {
+    if (_clustering_columns_restrictions->is_contains() && !has_queriable_index && !allow_filtering) {
        throw exceptions::invalid_request_exception(
-            "Cannot restrict clustering columns by a CONTAINS relation without a secondary index");
+            "Cannot restrict clustering columns by a CONTAINS relation without a secondary index or filtering");
    }

-    auto clustering_columns_iter = _schema->clustering_key_columns().begin();
-
-    for (auto&& restricted_column : _clustering_columns_restrictions->get_column_defs()) {
-        const column_definition* clustering_column = &(*clustering_columns_iter);
-        ++clustering_columns_iter;
-
-        if (clustering_column != restricted_column && !for_view) {
-            if (!has_queriable_index) {
-                throw exceptions::invalid_request_exception(sprint(
-                    "PRIMARY KEY column \"%s\" cannot be restricted as preceding column \"%s\" is not restricted",
-                    restricted_column->name_as_text(), clustering_column->name_as_text()));
+    if (has_clustering_columns_restriction() && _clustering_columns_restrictions->needs_filtering(*_schema)) {
+        if (has_queriable_index) {
+            _uses_secondary_indexing = true;
+        } else if (!allow_filtering && !for_view) {
+            auto clustering_columns_iter = _schema->clustering_key_columns().begin();
+            for (auto&& restricted_column : _clustering_columns_restrictions->get_column_defs()) {
+                const column_definition* clustering_column = &(*clustering_columns_iter);
+                ++clustering_columns_iter;
+                if (clustering_column != restricted_column) {
+                        throw exceptions::invalid_request_exception(sprint(
+                            "PRIMARY KEY column \"%s\" cannot be restricted as preceding column \"%s\" is not restricted",
+                            restricted_column->name_as_text(), clustering_column->name_as_text()));
+                }
            }
-
-            _uses_secondary_indexing = true; // handle gaps and non-keyrange cases.
-            break;
        }
    }
-
-    if (_clustering_columns_restrictions->is_contains()) {
-        _uses_secondary_indexing = true;
-    }
 }

 dht::partition_range_vector statement_restrictions::get_partition_key_ranges(const query_options& options) const {
    if (_partition_key_restrictions->empty()) {
        return {dht::partition_range::make_open_ended_both_sides()};
    }
+    if (_partition_key_restrictions->needs_filtering(*_schema)) {
+        return {dht::partition_range::make_open_ended_both_sides()};
+    }
    return _partition_key_restrictions->bounds_ranges(options);
 }

@@ -401,18 +462,40 @@ std::vector<query::clustering_range> statement_restrictions::get_clustering_boun
    if (_clustering_columns_restrictions->empty()) {
        return {query::clustering_range::make_open_ended_both_sides()};
    }
+    if (_clustering_columns_restrictions->needs_filtering(*_schema)) {
+        if (auto single_ck_restrictions = dynamic_pointer_cast<single_column_primary_key_restrictions<clustering_key>>(_clustering_columns_restrictions)) {
+            return single_ck_restrictions->get_longest_prefix_restrictions()->bounds_ranges(options);
+        }
+        return {query::clustering_range::make_open_ended_both_sides()};
+    }
    return _clustering_columns_restrictions->bounds_ranges(options);
 }

-bool statement_restrictions::need_filtering() {
-    uint32_t number_of_restricted_columns = 0;
+bool statement_restrictions::need_filtering() const {
+    uint32_t number_of_restricted_columns_for_indexing = 0;
    for (auto&& restrictions : _index_restrictions) {
-        number_of_restricted_columns += restrictions->size();
+        number_of_restricted_columns_for_indexing += restrictions->size();
    }

-    return number_of_restricted_columns > 1
-           || (number_of_restricted_columns == 0 && has_clustering_columns_restriction())
-           || (number_of_restricted_columns != 0 && _nonprimary_key_restrictions->has_multiple_contains());
+    int number_of_filtering_restrictions = _nonprimary_key_restrictions->size();
+    // If the whole partition key is restricted, it does not imply filtering
+    if (_partition_key_restrictions->has_unrestricted_components(*_schema) || !_partition_key_restrictions->is_all_eq()) {
+        number_of_filtering_restrictions += _partition_key_restrictions->size() + _clustering_columns_restrictions->size();
+    } else if (_clustering_columns_restrictions->has_unrestricted_components(*_schema)) {
+        number_of_filtering_restrictions += _clustering_columns_restrictions->size() - _clustering_columns_restrictions->prefix_size();
+    }
+
+    if (_partition_key_restrictions->is_multi_column() || _clustering_columns_restrictions->is_multi_column()) {
+        // TODO(sarna): Implement ALLOW FILTERING support for multi-column restrictions - return false for now
+        // in order to ensure backwards compatibility
+        return false;
+    }
+
+    return number_of_restricted_columns_for_indexing > 1
+            || (number_of_restricted_columns_for_indexing == 0 && _partition_key_restrictions->empty() && !_clustering_columns_restrictions->empty())
+            || (number_of_restricted_columns_for_indexing != 0 && _nonprimary_key_restrictions->has_multiple_contains())
+            || (number_of_restricted_columns_for_indexing != 0 && !_uses_secondary_indexing)
+            || (_uses_secondary_indexing && number_of_filtering_restrictions > 1);
 }

 void statement_restrictions::validate_secondary_index_selections(bool selects_only_static_columns) {
@@ -430,6 +513,33 @@ void statement_restrictions::validate_secondary_index_selections(bool selects_on
    }
 }

+// Partition key restrictions as a per-column map; a static empty map while they are still initial_key_restrictions; throws otherwise.
+const single_column_restrictions::restrictions_map& statement_restrictions::get_single_column_partition_key_restrictions() const {
+    static single_column_restrictions::restrictions_map empty;
+    if (auto per_column = dynamic_pointer_cast<single_column_primary_key_restrictions<partition_key>>(_partition_key_restrictions)) {
+        return per_column->restrictions();
+    }
+    if (dynamic_pointer_cast<initial_key_restrictions<partition_key>>(_partition_key_restrictions)) {
+        return empty;
+    }
+    throw std::runtime_error("statement restrictions for multi-column partition key restrictions are not implemented yet");
+}
+
+/**
+ * @return clustering key restrictions split into single column restrictions (e.g. for filtering support).
+ * @throws std::runtime_error if the clustering restrictions are neither single-column nor initial_key_restrictions.
+ */
+const single_column_restrictions::restrictions_map& statement_restrictions::get_single_column_clustering_key_restrictions() const {
+    static single_column_restrictions::restrictions_map empty;
+    if (auto per_column = dynamic_pointer_cast<single_column_primary_key_restrictions<clustering_key>>(_clustering_columns_restrictions)) {
+        return per_column->restrictions();
+    }
+    if (dynamic_pointer_cast<initial_key_restrictions<clustering_key>>(_clustering_columns_restrictions)) {
+        return empty;
+    }
+    throw std::runtime_error("statement restrictions for multi-column clustering key restrictions are not implemented yet");
+}
+
 static std::optional<atomic_cell_value_view> do_get_value(const schema& schema,
        const column_definition& cdef,
        const partition_key& key,
@@ -482,6 +592,14 @@ bool single_column_restriction::EQ::is_satisfied_by(const schema& schema,
    return false;
 }

+// Compares one serialized value against the EQ operand using the column's type; an absent operand never matches.
+bool single_column_restriction::EQ::is_satisfied_by(bytes_view data, const query_options& options) const {
+    if (_column_def.type->is_counter()) { fail(unimplemented::cause::COUNTERS); }
+    const auto expected = value(options);
+    if (!expected) { return false; }
+    return _column_def.type->compare(*expected, data) == 0;
+}
+
 bool single_column_restriction::IN::is_satisfied_by(const schema& schema,
        const partition_key& key,
        const clustering_key_prefix& ckey,
@@ -503,6 +621,16 @@ bool single_column_restriction::IN::is_satisfied_by(const schema& schema,
  });
 }

+// True when `data` compares equal (by the column's type) to at least one non-null value of this IN restriction.
+bool single_column_restriction::IN::is_satisfied_by(bytes_view data, const query_options& options) const {
+    if (_column_def.type->is_counter()) { fail(unimplemented::cause::COUNTERS); }
+    const auto candidates = values(options);
+    for (const bytes_opt& candidate : candidates) {
+        if (candidate && _column_def.type->compare(*candidate, data) == 0) { return true; }
+    }
+    return false;
+}
+
 static query::range<bytes_view> to_range(const term_slice& slice, const query_options& options) {
    using range_type = query::range<bytes_view>;
    auto extract_bound = [&] (statements::bound bound) -> stdx::optional<range_type::bound> {
@@ -513,7 +641,8 @@ static query::range<bytes_view> to_range(const term_slice& slice, const query_op
        if (!value) {
            return { };
        }
-        return { range_type::bound(*value, slice.is_inclusive(bound)) };
+        auto value_view = options.linearize(*value);
+        return { range_type::bound(value_view, slice.is_inclusive(bound)) };
    };
    return range_type(
        extract_bound(statements::bound::START),
@@ -538,6 +667,13 @@ bool single_column_restriction::slice::is_satisfied_by(const schema& schema,
    });
 }

+bool single_column_restriction::slice::is_satisfied_by(bytes_view data, const query_options& options) const { // tests one serialized value against the slice's bounds
+    if (_column_def.type->is_counter()) {
+        fail(unimplemented::cause::COUNTERS);
+    }
+    return to_range(_slice, options).contains(data, _column_def.type->underlying_type()->as_tri_comparator()); // range comes from to_range(); ordering from the underlying type's tri-comparator
+}
+
 bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
        const partition_key& key,
        const clustering_key_prefix& ckey,
@@ -571,10 +707,12 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
            if (!val) {
                continue;
            }
-            auto found = std::find_if(elements.begin(), end, [&] (auto&& element) {
+            auto found = with_linearized(*val, [&] (bytes_view bv) {
+              return std::find_if(elements.begin(), end, [&] (auto&& element) {
                return element.second.value().with_linearized([&] (bytes_view value_bv) {
-                    return element_type->compare(value_bv, *val) == 0;
+                    return element_type->compare(value_bv, bv) == 0;
                });
+              });
            });
            if (found == end) {
                return false;
@@ -585,8 +723,10 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
            if (!k) {
                continue;
            }
-            auto found = std::find_if(elements.begin(), end, [&] (auto&& element) {
-                return map_key_type->compare(element.first, *k) == 0;
+            auto found = with_linearized(*k, [&] (bytes_view bv) {
+              return std::find_if(elements.begin(), end, [&] (auto&& element) {
+                return map_key_type->compare(element.first, bv) == 0;
+              });
            });
            if (found == end) {
                return false;
@@ -598,14 +738,18 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
            if (!map_key || !map_value) {
                continue;
            }
-            auto found = std::find_if(elements.begin(), end, [&] (auto&& element) {
-                return map_key_type->compare(element.first, *map_key) == 0;
+            auto found = with_linearized(*map_key, [&] (bytes_view map_key_bv) {
+              return std::find_if(elements.begin(), end, [&] (auto&& element) {
+                return map_key_type->compare(element.first, map_key_bv) == 0;
+              });
            });
            if (found == end) {
                return false;
            }
-            auto cmp = found->second.value().with_linearized([&] (bytes_view value_bv) {
-                return element_type->compare(value_bv, *map_value);
+            auto cmp = with_linearized(*map_value, [&] (bytes_view map_value_bv) {
+              return found->second.value().with_linearized([&] (bytes_view value_bv) {
+                return element_type->compare(value_bv, map_value_bv);
+              });
            });
            if (cmp != 0) {
                return false;
@@ -622,13 +766,14 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
            return _column_def.type->deserialize(cell_value_bv);
        });
        for (auto&& value : _values) {
-            auto val = value->bind_and_get(options);
-            if (!val) {
+            auto fragmented_val = value->bind_and_get(options);
+            if (!fragmented_val) {
                continue;
            }
+          return with_linearized(*fragmented_val, [&] (bytes_view val) {
            auto exists_in = [&](auto&& range) {
                auto found = std::find_if(range.begin(), range.end(), [&] (auto&& element) {
-                    return element_type->compare(element.serialize(), *val) == 0;
+                    return element_type->compare(element.serialize(), val) == 0;
                });
                return found != range.end();
            };
@@ -646,6 +791,8 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
                    return false;
                }
            }
+            return true;
+          });
        }
        if (col_type->is_map()) {
            auto& data_map = value_cast<map_type_impl::native_type>(deserialized);
@@ -654,8 +801,10 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
                if (!k) {
                    continue;
                }
-                auto found = std::find_if(data_map.begin(), data_map.end(), [&] (auto&& element) {
-                    return map_key_type->compare(element.first.serialize(), *k) == 0;
+                auto found = with_linearized(*k, [&] (bytes_view k_bv) {
+                  return std::find_if(data_map.begin(), data_map.end(), [&] (auto&& element) {
+                    return map_key_type->compare(element.first.serialize(), k_bv) == 0;
+                  });
                });
                if (found == data_map.end()) {
                    return false;
@@ -667,10 +816,15 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
                if (!map_key || !map_value) {
                    continue;
                }
-                auto found = std::find_if(data_map.begin(), data_map.end(), [&] (auto&& element) {
-                    return map_key_type->compare(element.first.serialize(), *map_key) == 0;
+                auto found = with_linearized(*map_key, [&] (bytes_view map_key_bv) {
+                  return std::find_if(data_map.begin(), data_map.end(), [&] (auto&& element) {
+                    return map_key_type->compare(element.first.serialize(), map_key_bv) == 0;
+                  });
                });
-                if (found == data_map.end() || element_type->compare(found->second.serialize(), *map_value) != 0) {
+                if (found == data_map.end()
+                    || with_linearized(*map_value, [&] (bytes_view map_value_bv) {
+                         return element_type->compare(found->second.serialize(), map_value_bv);
+                       }) != 0) {
                    return false;
                }
            }
@@ -680,6 +834,11 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
    return true;
 }

+bool single_column_restriction::contains::is_satisfied_by(bytes_view data, const query_options& options) const { // not implemented: data and options are unused
+    //TODO(sarna): Deserialize & return. It would be nice to deduplicate, is_satisfied_by above is rather long
+    fail(unimplemented::cause::INDEXES); // NOTE(review): no return statement follows - presumably fail() never returns; confirm
+}
+
 bool token_restriction::EQ::is_satisfied_by(const schema& schema,
        const partition_key& key,
        const clustering_key_prefix& ckey,
--- a/cql3/restrictions/statement_restrictions.hh
+++ b/cql3/restrictions/statement_restrictions.hh
@@ -120,8 +120,8 @@ public:
        bool for_view = false,
        bool allow_filtering = false);
 private:
-    void add_restriction(::shared_ptr<restriction> restriction);
-    void add_single_column_restriction(::shared_ptr<single_column_restriction> restriction);
+    void add_restriction(::shared_ptr<restriction> restriction, bool for_view, bool allow_filtering);
+    void add_single_column_restriction(::shared_ptr<single_column_restriction> restriction, bool for_view, bool allow_filtering);
 public:
    bool uses_function(const sstring& ks_name, const sstring& function_name) const;

@@ -163,6 +163,20 @@ public:
        return _clustering_columns_restrictions;
    }

+    /**
+     * Builds a possibly empty collection of column definitions that will be used for filtering
+     * @param db - the database context
+     * @return A list with the column definitions needed for filtering.
+     */
+    std::vector<const column_definition*> get_column_defs_for_filtering(database& db) const;
+
+    /**
+     * Determines the index to be used with the restriction.
+     * @param sim - the secondary index manager in which a usable index is looked up
+     * @return If an index can be used, an optional containing this index, otherwise an empty optional.
+     */
+    std::optional<secondary_index::index> find_idx(secondary_index::secondary_index_manager& sim) const;
+
    /**
     * Checks if the partition key has some unrestricted components.
     * @return <code>true</code> if the partition key has some unrestricted components, <code>false</code> otherwise.
@@ -175,7 +189,7 @@ public:
     */
    bool has_unrestricted_clustering_columns() const;
 private:
-    void process_partition_key_restrictions(bool has_queriable_index, bool for_view);
+    void process_partition_key_restrictions(bool has_queriable_index, bool for_view, bool allow_filtering);

    /**
     * Returns the partition key components that are not restricted.
@@ -190,7 +204,7 @@ private:
     * @param select_a_collection <code>true</code> if the query should return a collection column
     * @throws InvalidRequestException if the request is invalid
     */
-    void process_clustering_columns_restrictions(bool has_queriable_index, bool select_a_collection, bool for_view);
+    void process_clustering_columns_restrictions(bool has_queriable_index, bool select_a_collection, bool for_view, bool allow_filtering);

    /**
     * Returns the <code>Restrictions</code> for the specified type of columns.
@@ -358,7 +372,7 @@ public:
     * Checks if the query need to use filtering.
     * @return <code>true</code> if the query need to use filtering, <code>false</code> otherwise.
     */
-    bool need_filtering();
+    bool need_filtering() const;

    void validate_secondary_index_selections(bool selects_only_static_columns);

@@ -381,6 +395,14 @@ public:
        return !_nonprimary_key_restrictions->empty();
    }

+    bool pk_restrictions_need_filtering() const {
+        return _partition_key_restrictions->needs_filtering(*_schema);
+    }
+
+    bool ck_restrictions_need_filtering() const {
+        return _partition_key_restrictions->has_unrestricted_components(*_schema) || _clustering_columns_restrictions->needs_filtering(*_schema);
+    }
+
    /**
     * @return true if column is restricted by some restriction, false otherwise
     */
@@ -399,6 +421,16 @@ public:
    const single_column_restrictions::restrictions_map& get_non_pk_restriction() const {
        return _nonprimary_key_restrictions->restrictions();
    }
+
+    /**
+     * @return partition key restrictions split into single column restrictions (e.g. for filtering support).
+     */
+    const single_column_restrictions::restrictions_map& get_single_column_partition_key_restrictions() const;
+
+    /**
+     * @return clustering key restrictions split into single column restrictions (e.g. for filtering support).
+     */
+    const single_column_restrictions::restrictions_map& get_single_column_clustering_key_restrictions() const;
 };

 }
--- a/cql3/result_set.cc
+++ b/cql3/result_set.cc
@@ -45,27 +45,25 @@ namespace cql3 {

 metadata::metadata(std::vector<::shared_ptr<column_specification>> names_)
        : _flags(flag_enum_set())
-        , names(std::move(names_)) {
-    _column_count = names.size();
-}
+        , _column_info(make_lw_shared<column_info>(std::move(names_)))
+{ }

 metadata::metadata(flag_enum_set flags, std::vector<::shared_ptr<column_specification>> names_, uint32_t column_count,
        ::shared_ptr<const service::pager::paging_state> paging_state)
    : _flags(flags)
-    , names(std::move(names_))
-    , _column_count(column_count)
+    , _column_info(make_lw_shared<column_info>(std::move(names_), column_count))
    , _paging_state(std::move(paging_state))
 { }

 // The maximum number of values that the ResultSet can hold. This can be bigger than columnCount due to CASSANDRA-4911
 uint32_t metadata::value_count() const {
-    return _flags.contains<flag::NO_METADATA>() ? _column_count : names.size();
+    return _flags.contains<flag::NO_METADATA>() ? _column_info->_column_count : _column_info->_names.size();
 }

 void metadata::add_non_serialized_column(::shared_ptr<column_specification> name) {
    // See comment above. Because columnCount doesn't account the newly added name, it
    // won't be serialized.
-    names.emplace_back(std::move(name));
+    _column_info->_names.emplace_back(std::move(name));
 }

 bool metadata::all_in_same_cf() const {
@@ -73,18 +71,24 @@ bool metadata::all_in_same_cf() const {
        return false;
    }

-    return column_specification::all_in_same_table(names);
+    return column_specification::all_in_same_table(_column_info->_names);
 }

-void metadata::set_has_more_pages(::shared_ptr<const service::pager::paging_state> paging_state) {
-    if (!paging_state) {
-        return;
-    }
-
+void metadata::set_paging_state(::shared_ptr<const service::pager::paging_state> paging_state) {
    _flags.set<flag::HAS_MORE_PAGES>();
    _paging_state = std::move(paging_state);
 }

+void metadata::maybe_set_paging_state(::shared_ptr<const service::pager::paging_state> paging_state) {
+    assert(paging_state);
+    if (paging_state->get_remaining() > 0) {
+        set_paging_state(std::move(paging_state));
+    } else {
+        _flags.remove<flag::HAS_MORE_PAGES>();
+        _paging_state = nullptr;
+    }
+}
+
 void metadata::set_skip_metadata() {
    _flags.set<flag::NO_METADATA>();
 }
@@ -93,18 +97,10 @@ metadata::flag_enum_set metadata::flags() const {
    return _flags;
 }

-uint32_t metadata::column_count() const {
-    return _column_count;
-}
-
 ::shared_ptr<const service::pager::paging_state> metadata::paging_state() const {
    return _paging_state;
 }

-const std::vector<::shared_ptr<column_specification>>& metadata::get_names() const {
-    return names;
-}
-
 prepared_metadata::prepared_metadata(const std::vector<::shared_ptr<column_specification>>& names,
                                     const std::vector<uint16_t>& partition_key_bind_indices)
    : _names{names}
--- a/cql3/result_set.hh
+++ b/cql3/result_set.hh
@@ -70,18 +70,29 @@ public:

    using flag_enum_set = enum_set<flag_enum>;

-private:
-    flag_enum_set _flags;
-
-public:
+    struct column_info {
    // Please note that columnCount can actually be smaller than names, even if names is not null. This is
    // used to include columns in the resultSet that we need to do post-query re-orderings
    // (SelectStatement.orderResults) but that shouldn't be sent to the user as they haven't been requested
    // (CASSANDRA-4911). So the serialization code will exclude any columns in name whose index is >= columnCount.
-    std::vector<::shared_ptr<column_specification>> names;
+        std::vector<::shared_ptr<column_specification>> _names;
+        uint32_t _column_count;
+
+        column_info(std::vector<::shared_ptr<column_specification>> names, uint32_t column_count)
+            : _names(std::move(names))
+            , _column_count(column_count)
+        { }
+
+        explicit column_info(std::vector<::shared_ptr<column_specification>> names)
+            : _names(std::move(names))
+            , _column_count(_names.size())
+        { }
+    };
+private:
+    flag_enum_set _flags;

 private:
-    uint32_t _column_count;
+    lw_shared_ptr<column_info> _column_info;
    ::shared_ptr<const service::pager::paging_state> _paging_state;

 public:
@@ -99,17 +110,20 @@ private:
    bool all_in_same_cf() const;

 public:
-    void set_has_more_pages(::shared_ptr<const service::pager::paging_state> paging_state);
+    void set_paging_state(::shared_ptr<const service::pager::paging_state> paging_state);
+    void maybe_set_paging_state(::shared_ptr<const service::pager::paging_state> paging_state);

    void set_skip_metadata();

    flag_enum_set flags() const;

-    uint32_t column_count() const;
+    uint32_t column_count() const { return _column_info->_column_count; }

    ::shared_ptr<const service::pager::paging_state> paging_state() const;

-    const std::vector<::shared_ptr<column_specification>>& get_names() const;
+    const std::vector<::shared_ptr<column_specification>>& get_names() const {
+        return _column_info->_names;
+    }
 };

 ::shared_ptr<const cql3::metadata> make_empty_metadata();
@@ -223,14 +237,14 @@ public:
 class result {
    std::unique_ptr<cql3::result_set> _result_set;
    result_generator _result_generator;
-    shared_ptr<cql3::metadata> _metadata;
+    shared_ptr<const cql3::metadata> _metadata;
 public:
    explicit result(std::unique_ptr<cql3::result_set> rs)
        : _result_set(std::move(rs))
        , _metadata(_result_set->_metadata)
    { }

-    explicit result(result_generator generator, shared_ptr<metadata> m)
+    explicit result(result_generator generator, shared_ptr<const metadata> m)
        : _result_generator(std::move(generator))
        , _metadata(std::move(m))
    { }
@@ -240,7 +254,7 @@ public:
        if (_result_set) {
            return *_result_set;
        } else {
-            auto builder = result_set::builder(_metadata);
+            auto builder = result_set::builder(make_shared<cql3::metadata>(*_metadata));
            _result_generator.visit(builder);
            return std::move(builder).get_result_set();
        }
--- a/cql3/selection/selectable.cc
+++ b/cql3/selection/selectable.cc
@@ -142,7 +142,7 @@ shared_ptr<selector::factory>
 selectable::with_field_selection::new_selector_factory(database& db, schema_ptr s, std::vector<const column_definition*>& defs) {
    auto&& factory = _selected->new_selector_factory(db, s, defs);
    auto&& type = factory->new_instance()->get_type();
-    auto&& ut = dynamic_pointer_cast<const user_type_impl>(std::move(type));
+    auto&& ut = dynamic_pointer_cast<const user_type_impl>(type->underlying_type());
    if (!ut) {
        throw exceptions::invalid_request_exception(
                sprint("Invalid field selection: %s of type %s is not a user type",
--- a/cql3/selection/selection.cc
+++ b/cql3/selection/selection.cc
@@ -40,6 +40,7 @@
 */

 #include <boost/range/adaptor/transformed.hpp>
+#include <boost/range/adaptor/filtered.hpp>

 #include "cql3/selection/selection.hh"
 #include "cql3/selection/selector_factories.hh"
@@ -155,9 +156,9 @@ public:
        return _factories->uses_function(ks_name, function_name);
    }

-    virtual uint32_t add_column_for_ordering(const column_definition& c) override {
-        uint32_t index = selection::add_column_for_ordering(c);
-        _factories->add_selector_for_ordering(c, index);
+    virtual uint32_t add_column_for_post_processing(const column_definition& c) override {
+        uint32_t index = selection::add_column_for_post_processing(c);
+        _factories->add_selector_for_post_processing(c, index);
        return index;
    }

@@ -208,9 +209,17 @@ protected:

 ::shared_ptr<selection> selection::wildcard(schema_ptr schema) {
    auto columns = schema->all_columns_in_select_order();
-    auto cds = boost::copy_range<std::vector<const column_definition*>>(columns | boost::adaptors::transformed([](const column_definition& c) {
-        return &c;
-    }));
+    // Filter out hidden (view-virtual) columns, which should not be seen by the
+    // user when doing "SELECT *". We also disallow selecting them
+    // individually (see column_identifier::new_selector_factory()).
+    auto cds = boost::copy_range<std::vector<const column_definition*>>(
+        columns |
+        boost::adaptors::filtered([](const column_definition& c) {
+            return !c.is_view_virtual();
+        }) |
+        boost::adaptors::transformed([](const column_definition& c) {
+            return &c;
+        }));
    return simple_selection::make(schema, std::move(cds), true);
 }

@@ -218,7 +227,7 @@ protected:
    return simple_selection::make(schema, std::move(columns), false);
 }

-uint32_t selection::add_column_for_ordering(const column_definition& c) {
+uint32_t selection::add_column_for_post_processing(const column_definition& c) {
    _columns.push_back(&c);
    _metadata->add_non_serialized_column(c.column_specification);
    return _columns.size() - 1;
@@ -330,93 +339,106 @@ std::unique_ptr<result_set> result_set_builder::build() {
    return std::move(_result_set);
 }

-result_set_builder::visitor::visitor(
-        cql3::selection::result_set_builder& builder, const schema& s,
-        const selection& selection)
-        : _builder(builder), _schema(s), _selection(selection), _row_count(0) {
-}
+bool result_set_builder::restrictions_filter::do_filter(const selection& selection,
+                                                         const std::vector<bytes>& partition_key,
+                                                         const std::vector<bytes>& clustering_key,
+                                                         const query::result_row_view& static_row,
+                                                         const query::result_row_view& row) const {
+    static logging::logger rlogger("restrictions_filter");

-void result_set_builder::visitor::add_value(const column_definition& def,
-        query::result_row_view::iterator_type& i) {
-    if (def.type->is_multi_cell()) {
-        auto cell = i.next_collection_cell();
-        if (!cell) {
-            _builder.add_empty();
-            return;
-        }
-        _builder.add_collection(def, cell->linearize());
-    } else {
-        auto cell = i.next_atomic_cell();
-        if (!cell) {
-            _builder.add_empty();
-            return;
-        }
-        _builder.add(def, *cell);
+    if (_current_partition_key_does_not_match || _current_static_row_does_not_match || _remaining == 0) {
+        return false;
    }
-}

-void result_set_builder::visitor::accept_new_partition(const partition_key& key,
-        uint32_t row_count) {
-    _partition_key = key.explode(_schema);
-    _row_count = row_count;
-}
-
-void result_set_builder::visitor::accept_new_partition(uint32_t row_count) {
-    _row_count = row_count;
-}
-
-void result_set_builder::visitor::accept_new_row(const clustering_key& key,
-        const query::result_row_view& static_row,
-        const query::result_row_view& row) {
-    _clustering_key = key.explode(_schema);
-    accept_new_row(static_row, row);
-}
-
-void result_set_builder::visitor::accept_new_row(
-        const query::result_row_view& static_row,
-        const query::result_row_view& row) {
    auto static_row_iterator = static_row.iterator();
    auto row_iterator = row.iterator();
-    _builder.new_row();
-    for (auto&& def : _selection.get_columns()) {
-        switch (def->kind) {
-        case column_kind::partition_key:
-            _builder.add(_partition_key[def->component_index()]);
-            break;
-        case column_kind::clustering_key:
-            if (_clustering_key.size() > def->component_index()) {
-                _builder.add(_clustering_key[def->component_index()]);
+    auto non_pk_restrictions_map = _restrictions->get_non_pk_restriction();
+    auto partition_key_restrictions_map = _restrictions->get_single_column_partition_key_restrictions();
+    auto clustering_key_restrictions_map = _restrictions->get_single_column_clustering_key_restrictions();
+    for (auto&& cdef : selection.get_columns()) {
+        switch (cdef->kind) {
+        case column_kind::static_column:
+            // fallthrough
+        case column_kind::regular_column: {
+            auto& cell_iterator = (cdef->kind == column_kind::static_column) ? static_row_iterator : row_iterator;
+            if (cdef->type->is_multi_cell()) {
+                cell_iterator.next_collection_cell();
+                auto restr_it = non_pk_restrictions_map.find(cdef);
+                if (restr_it == non_pk_restrictions_map.end()) {
+                    continue;
+                }
+                throw exceptions::invalid_request_exception("Collection filtering is not supported yet");
            } else {
-                _builder.add({});
+                auto cell = cell_iterator.next_atomic_cell();
+
+                auto restr_it = non_pk_restrictions_map.find(cdef);
+                if (restr_it == non_pk_restrictions_map.end()) {
+                    continue;
+                }
+                restrictions::single_column_restriction& restriction = *restr_it->second;
+
+                bool regular_restriction_matches;
+                if (cell) {
+                    regular_restriction_matches = cell->value().with_linearized([&restriction, this](bytes_view data) {
+                        return restriction.is_satisfied_by(data, _options);
+                    });
+                } else {
+                    regular_restriction_matches = restriction.is_satisfied_by(bytes(), _options);
+                }
+                if (!regular_restriction_matches) {
+                    _current_static_row_does_not_match = (cdef->kind == column_kind::static_column);
+                    return false;
+                }
+
+            }
            }
            break;
-        case column_kind::regular_column:
-            add_value(*def, row_iterator);
+        case column_kind::partition_key: {
+            auto restr_it = partition_key_restrictions_map.find(cdef);
+            if (restr_it == partition_key_restrictions_map.end()) {
+                continue;
+            }
+            restrictions::single_column_restriction& restriction = *restr_it->second;
+            const bytes& value_to_check = partition_key[cdef->id];
+            bool pk_restriction_matches = restriction.is_satisfied_by(value_to_check, _options);
+            if (!pk_restriction_matches) {
+                _current_partition_key_does_not_match = true;
+                return false;
+            }
+            }
            break;
-        case column_kind::static_column:
-            add_value(*def, static_row_iterator);
+        case column_kind::clustering_key: {
+            auto restr_it = clustering_key_restrictions_map.find(cdef);
+            if (restr_it == clustering_key_restrictions_map.end()) {
+                continue;
+            }
+            restrictions::single_column_restriction& restriction = *restr_it->second;
+            const bytes& value_to_check = clustering_key[cdef->id];
+            bool pk_restriction_matches = restriction.is_satisfied_by(value_to_check, _options);
+            if (!pk_restriction_matches) {
+                return false;
+            }
+            }
            break;
        default:
-            assert(0);
+            break;
        }
    }
+    return true;
 }

-void result_set_builder::visitor::accept_partition_end(
-        const query::result_row_view& static_row) {
-    if (_row_count == 0) {
-        _builder.new_row();
-        auto static_row_iterator = static_row.iterator();
-        for (auto&& def : _selection.get_columns()) {
-            if (def->is_partition_key()) {
-                _builder.add(_partition_key[def->component_index()]);
-            } else if (def->is_static()) {
-                add_value(*def, static_row_iterator);
-            } else {
-                _builder.add_empty();
-            }
-        }
+bool result_set_builder::restrictions_filter::operator()(const selection& selection,
+                                                         const std::vector<bytes>& partition_key,
+                                                         const std::vector<bytes>& clustering_key,
+                                                         const query::result_row_view& static_row,
+                                                         const query::result_row_view& row) const {
+    const bool accepted = do_filter(selection, partition_key, clustering_key, static_row, row);
+    if (!accepted) {
+        ++_rows_dropped;
+    } else if (_remaining > 0) {
+        --_remaining;
    }
+    return accepted;
 }

 api::timestamp_type result_set_builder::timestamp_of(size_t idx) {
--- a/cql3/selection/selection.hh
+++ b/cql3/selection/selection.hh
@@ -48,6 +48,7 @@
 #include "exceptions/exceptions.hh"
 #include "cql3/selection/raw_selector.hh"
 #include "cql3/selection/selector_factories.hh"
+#include "cql3/restrictions/statement_restrictions.hh"
 #include "unimplemented.hh"

 namespace cql3 {
@@ -168,10 +169,14 @@ public:
        return _metadata;
    }

+    ::shared_ptr<metadata> get_result_metadata() {
+        return _metadata;
+    }
+
    static ::shared_ptr<selection> wildcard(schema_ptr schema);
    static ::shared_ptr<selection> for_columns(schema_ptr schema, std::vector<const column_definition*> columns);

-    virtual uint32_t add_column_for_ordering(const column_definition& c);
+    virtual uint32_t add_column_for_post_processing(const column_definition& c);

    virtual bool uses_function(const sstring &ks_name, const sstring& function_name) const {
        return false;
@@ -247,6 +252,40 @@ private:
    const gc_clock::time_point _now;
    cql_serialization_format _cql_serialization_format;
 public:
+    class nop_filter {
+    public:
+        inline bool operator()(const selection&, const std::vector<bytes>&, const std::vector<bytes>&, const query::result_row_view&, const query::result_row_view&) const {
+            return true;
+        }
+        void reset() {
+        }
+        uint32_t get_rows_dropped() const {
+            return 0;
+        }
+    };
+    class restrictions_filter {
+        ::shared_ptr<restrictions::statement_restrictions> _restrictions;
+        const query_options& _options;
+        mutable bool _current_partition_key_does_not_match = false;
+        mutable bool _current_static_row_does_not_match = false;
+        mutable uint32_t _rows_dropped = 0;
+        mutable uint32_t _remaining = 0;
+    public:
+        restrictions_filter() = default;
+        explicit restrictions_filter(::shared_ptr<restrictions::statement_restrictions> restrictions, const query_options& options, uint32_t remaining) : _restrictions(restrictions), _options(options), _remaining(remaining) {}
+        bool operator()(const selection& selection, const std::vector<bytes>& pk, const std::vector<bytes>& ck, const query::result_row_view& static_row, const query::result_row_view& row) const;
+        void reset() {
+            _current_partition_key_does_not_match = false;
+            _current_static_row_does_not_match = false;
+            _rows_dropped = 0;
+        }
+        uint32_t get_rows_dropped() const {
+            return _rows_dropped;
+        }
+    private:
+        bool do_filter(const selection& selection, const std::vector<bytes>& pk, const std::vector<bytes>& ck, const query::result_row_view& static_row, const query::result_row_view& row) const;
+    };
+
    result_set_builder(const selection& s, gc_clock::time_point now, cql_serialization_format sf);
    void add_empty();
    void add(bytes_opt value);
@@ -256,8 +295,9 @@ public:
    std::unique_ptr<result_set> build();
    api::timestamp_type timestamp_of(size_t idx);
    int32_t ttl_of(size_t idx);
-    
+
    // Implements ResultVisitor concept from query.hh
+    template<typename Filter = nop_filter>
    class visitor {
    protected:
        result_set_builder& _builder;
@@ -266,20 +306,101 @@ public:
        uint32_t _row_count;
        std::vector<bytes> _partition_key;
        std::vector<bytes> _clustering_key;
+        Filter _filter;
    public:
-        visitor(cql3::selection::result_set_builder& builder, const schema& s, const selection&);
+        visitor(cql3::selection::result_set_builder& builder, const schema& s,
+                const selection& selection, Filter filter = Filter())
+            : _builder(builder)
+            , _schema(s)
+            , _selection(selection)
+            , _row_count(0)
+            , _filter(filter)
+        {}
        visitor(visitor&&) = default;

-        void add_value(const column_definition& def, query::result_row_view::iterator_type& i);
-        void accept_new_partition(const partition_key& key, uint32_t row_count);
-        void accept_new_partition(uint32_t row_count);
-        void accept_new_row(const clustering_key& key,
-                const query::result_row_view& static_row,
-                const query::result_row_view& row);
-        void accept_new_row(const query::result_row_view& static_row,
-                const query::result_row_view& row);
-        void accept_partition_end(const query::result_row_view& static_row);
+        void add_value(const column_definition& def, query::result_row_view::iterator_type& i) {
+            if (def.type->is_multi_cell()) {
+                auto cell = i.next_collection_cell();
+                if (!cell) {
+                    _builder.add_empty();
+                    return;
+                }
+                _builder.add_collection(def, cell->linearize());
+            } else {
+                auto cell = i.next_atomic_cell();
+                if (!cell) {
+                    _builder.add_empty();
+                    return;
+                }
+                _builder.add(def, *cell);
+            }
+        }
+
+        void accept_new_partition(const partition_key& key, uint32_t row_count) {
+            _partition_key = key.explode(_schema);
+            _row_count = row_count;
+            _filter.reset();
+        }
+
+        void accept_new_partition(uint32_t row_count) {
+            _row_count = row_count;
+            _filter.reset();
+        }
+
+        void accept_new_row(const clustering_key& key, const query::result_row_view& static_row, const query::result_row_view& row) {
+            _clustering_key = key.explode(_schema);
+            accept_new_row(static_row, row);
+        }
+
+        void accept_new_row(const query::result_row_view& static_row, const query::result_row_view& row) {
+            auto static_row_iterator = static_row.iterator();
+            auto row_iterator = row.iterator();
+            if (!_filter(_selection, _partition_key, _clustering_key, static_row, row)) {
+                return;
+            }
+            _builder.new_row();
+            for (auto&& def : _selection.get_columns()) {
+                switch (def->kind) {
+                case column_kind::partition_key:
+                    _builder.add(_partition_key[def->component_index()]);
+                    break;
+                case column_kind::clustering_key:
+                    if (_clustering_key.size() > def->component_index()) {
+                        _builder.add(_clustering_key[def->component_index()]);
+                    } else {
+                        _builder.add({});
+                    }
+                    break;
+                case column_kind::regular_column:
+                    add_value(*def, row_iterator);
+                    break;
+                case column_kind::static_column:
+                    add_value(*def, static_row_iterator);
+                    break;
+                default:
+                    assert(0);
+                }
+            }
+        }
+
+        uint32_t accept_partition_end(const query::result_row_view& static_row) {
+            if (_row_count == 0) {
+                _builder.new_row();
+                auto static_row_iterator = static_row.iterator();
+                for (auto&& def : _selection.get_columns()) {
+                    if (def->is_partition_key()) {
+                        _builder.add(_partition_key[def->component_index()]);
+                    } else if (def->is_static()) {
+                        add_value(*def, static_row_iterator);
+                    } else {
+                        _builder.add_empty();
+                    }
+                }
+            }
+            return _filter.get_rows_dropped();
+        }
    };
+
 private:
    bytes_opt get_value(data_type t, query::result_atomic_cell_view c);
 };
--- a/cql3/selection/selector_factories.cc
+++ b/cql3/selection/selector_factories.cc
@@ -53,6 +53,7 @@ selector_factories::selector_factories(std::vector<::shared_ptr<selectable>> sel
    : _contains_write_time_factory(false)
    , _contains_ttl_factory(false)
    , _number_of_aggregate_factories(0)
+    , _number_of_factories_for_post_processing(0)
 {
    _factories.reserve(selectables.size());

@@ -76,8 +77,9 @@ bool selector_factories::uses_function(const sstring& ks_name, const sstring& fu
    return false;
 }

-void selector_factories::add_selector_for_ordering(const column_definition& def, uint32_t index) {
+void selector_factories::add_selector_for_post_processing(const column_definition& def, uint32_t index) {
    _factories.emplace_back(simple_selector::new_factory(def.name_as_text(), index, def.type));
+    ++_number_of_factories_for_post_processing;
 }

 std::vector<::shared_ptr<selector>> selector_factories::new_instances() const {
--- a/cql3/selection/selector_factories.hh
+++ b/cql3/selection/selector_factories.hh
@@ -74,6 +74,11 @@ private:
     */
    uint32_t _number_of_aggregate_factories;

+    /**
+     * The number of factories that are only for post processing.
+     */
+    uint32_t _number_of_factories_for_post_processing;
+
 public:
    /**
     * Creates a new <code>SelectorFactories</code> instance and collect the column definitions.
@@ -97,11 +102,12 @@ public:
    bool uses_function(const sstring& ks_name, const sstring& function_name) const;

    /**
-     * Adds a new <code>Selector.Factory</code> for a column that is needed only for ORDER BY purposes.
+     * Adds a new <code>Selector.Factory</code> for a column that is needed only for ORDER BY or post
+     * processing purposes.
     * @param def the column that is needed for ordering
     * @param index the index of the column definition in the Selection's list of columns
     */
-    void add_selector_for_ordering(const column_definition& def, uint32_t index);
+    void add_selector_for_post_processing(const column_definition& def, uint32_t index);

    /**
     * Checks if this <code>SelectorFactories</code> contains only factories for aggregates.
@@ -111,7 +117,7 @@ public:
     */
    bool contains_only_aggregate_functions() const {
        auto size = _factories.size();
-        return size != 0 && _number_of_aggregate_factories == size;
+        return size != 0 && _number_of_aggregate_factories  == (size - _number_of_factories_for_post_processing);
    }

    /**
--- a/cql3/sets.cc
+++ b/cql3/sets.cc
@@ -120,17 +120,19 @@ sets::literal::to_string() const {
 }

 sets::value
-sets::value::from_serialized(bytes_view v, set_type type, cql_serialization_format sf) {
+sets::value::from_serialized(const fragmented_temporary_buffer::view& val, set_type type, cql_serialization_format sf) {
    try {
        // Collections have this small hack that validate cannot be called on a serialized object,
        // but compose does the validation (so we're fine).
        // FIXME: deserializeForNativeProtocol?!
+      return with_linearized(val, [&] (bytes_view v) {
        auto s = value_cast<set_type_impl::native_type>(type->deserialize(v, sf));
        std::set<bytes, serialized_compare> elements(type->get_elements_type()->as_less_comparator());
        for (auto&& element : s) {
            elements.insert(elements.end(), type->get_elements_type()->decompose(element));
        }
        return value(std::move(elements));
+      });
    } catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception(e.what());
    }
@@ -198,10 +200,10 @@ sets::delayed_value::bind(const query_options& options) {
            return constants::UNSET_VALUE;
        }
        // We don't support value > 64K because the serialization format encode the length as an unsigned short.
-        if (b->size() > std::numeric_limits<uint16_t>::max()) {
+        if (b->size_bytes() > std::numeric_limits<uint16_t>::max()) {
            throw exceptions::invalid_request_exception(sprint("Set value is too long. Set values are limited to %d bytes but %d bytes value provided",
                    std::numeric_limits<uint16_t>::max(),
-                    b->size()));
+                    b->size_bytes()));
        }

        buffers.insert(buffers.end(), std::move(to_bytes(*b)));
@@ -269,7 +271,7 @@ sets::adder::do_add(mutation& m, const clustering_key_prefix& row_key, const upd
        }

        for (auto&& e : set_value->_elements) {
-            mut.cells.emplace_back(e, params.make_cell(*set_type->value_comparator(), {}, atomic_cell::collection_member::yes));
+            mut.cells.emplace_back(e, params.make_cell(*set_type->value_comparator(), bytes_view(), atomic_cell::collection_member::yes));
        }
        auto smut = set_type->serialize_mutation_form(mut);

@@ -279,7 +281,7 @@ sets::adder::do_add(mutation& m, const clustering_key_prefix& row_key, const upd
        auto v = set_type->serialize_partially_deserialized_form(
                {set_value->_elements.begin(), set_value->_elements.end()},
                cql_serialization_format::internal());
-        m.set_cell(row_key, column, params.make_cell(*column.type, std::move(v)));
+        m.set_cell(row_key, column, params.make_cell(*column.type, fragmented_temporary_buffer::view(v)));
    } else {
        m.set_cell(row_key, column, params.make_dead_cell());
    }
--- a/cql3/sets.hh
+++ b/cql3/sets.hh
@@ -78,7 +78,7 @@ public:
        value(std::set<bytes, serialized_compare> elements)
                : _elements(std::move(elements)) {
        }
-        static value from_serialized(bytes_view v, set_type type, cql_serialization_format sf);
+        static value from_serialized(const fragmented_temporary_buffer::view& v, set_type type, cql_serialization_format sf);
        virtual cql3::raw_value get(const query_options& options) override;
        virtual bytes get_with_protocol_version(cql_serialization_format sf) override;
        bool equals(set_type st, const value& v);
--- a/cql3/single_column_relation.cc
+++ b/cql3/single_column_relation.cc
@@ -101,13 +101,6 @@ single_column_relation::to_receivers(schema_ptr schema, const column_definition&
    }

    if (is_IN()) {
-        // For partition keys we only support IN for the last name so far
-        if (column_def.is_partition_key() && !schema->is_last_partition_key(column_def)) {
-            throw exceptions::invalid_request_exception(sprint(
-                "Partition KEY part %s cannot be restricted by IN relation (only the last part of the partition key can)",
-                column_def.name_as_text()));
-        }
-
        // We only allow IN on the row key and the clustering key so far, never on non-PK columns, and this even if
        // there's an index
        // Note: for backward compatibility reason, we conside a IN of 1 value the same as a EQ, so we let that
--- a/cql3/statements/alter_table_statement.cc
+++ b/cql3/statements/alter_table_statement.cc
@@ -246,18 +246,22 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a

        cfm.with_column(column_name->name(), type, _is_static ? column_kind::static_column : column_kind::regular_column);

-        // Adding a column to a table which has an include all view requires the column to be added to the view
-        // as well. If the view has a regular base column in its PK, then the column ID needs to be updated in
-        // view_info; for that, rebuild the schema.
+        // Adding a column to a base table always requires updating the view
+        // schemas: If the view includes all columns it should include the new
+        // column, but if it doesn't, it may need to include the new
+        // unselected column as a virtual column. The case when we
+        // shouldn't add a virtual column is when the view has in its PK one
+        // of the base's regular columns - but even in this case we need to
+        // rebuild the view schema, to update the column ID.
        if (!_is_static) {
            for (auto&& view : cf.views()) {
-                if (view->view_info()->include_all_columns() || view->view_info()->base_non_pk_column_in_view_pk()) {
-                    schema_builder builder(view);
-                    if (view->view_info()->include_all_columns()) {
-                        builder.with_column(column_name->name(), type);
-                    }
-                    view_updates.push_back(view_ptr(builder.build()));
+                schema_builder builder(view);
+                if (view->view_info()->include_all_columns()) {
+                    builder.with_column(column_name->name(), type);
+                } else if (!view->view_info()->base_non_pk_column_in_view_pk()) {
+                    db::view::create_virtual_column(builder, column_name->name(), type);
                }
+                view_updates.push_back(view_ptr(builder.build()));
            }
        }

@@ -272,7 +276,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a

        auto type = validate_alter(schema, *def, *validator);
        // In any case, we update the column definition
-        cfm.with_altered_column_type(column_name->name(), type);
+        cfm.alter_column_type(column_name->name(), type);

        // We also have to validate the view types here. If we have a view which includes a column as part of
        // the clustering key, we need to make sure that it is indeed compatible.
@@ -281,7 +285,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
            if (view_def) {
                schema_builder builder(view);
                auto view_type = validate_alter(view, *view_def, *validator);
-                builder.with_altered_column_type(column_name->name(), std::move(view_type));
+                builder.alter_column_type(column_name->name(), std::move(view_type));
                view_updates.push_back(view_ptr(builder.build()));
            }
        }
@@ -302,7 +306,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
        } else {
            for (auto&& column_def : boost::range::join(schema->static_columns(), schema->regular_columns())) { // find
                if (column_def.name() == column_name->name()) {
-                    cfm.without_column(column_name->name());
+                    cfm.remove_column(column_name->name());
                    break;
                }
            }
@@ -345,9 +349,10 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
            auto to = entry.second->prepare_column_identifier(schema);

            validate_column_rename(db, *schema, *from, *to);
-            cfm.with_column_rename(from->name(), to->name());
+            cfm.rename_column(from->name(), to->name());

-            // If the view includes a renamed column, it must be renamed in the view table and the definition.
+            // If the view includes a renamed column, it must be renamed in
+            // the view table and the definition.
            for (auto&& view : cf.views()) {
                if (view->get_column_definition(from->name())) {
                    schema_builder builder(view);
@@ -355,7 +360,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
                    auto view_from = entry.first->prepare_column_identifier(view);
                    auto view_to = entry.second->prepare_column_identifier(view);
                    validate_column_rename(db, *view, *view_from, *view_to);
-                    builder.with_column_rename(view_from->name(), view_to->name());
+                    builder.rename_column(view_from->name(), view_to->name());

                    auto new_where = util::rename_column_in_where_clause(
                            view->view_info()->where_clause(),
--- a/cql3/statements/alter_type_statement.cc
+++ b/cql3/statements/alter_type_statement.cc
@@ -110,7 +110,7 @@ void alter_type_statement::do_announce_migration(database& db, ::keyspace& ks, b
            if (t_opt) {
                modified = true;
                // We need to update this column
-                cfm.with_altered_column_type(column.name(), *t_opt);
+                cfm.alter_column_type(column.name(), *t_opt);
            }
        }
        if (modified) {
@@ -165,7 +165,7 @@ alter_type_statement::add_or_alter::add_or_alter(const ut_name& name, bool is_ad
 user_type alter_type_statement::add_or_alter::do_add(database& db, user_type to_update) const
 {
    if (get_idx_of_field(to_update, _field_name)) {
-        throw exceptions::invalid_request_exception(sprint("Cannot add new field %s to type %s: a field of the same name already exists", _field_name->name(), _name.to_string()));
+        throw exceptions::invalid_request_exception(sprint("Cannot add new field %s to type %s: a field of the same name already exists", _field_name->to_string(), _name.to_string()));
    }

    std::vector<bytes> new_names(to_update->field_names());
@@ -173,7 +173,7 @@ user_type alter_type_statement::add_or_alter::do_add(database& db, user_type to_
    std::vector<data_type> new_types(to_update->field_types());
    auto&& add_type = _field_type->prepare(db, keyspace())->get_type();
    if (add_type->references_user_type(to_update->_keyspace, to_update->_name)) {
-        throw exceptions::invalid_request_exception(sprint("Cannot add new field %s of type %s to type %s as this would create a circular reference", _field_name->name(), _field_type->to_string(), _name.to_string()));
+        throw exceptions::invalid_request_exception(sprint("Cannot add new field %s of type %s to type %s as this would create a circular reference", _field_name->to_string(), _field_type->to_string(), _name.to_string()));
    }
    new_types.push_back(std::move(add_type));
    return user_type_impl::get_instance(to_update->_keyspace, to_update->_name, std::move(new_names), std::move(new_types));
@@ -183,13 +183,13 @@ user_type alter_type_statement::add_or_alter::do_alter(database& db, user_type t
 {
    stdx::optional<uint32_t> idx = get_idx_of_field(to_update, _field_name);
    if (!idx) {
-        throw exceptions::invalid_request_exception(sprint("Unknown field %s in type %s", _field_name->name(), _name.to_string()));
+        throw exceptions::invalid_request_exception(sprint("Unknown field %s in type %s", _field_name->to_string(), _name.to_string()));
    }

    auto previous = to_update->field_types()[*idx];
    auto new_type = _field_type->prepare(db, keyspace())->get_type();
    if (!new_type->is_compatible_with(*previous)) {
-        throw exceptions::invalid_request_exception(sprint("Type %s in incompatible with previous type %s of field %s in user type %s", _field_type->to_string(), previous->as_cql3_type()->to_string(), _field_name->name(), _name.to_string()));
+        throw exceptions::invalid_request_exception(sprint("Type %s in incompatible with previous type %s of field %s in user type %s", _field_type->to_string(), previous->as_cql3_type()->to_string(), _field_name->to_string(), _name.to_string()));
    }

    std::vector<data_type> new_types(to_update->field_types());
--- a/cql3/statements/batch_statement.cc
+++ b/cql3/statements/batch_statement.cc
@@ -191,20 +191,20 @@ const std::vector<batch_statement::single_statement>& batch_statement::get_state
    return _statements;
 }

-future<std::vector<mutation>> batch_statement::get_mutations(service::storage_proxy& storage, const query_options& options, bool local, api::timestamp_type now, tracing::trace_state_ptr trace_state) {
+future<std::vector<mutation>> batch_statement::get_mutations(service::storage_proxy& storage, const query_options& options, db::timeout_clock::time_point timeout, bool local, api::timestamp_type now, tracing::trace_state_ptr trace_state) {
    // Do not process in parallel because operations like list append/prepend depend on execution order.
    using mutation_set_type = std::unordered_set<mutation, mutation_hash_by_key, mutation_equals_by_key>;
-    return do_with(mutation_set_type(), [this, &storage, &options, now, local, trace_state] (auto& result) {
+    return do_with(mutation_set_type(), [this, &storage, &options, timeout, now, local, trace_state] (auto& result) {
        result.reserve(_statements.size());
        _stats.statements_in_batches += _statements.size();
        return do_for_each(boost::make_counting_iterator<size_t>(0),
                           boost::make_counting_iterator<size_t>(_statements.size()),
-                           [this, &storage, &options, now, local, &result, trace_state] (size_t i) {
+                           [this, &storage, &options, now, local, &result, timeout, trace_state] (size_t i) {
            auto&& statement = _statements[i].statement;
            statement->inc_cql_stats();
            auto&& statement_options = options.for_statement(i);
            auto timestamp = _attrs->get_timestamp(now, statement_options);
-            return statement->get_mutations(storage, statement_options, local, timestamp, trace_state).then([&result] (auto&& more) {
+            return statement->get_mutations(storage, statement_options, timeout, local, timestamp, trace_state).then([&result] (auto&& more) {
                for (auto&& m : more) {
                    // We want unordered_set::try_emplace(), but we don't have it
                    auto pos = result.find(m);
@@ -293,8 +293,9 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
        return execute_with_conditions(storage, options, query_state);
    }

-    return get_mutations(storage, options, local, now, query_state.get_trace_state()).then([this, &storage, &options, tr_state = query_state.get_trace_state()] (std::vector<mutation> ms) mutable {
-        return execute_without_conditions(storage, std::move(ms), options.get_consistency(), std::move(tr_state));
+    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
+    return get_mutations(storage, options, timeout, local, now, query_state.get_trace_state()).then([this, &storage, &options, timeout, tr_state = query_state.get_trace_state()] (std::vector<mutation> ms) mutable {
+        return execute_without_conditions(storage, std::move(ms), options.get_consistency(), timeout, std::move(tr_state));
    }).then([] {
        return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(
                make_shared<cql_transport::messages::result_message::void_message>());
@@ -305,6 +306,7 @@ future<> batch_statement::execute_without_conditions(
        service::storage_proxy& storage,
        std::vector<mutation> mutations,
        db::consistency_level cl,
+        db::timeout_clock::time_point timeout,
        tracing::trace_state_ptr tr_state)
 {
    // FIXME: do we need to do this?
@@ -332,7 +334,7 @@ future<> batch_statement::execute_without_conditions(
            mutate_atomic = false;
        }
    }
-    return storage.mutate_with_triggers(std::move(mutations), cl, mutate_atomic, std::move(tr_state));
+    return storage.mutate_with_triggers(std::move(mutations), cl, timeout, mutate_atomic, std::move(tr_state));
 }

 future<shared_ptr<cql_transport::messages::result_message>> batch_statement::execute_with_conditions(
--- a/cql3/statements/batch_statement.hh
+++ b/cql3/statements/batch_statement.hh
@@ -125,7 +125,7 @@ public:

    const std::vector<single_statement>& get_statements();
 private:
-    future<std::vector<mutation>> get_mutations(service::storage_proxy& storage, const query_options& options, bool local, api::timestamp_type now, tracing::trace_state_ptr trace_state);
+    future<std::vector<mutation>> get_mutations(service::storage_proxy& storage, const query_options& options, db::timeout_clock::time_point timeout, bool local, api::timestamp_type now, tracing::trace_state_ptr trace_state);

 public:
    /**
@@ -147,6 +147,7 @@ private:
            service::storage_proxy& storage,
            std::vector<mutation> mutations,
            db::consistency_level cl,
+            db::timeout_clock::time_point timeout,
            tracing::trace_state_ptr tr_state);

    future<shared_ptr<cql_transport::messages::result_message>> execute_with_conditions(
--- a/cql3/statements/create_index_statement.cc
+++ b/cql3/statements/create_index_statement.cc
@@ -88,6 +88,11 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
        throw exceptions::invalid_request_exception("Secondary indexes are not supported on materialized views");
    }

+    if (schema->is_dense()) {
+        throw exceptions::invalid_request_exception(
+                "Secondary indexes are not supported on COMPACT STORAGE tables that have clustering columns");
+    }
+
    std::vector<::shared_ptr<index_target>> targets;
    for (auto& raw_target : _raw_targets) {
        targets.emplace_back(raw_target->prepare(schema));
@@ -109,6 +114,11 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
                    sprint("No column definition found for column %s", *target->column));
        }

+        //NOTICE(sarna): Should be lifted after resolving issue #2963
+        if (cd->is_static()) {
+            throw exceptions::invalid_request_exception("Indexing static columns is not implemented yet.");
+        }
+
        if (cd->type->references_duration()) {
            using request_validations::check_false;
            const auto& ty = *cd->type;
@@ -122,8 +132,7 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
        }

        // Origin TODO: we could lift that limitation
-        if ((schema->is_dense() || !schema->thrift().has_compound_comparator()) &&
-            cd->kind != column_kind::regular_column) {
+        if ((schema->is_dense() || !schema->thrift().has_compound_comparator()) && cd->is_primary_key()) {
            throw exceptions::invalid_request_exception(
                    "Secondary indexes are not supported on PRIMARY KEY columns in COMPACT STORAGE tables");
        }
@@ -137,10 +146,15 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c

        bool is_map = dynamic_cast<const collection_type_impl *>(cd->type.get()) != nullptr
                      && dynamic_cast<const collection_type_impl *>(cd->type.get())->is_map();
-        bool is_frozen_collection = cd->type->is_collection() && !cd->type->is_multi_cell();
+        bool is_collection = cd->type->is_collection();
+        bool is_frozen_collection = is_collection && !cd->type->is_multi_cell();

        if (is_frozen_collection) {
            validate_for_frozen_collection(target);
+        } else if (is_collection) {
+            // NOTICE(sarna): should be lifted after #2962 (indexes on non-frozen collections) is implemented
+            throw exceptions::invalid_request_exception(
+                    sprint("Cannot create secondary index on non-frozen collection column %s", cd->name_as_text()));
        } else {
            validate_not_full_index(target);
            validate_is_values_index_if_target_column_not_collection(cd, target);
--- a/cql3/statements/create_view_statement.cc
+++ b/cql3/statements/create_view_statement.cc
@@ -84,7 +84,6 @@ create_view_statement::create_view_statement(
    , _clustering_keys{clustering_keys}
    , _if_not_exists{if_not_exists}
 {
-    service::get_local_storage_proxy().get_db().local().get_config().check_experimental("Creating materialized views");
    if (!service::get_local_storage_service().cluster_supports_materialized_views()) {
        throw exceptions::invalid_request_exception("Can't create materialized views until the whole cluster has been upgraded");
    }
@@ -275,6 +274,7 @@ future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::a

    std::vector<const column_definition*> missing_pk_columns;
    std::vector<const column_definition*> target_non_pk_columns;
+    std::vector<const column_definition*> unselected_columns;

    // We need to include all of the primary key columns from the base table in order to make sure that we do not
    // overwrite values in the view. We cannot support "collapsing" the base table into a smaller number of rows in
@@ -292,6 +292,9 @@ future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::a
        if (included_def && !def_in_target_pk) {
            target_non_pk_columns.push_back(&def);
        }
+        if (!included_def && !def_in_target_pk && !def.is_static()) {
+            unselected_columns.push_back(&def);
+        }
        if (def.is_primary_key() && !def_in_target_pk) {
            missing_pk_columns.push_back(&def);
        }
@@ -311,6 +314,27 @@ future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::a
        throw exceptions::invalid_request_exception(sprint("No columns are defined for Materialized View other than primary key"));
    }

+    // The unique feature of a filter by a non-key column is that the
+    // value of such column can be updated - and also be expired with TTL
+    // and cause the view row to appear and disappear. We don't currently
+    // support this case - see issue #3430, and neither does
+    // Cassandra - see CASSANDRA-13798 and CASSANDRA-13832.
+    // Actually, as CASSANDRA-13798 explains, the problem is "the liveness of
+    // view row is now depending on multiple base columns (multiple filtered
+    // non-pk base column + base column used in view pk)". When the filtered
+    // column *is* the base column added to the view pk, we don't have this
+    // problem. And this case actually works correctly.
+    auto non_pk_restrictions = restrictions->get_non_pk_restriction();
+    if (non_pk_restrictions.size() == 1 && has_non_pk_column &&
+            std::find(target_primary_keys.begin(), target_primary_keys.end(), non_pk_restrictions.cbegin()->first) != target_primary_keys.end()) {
+        // This case (filter by new PK column of the view) works, as explained above
+    } else if (!non_pk_restrictions.empty()) {
+        auto column_names = ::join(", ", non_pk_restrictions | boost::adaptors::map_keys | boost::adaptors::transformed(std::mem_fn(&column_definition::name_as_text)));
+        throw exceptions::invalid_request_exception(sprint(
+                "Non-primary key columns cannot be restricted in the SELECT statement used for materialized view %s creation (got restrictions on: %s)",
+                column_family(), column_names));
+    }
+
    schema_builder builder{keyspace(), column_family()};
    auto add_columns = [this, &builder] (std::vector<const column_definition*>& defs, column_kind kind) mutable {
        for (auto* def : defs) {
@@ -321,6 +345,19 @@ future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::a
    add_columns(target_partition_keys, column_kind::partition_key);
    add_columns(target_clustering_keys, column_kind::clustering_key);
    add_columns(target_non_pk_columns, column_kind::regular_column);
+    // Add all unselected columns (base-table columns which are not selected
+    // in the view) as "virtual columns" - columns which have timestamp and
+    // ttl information, but an empty value. These are needed to keep view
+    // rows alive when the base row is alive, even if the view row has no
+    // data, just a key (see issue #3362). The virtual columns are not needed
+    // when the view pk adds a regular base column (i.e., has_non_pk_column)
+    // because in that case, the liveness of that base column is what
+    // determines the liveness of the view row.
+    if (!has_non_pk_column) {
+        for (auto* def : unselected_columns) {
+            db::view::create_virtual_column(builder, def->name(), def->type);
+        }
+    }
    _properties.properties()->apply_to_builder(builder, proxy.get_db().local().get_config().extensions());

    if (builder.default_time_to_live().count() > 0) {
--- a/cql3/statements/index_prop_defs.cc
+++ b/cql3/statements/index_prop_defs.cc
@@ -49,7 +49,7 @@ void cql3::statements::index_prop_defs::validate() {
    property_definitions::validate(keywords);

    if (is_custom && !custom_class) {
-        throw exceptions::invalid_request_exception("CUSTOM index requires specifiying the index class");
+        throw exceptions::invalid_request_exception("CUSTOM index requires specifying the index class");
    }

    if (!is_custom && custom_class) {
@@ -64,6 +64,16 @@ void cql3::statements::index_prop_defs::validate() {
                sprint("Cannot specify %s as a CUSTOM option",
                        db::index::secondary_index::custom_index_option_name));
    }
+
+    // Currently, Scylla does not support *any* class of custom index
+    // implementation. If in the future we do (e.g., SASI, or something
+    // new), we'll need to check for valid values here.
+    if (is_custom && custom_class) {
+        throw exceptions::invalid_request_exception(
+                format("Unsupported CUSTOM INDEX class {}. Note that currently, Scylla does not support SASI or any other CUSTOM INDEX class.",
+                        *custom_class));
+
+    }
 }

 index_options_map
--- a/cql3/statements/modification_statement.cc
+++ b/cql3/statements/modification_statement.cc
@@ -160,11 +160,11 @@ future<> modification_statement::check_access(const service::client_state& state
 }

 future<std::vector<mutation>>
-modification_statement::get_mutations(service::storage_proxy& proxy, const query_options& options, bool local, int64_t now, tracing::trace_state_ptr trace_state) {
+modification_statement::get_mutations(service::storage_proxy& proxy, const query_options& options, db::timeout_clock::time_point timeout, bool local, int64_t now, tracing::trace_state_ptr trace_state) {
    auto json_cache = maybe_prepare_json_cache(options);
    auto keys = make_lw_shared(build_partition_keys(options, json_cache));
    auto ranges = make_lw_shared(create_clustering_ranges(options, json_cache));
-    return make_update_parameters(proxy, keys, ranges, options, local, now, std::move(trace_state)).then(
+    return make_update_parameters(proxy, keys, ranges, options, timeout, local, now, std::move(trace_state)).then(
            [this, keys, ranges, now, json_cache = std::move(json_cache)] (auto params_ptr) {
                std::vector<mutation> mutations;
                mutations.reserve(keys->size());
@@ -186,10 +186,11 @@ modification_statement::make_update_parameters(
        lw_shared_ptr<dht::partition_range_vector> keys,
        lw_shared_ptr<query::clustering_row_ranges> ranges,
        const query_options& options,
+        db::timeout_clock::time_point timeout,
        bool local,
        int64_t now,
        tracing::trace_state_ptr trace_state) {
-    return read_required_rows(proxy, *keys, std::move(ranges), local, options, std::move(trace_state)).then(
+    return read_required_rows(proxy, *keys, std::move(ranges), local, options, timeout, std::move(trace_state)).then(
            [this, &options, now] (auto rows) {
                return make_ready_future<std::unique_ptr<update_parameters>>(
                        std::make_unique<update_parameters>(s, options,
@@ -275,6 +276,7 @@ modification_statement::read_required_rows(
        lw_shared_ptr<query::clustering_row_ranges> ranges,
        bool local,
        const query_options& options,
+        db::timeout_clock::time_point timeout,
        tracing::trace_state_ptr trace_state) {
    if (!requires_read()) {
        return make_ready_future<update_parameters::prefetched_rows_type>(
@@ -308,7 +310,6 @@ modification_statement::read_required_rows(
                query::partition_slice::option::collections_as_maps>());
    query::read_command cmd(s->id(), s->version(), ps, std::numeric_limits<uint32_t>::max());
    // FIXME: ignoring "local"
-    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
    return proxy.query(s, make_lw_shared(std::move(cmd)), std::move(keys),
            cl, {timeout, std::move(trace_state)}).then([this, ps] (auto qr) {
        return query::result_view::do_with(*qr.query_result, [&] (query::result_view v) {
@@ -408,12 +409,13 @@ modification_statement::execute_without_condition(service::storage_proxy& proxy,
        db::validate_for_write(s->ks_name(), cl);
    }

-    return get_mutations(proxy, options, false, options.get_timestamp(qs), qs.get_trace_state()).then([this, cl, &proxy, &qs] (auto mutations) {
+    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
+    return get_mutations(proxy, options, timeout, false, options.get_timestamp(qs), qs.get_trace_state()).then([this, cl, timeout, &proxy, &qs] (auto mutations) {
        if (mutations.empty()) {
            return now();
        }

-        return proxy.mutate_with_triggers(std::move(mutations), cl, false, qs.get_trace_state(), this->is_raw_counter_shard_write());
+        return proxy.mutate_with_triggers(std::move(mutations), cl, timeout, false, qs.get_trace_state(), this->is_raw_counter_shard_write());
    });
 }

--- a/cql3/statements/modification_statement.hh
+++ b/cql3/statements/modification_statement.hh
@@ -206,6 +206,7 @@ protected:
                lw_shared_ptr<query::clustering_row_ranges> ranges,
                bool local,
                const query_options& options,
+                db::timeout_clock::time_point now,
                tracing::trace_state_ptr trace_state);
 private:
    future<::shared_ptr<cql_transport::messages::result_message>>
@@ -349,7 +350,7 @@ public:
     * @return vector of the mutations
     * @throws invalid_request_exception on invalid requests
     */
-    future<std::vector<mutation>> get_mutations(service::storage_proxy& proxy, const query_options& options, bool local, int64_t now, tracing::trace_state_ptr trace_state);
+    future<std::vector<mutation>> get_mutations(service::storage_proxy& proxy, const query_options& options, db::timeout_clock::time_point timeout, bool local, int64_t now, tracing::trace_state_ptr trace_state);

 public:
    future<std::unique_ptr<update_parameters>> make_update_parameters(
@@ -357,6 +358,7 @@ public:
                lw_shared_ptr<dht::partition_range_vector> keys,
                lw_shared_ptr<query::clustering_row_ranges> ranges,
                const query_options& options,
+                db::timeout_clock::time_point timeout,
                bool local,
                int64_t now,
                tracing::trace_state_ptr trace_state);
--- a/cql3/statements/raw/insert_statement.hh
+++ b/cql3/statements/raw/insert_statement.hh
@@ -87,6 +87,7 @@ private:
    ::shared_ptr<attributes::raw> _attrs;
    ::shared_ptr<term::raw> _json_value;
    bool _if_not_exists;
+    bool _default_unset;
 public:
    /**
     * A parsed <code>INSERT JSON</code> statement.
@@ -95,7 +96,7 @@ public:
     * @param json_value JSON string representing names and values
     * @param attrs additional attributes for statement (CL, timestamp, timeToLive)
     */
-    insert_json_statement(::shared_ptr<cf_name> name, ::shared_ptr<attributes::raw> attrs, ::shared_ptr<term::raw> json_value, bool if_not_exists);
+    insert_json_statement(::shared_ptr<cf_name> name, ::shared_ptr<attributes::raw> attrs, ::shared_ptr<term::raw> json_value, bool if_not_exists, bool default_unset);

    virtual ::shared_ptr<cql3::statements::modification_statement> prepare_internal(database& db, schema_ptr schema,
                ::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs, cql_stats& stats) override;
--- a/cql3/statements/raw/select_statement.hh
+++ b/cql3/statements/raw/select_statement.hh
@@ -118,7 +118,8 @@ private:
        schema_ptr schema,
        ::shared_ptr<variable_specifications> bound_names,
        ::shared_ptr<selection::selection> selection,
-        bool for_view = false);
+        bool for_view = false,
+        bool allow_filtering = false);

    /** Returns a ::shared_ptr<term> for the limit or null if no limit is set */
    ::shared_ptr<term> prepare_limit(database& db, ::shared_ptr<variable_specifications> bound_names);
@@ -140,6 +141,10 @@ private:
    /** If ALLOW FILTERING was not specified, this verifies that it is not needed */
    void check_needs_filtering(::shared_ptr<restrictions::statement_restrictions> restrictions);

+    void ensure_filtering_columns_retrieval(database& db,
+                                            ::shared_ptr<selection::selection> selection,
+                                            ::shared_ptr<restrictions::statement_restrictions> restrictions);
+
    bool contains_alias(::shared_ptr<column_identifier> name);

    ::shared_ptr<column_specification> limit_receiver();
--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -45,6 +45,7 @@
 #include "transport/messages/result_message.hh"
 #include "cql3/selection/selection.hh"
 #include "cql3/util.hh"
+#include "cql3/restrictions/single_column_primary_key_restrictions.hh"
 #include "core/shared_ptr.hh"
 #include "query-result-reader.hh"
 #include "query_result_merger.hh"
@@ -312,13 +313,14 @@ select_statement::make_partition_slice(const query_options& options)
    if (_is_reversed) {
        _opts.set(query::partition_slice::option::reversed);
        std::reverse(bounds.begin(), bounds.end());
+        ++_stats.reverse_queries;
    }
    return query::partition_slice(std::move(bounds),
        std::move(static_columns), std::move(regular_columns), _opts, nullptr, options.get_cql_serialization_format());
 }

 int32_t select_statement::get_limit(const query_options& options) const {
-    if (!_limit) {
+    if (!_limit || _selection->is_aggregate()) {
        return std::numeric_limits<int32_t>::max();
    }

@@ -329,9 +331,10 @@ int32_t select_statement::get_limit(const query_options& options) const {
    if (val.is_unset_value()) {
        return std::numeric_limits<int32_t>::max();
    }
+  return with_linearized(*val, [&] (bytes_view bv) {
    try {
-        int32_type->validate(*val);
-        auto l = value_cast<int32_t>(int32_type->deserialize(*val));
+        int32_type->validate(bv);
+        auto l = value_cast<int32_t>(int32_type->deserialize(bv));
        if (l <= 0) {
            throw exceptions::invalid_request_exception("LIMIT must be strictly positive");
        }
@@ -339,6 +342,7 @@ int32_t select_statement::get_limit(const query_options& options) const {
    } catch (const marshal_exception& e) {
        throw exceptions::invalid_request_exception("Invalid limit value");
    }
+  });
 }

 bool select_statement::needs_post_query_ordering() const {
@@ -379,45 +383,54 @@ select_statement::do_execute(service::storage_proxy& proxy,
    int32_t limit = get_limit(options);
    auto now = gc_clock::now();

+    const bool restrictions_need_filtering = _restrictions->need_filtering();
    ++_stats.reads;
+    _stats.filtered_reads += restrictions_need_filtering;

    auto command = ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(),
        make_partition_slice(options), limit, now, tracing::make_trace_info(state.get_trace_state()), query::max_partitions, utils::UUID(), options.get_timestamp(state));

    int32_t page_size = options.get_page_size();

+    _stats.unpaged_select_queries += page_size <= 0;
+
    // An aggregation query will never be paged for the user, but we always page it internally to avoid OOM.
    // If we user provided a page_size we'll use that to page internally (because why not), otherwise we use our default
    // Note that if there are some nodes in the cluster with a version less than 2.0, we can't use paging (CASSANDRA-6707).
-    auto aggregate = _selection->is_aggregate();
-    if (aggregate && page_size <= 0) {
+    const bool aggregate = _selection->is_aggregate();
+    const bool nonpaged_filtering = restrictions_need_filtering && page_size <= 0;
+    if (aggregate || nonpaged_filtering) {
        page_size = DEFAULT_COUNT_PAGE_SIZE;
    }

    auto key_ranges = _restrictions->get_partition_key_ranges(options);

-    if (!aggregate && (page_size <= 0
-            || !service::pager::query_pagers::may_need_paging(page_size,
+    if (!aggregate && !restrictions_need_filtering && (page_size <= 0
+            || !service::pager::query_pagers::may_need_paging(*_schema, page_size,
                    *command, key_ranges))) {
        return execute(proxy, command, std::move(key_ranges), state, options, now);
    }

    command->slice.options.set<query::partition_slice::option::allow_short_read>();
-    auto timeout = options.get_timeout_config().*get_timeout_config_selector();
+    auto timeout_duration = options.get_timeout_config().*get_timeout_config_selector();
    auto p = service::pager::query_pagers::pager(_schema, _selection,
-            state, options, timeout, command, std::move(key_ranges));
+            state, options, command, std::move(key_ranges), _stats, restrictions_need_filtering ? _restrictions : nullptr);

-    if (aggregate) {
+    if (aggregate || nonpaged_filtering) {
        return do_with(
                cql3::selection::result_set_builder(*_selection, now,
                        options.get_cql_serialization_format()),
-                [this, p, page_size, now](auto& builder) {
+                [this, p, page_size, now, timeout_duration, restrictions_need_filtering](auto& builder) {
                    return do_until([p] {return p->is_exhausted();},
-                            [p, &builder, page_size, now] {
-                                return p->fetch_page(builder, page_size, now);
+                            [p, &builder, page_size, now, timeout_duration] {
+                                auto timeout = db::timeout_clock::now() + timeout_duration;
+                                return p->fetch_page(builder, page_size, now, timeout);
                            }
-                    ).then([this, &builder] {
+                    ).then([this, &builder, restrictions_need_filtering] {
                                auto rs = builder.build();
+                                if (restrictions_need_filtering) {
+                                    _stats.filtered_rows_matched_total += rs->size();
+                                }
                                update_stats_rows_read(rs->size());
                                auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
                                return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
@@ -431,12 +444,18 @@ select_statement::do_execute(service::storage_proxy& proxy,
                        " you must either remove the ORDER BY or the IN and sort client side, or disable paging for this query");
    }

-    if (_selection->is_trivial()) {
-        return p->fetch_page_generator(page_size, now, _stats).then([this, p, limit] (result_generator generator) {
-            auto meta = make_shared<metadata>(*_selection->get_result_metadata());
-            if (!p->is_exhausted()) {
-                meta->set_has_more_pages(p->state());
-            }
+    auto timeout = db::timeout_clock::now() + timeout_duration;
+    if (_selection->is_trivial() && !restrictions_need_filtering) {
+        return p->fetch_page_generator(page_size, now, timeout, _stats).then([this, p, limit] (result_generator generator) {
+            auto meta = [&] () -> shared_ptr<const cql3::metadata> {
+                if (!p->is_exhausted()) {
+                    auto meta = make_shared<metadata>(*_selection->get_result_metadata());
+                    meta->set_paging_state(p->state());
+                    return meta;
+                } else {
+                    return _selection->get_result_metadata();
+                }
+            }();

            return shared_ptr<cql_transport::messages::result_message>(
                make_shared<cql_transport::messages::result_message::rows>(result(std::move(generator), std::move(meta)))
@@ -444,19 +463,220 @@ select_statement::do_execute(service::storage_proxy& proxy,
        });
    }

-    return p->fetch_page(page_size, now).then(
-            [this, p, &options, limit, now](std::unique_ptr<cql3::result_set> rs) {
+    return p->fetch_page(page_size, now, timeout).then(
+            [this, p, &options, now, restrictions_need_filtering](std::unique_ptr<cql3::result_set> rs) {

                if (!p->is_exhausted()) {
-                    rs->get_metadata().set_has_more_pages(p->state());
+                    rs->get_metadata().set_paging_state(p->state());
                }

+                if (restrictions_need_filtering) {
+                    _stats.filtered_rows_matched_total += rs->size();
+                }
                update_stats_rows_read(rs->size());
                auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
                return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
            });
 }

+template<typename KeyType> // Rebuilds a base-table key (partition or clustering, selected by KeyType) from the components of an index-view primary key.
+GCC6_CONCEPT(
+    requires (std::is_same_v<KeyType, partition_key> || std::is_same_v<KeyType, clustering_key_prefix>)
+)
+static KeyType
+generate_base_key_from_index_pk(const partition_key& index_pk, const clustering_key& index_ck, const schema& base_schema, const schema& view_schema) {
+    const auto& base_columns = std::is_same_v<KeyType, partition_key> ? base_schema.partition_key_columns() : base_schema.clustering_key_columns(); // the base key columns that make up a KeyType
+    std::vector<bytes_view> exploded_base_key;
+    exploded_base_key.reserve(base_columns.size());
+
+    for (const column_definition& base_col : base_columns) {
+        const column_definition* view_col = view_schema.view_info()->view_column(base_col); // NOTE(review): dereferenced unchecked below - presumably every base key column exists in the view; confirm
+        if (view_col->is_partition_key()) { // take the component from whichever part of the view key holds this column
+            exploded_base_key.push_back(index_pk.get_component(view_schema, view_col->id));
+        } else {
+            exploded_base_key.push_back(index_ck.get_component(view_schema, view_col->id));
+        }
+    }
+    return KeyType::from_range(exploded_base_key); // components were collected in base key column order
+}
+
+lw_shared_ptr<query::read_command> // Builds the read command used to query the base table (_schema) for this statement.
+indexed_table_select_statement::prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging) {
+    lw_shared_ptr<query::read_command> cmd = ::make_lw_shared<query::read_command>(
+            _schema->id(),
+            _schema->version(),
+            make_partition_slice(options),
+            get_limit(options),
+            now,
+            tracing::make_trace_info(state.get_trace_state()),
+            query::max_partitions,
+            utils::UUID(),
+            options.get_timestamp(state));
+    if (use_paging) { // paged queries additionally allow short reads and ask for the keys to be sent back
+        cmd->slice.options.set<query::partition_slice::option::allow_short_read>();
+        cmd->slice.options.set<query::partition_slice::option::send_partition_key>();
+        if (_schema->clustering_key_size() > 0) { // clustering keys are only requested when the base table has a clustering key
+            cmd->slice.options.set<query::partition_slice::option::send_clustering_key>();
+        }
+    }
+    return cmd;
+}
+
+future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>
+indexed_table_select_statement::do_execute_base_query( // Reads base-table rows for the given partition ranges, batch by batch; resolves to the merged result plus the command that was used.
+        service::storage_proxy& proxy,
+        dht::partition_range_vector&& partition_ranges,
+        service::query_state& state,
+        const query_options& options,
+        gc_clock::time_point now,
+        ::shared_ptr<const service::pager::paging_state> paging_state) {
+    auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state));
+    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector(); // one deadline, shared by every batch issued below
+    dht::partition_range_vector per_vnode_ranges;
+    per_vnode_ranges.reserve(partition_ranges.size());
+    for (auto& pr : partition_ranges) { // presumably splits each range per vnode (hence the name per_vnode_ranges) - confirm against get_restricted_ranges()
+        auto restricted_ranges = proxy.get_restricted_ranges(*_schema, pr);
+        std::move(restricted_ranges.begin(), restricted_ranges.end(), std::back_inserter(per_vnode_ranges));
+    }
+
+    struct base_query_state {
+        query::result_merger merger;
+        dht::partition_range_vector per_vnode_ranges;
+        dht::partition_range_vector::iterator current_partition_range;
+        base_query_state(uint32_t row_limit, dht::partition_range_vector&& ranges)
+                : merger(row_limit * ranges.size(), query::max_partitions) // NOTE(review): the product is wider than uint32_t - confirm it cannot exceed what result_merger accepts
+                , per_vnode_ranges(std::move(ranges))
+                , current_partition_range(per_vnode_ranges.begin())
+                {}
+        base_query_state(base_query_state&&) = default;
+        base_query_state(const base_query_state&) = delete;
+    };
+
+    base_query_state query_state{cmd->row_limit, std::move(per_vnode_ranges)};
+    return do_with(std::move(query_state), [this, &proxy, &state, &options, cmd, timeout] (auto&& query_state) {
+        auto &merger = query_state.merger;
+        auto &ranges = query_state.per_vnode_ranges;
+        auto &range_it = query_state.current_partition_range;
+        return repeat([this, &ranges, &range_it, &merger, &proxy, &state, &options, cmd, timeout]() {
+            // Starting with 1 range, we check if the result was a short read, and if not,
+            // we continue exponentially, asking for 2x more ranges than before
+            auto range_it_end = range_it + std::min(std::distance(ranges.begin(), range_it) + 1, std::distance(range_it, ranges.end())); // clamp the step count, never form an iterator past end()
+            dht::partition_range_vector prange(range_it, range_it_end);
+            auto command = ::make_lw_shared<query::read_command>(*cmd);
+            auto old_paging_state = options.get_paging_state();
+            if (old_paging_state && range_it == ranges.begin()) { // only the very first batch resumes from the position recorded in the paging state
+                auto base_pk = generate_base_key_from_index_pk<partition_key>(old_paging_state->get_partition_key(),
+                        *old_paging_state->get_clustering_key(), *_schema, *_view_schema);
+                auto base_ck = generate_base_key_from_index_pk<clustering_key>(old_paging_state->get_partition_key(),
+                        *old_paging_state->get_clustering_key(), *_schema, *_view_schema);
+                command->slice.set_range(*_schema, base_pk,
+                        std::vector<query::clustering_range>{query::clustering_range::make_starting_with(range_bound<clustering_key>(base_ck, false))});
+            }
+            return proxy.query(_schema, command, std::move(prange), options.get_consistency(), {timeout, state.get_trace_state()})
+            .then([&range_it, range_it_end = std::move(range_it_end), &ranges, &merger] (service::storage_proxy::coordinator_query_result qr) {
+                bool is_short_read = qr.query_result->is_short_read();
+                merger(std::move(qr.query_result));
+                range_it = range_it_end;
+                return stop_iteration(is_short_read || range_it == ranges.end()); // stop on a short read or once every range has been queried
+            });
+        }).then([&merger]() {
+            return merger.get();
+        });
+    }).then([cmd] (foreign_ptr<lw_shared_ptr<query::result>> result) mutable {
+        return make_ready_future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>(std::move(result), std::move(cmd));
+    });
+}
+
+future<shared_ptr<cql_transport::messages::result_message>>
+indexed_table_select_statement::execute_base_query( // Queries the base table for partition_ranges and turns the merged result into a result message.
+        service::storage_proxy& proxy,
+        dht::partition_range_vector&& partition_ranges,
+        service::query_state& state,
+        const query_options& options,
+        gc_clock::time_point now,
+        ::shared_ptr<const service::pager::paging_state> paging_state) {
+    return do_execute_base_query(proxy, std::move(partition_ranges), state, options, now, paging_state).then( // paging_state is copied into the call; the continuation below takes over the original
+            [this, &proxy, &state, &options, now, paging_state = std::move(paging_state)] (foreign_ptr<lw_shared_ptr<query::result>> result, lw_shared_ptr<query::read_command> cmd) {
+        return process_base_query_results(std::move(result), std::move(cmd), proxy, state, options, now, std::move(paging_state)); // the capture is const in this non-mutable lambda, so this std::move still copies
+    });
+}
+
+future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>
+indexed_table_select_statement::do_execute_base_query( // Reads base-table rows for the given primary keys, batch by batch; resolves to the merged result plus the command that was used.
+        service::storage_proxy& proxy,
+        std::vector<primary_key>&& primary_keys,
+        service::query_state& state,
+        const query_options& options,
+        gc_clock::time_point now,
+        ::shared_ptr<const service::pager::paging_state> paging_state) {
+    auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state));
+    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector(); // one deadline, shared by every query issued below
+
+    struct base_query_state {
+        query::result_merger merger;
+        std::vector<primary_key> primary_keys;
+        std::vector<primary_key>::iterator current_primary_key;
+        base_query_state(uint32_t row_limit, std::vector<primary_key>&& keys)
+                : merger(row_limit, query::max_partitions)
+                , primary_keys(std::move(keys))
+                , current_primary_key(primary_keys.begin())
+                {}
+        base_query_state(base_query_state&&) = default;
+        base_query_state(const base_query_state&) = delete;
+    };
+
+    base_query_state query_state{cmd->row_limit, std::move(primary_keys)};
+    return do_with(std::move(query_state), [this, &proxy, &state, &options, cmd, timeout] (auto&& query_state) {
+        auto &merger = query_state.merger;
+        auto &keys = query_state.primary_keys;
+        auto &key_it = query_state.current_primary_key;
+        return repeat([this, &keys, &key_it, &merger, &proxy, &state, &options, cmd, timeout]() {
+            // Starting with 1 key, we check if the result was a short read, and if not,
+            // we continue exponentially, asking for 2x more keys than before
+            auto key_it_end = key_it + std::min(std::distance(keys.begin(), key_it) + 1, std::distance(key_it, keys.end())); // clamp the step count, never form an iterator past end()
+            // No copy of cmd is made at this level: every key builds its own command in the map function below.
+
+            query::result_merger oneshot_merger(cmd->row_limit, query::max_partitions); // merges the per-key results of this batch only
+            return map_reduce(key_it, key_it_end, [this, &proxy, &state, &options, cmd, timeout] (auto& key) {
+                auto command = ::make_lw_shared<query::read_command>(*cmd);
+                // for each partition, read just one clustering row (TODO: can
+                // get all needed rows of one partition at once.)
+                command->slice._row_ranges.clear();
+                if (key.clustering) {
+                    command->slice._row_ranges.push_back(query::clustering_range::make_singular(key.clustering));
+                }
+                return proxy.query(_schema, command, {dht::partition_range::make_singular(key.partition)}, options.get_consistency(), {timeout, state.get_trace_state()})
+                .then([] (service::storage_proxy::coordinator_query_result qr) {
+                    return std::move(qr.query_result);
+                });
+            }, std::move(oneshot_merger)).then([&key_it, key_it_end = std::move(key_it_end), &keys, &merger] (foreign_ptr<lw_shared_ptr<query::result>> result) {
+                bool is_short_read = result->is_short_read();
+                merger(std::move(result));
+                key_it = key_it_end;
+                return stop_iteration(is_short_read || key_it == keys.end()); // stop on a short read or once every key has been queried
+            });
+        }).then([&merger] () {
+            return merger.get();
+        }).then([cmd] (foreign_ptr<lw_shared_ptr<query::result>> result) mutable {
+            return make_ready_future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>(std::move(result), std::move(cmd));
+        });
+    });
+}
+
+future<shared_ptr<cql_transport::messages::result_message>>
+indexed_table_select_statement::execute_base_query( // Queries the base table for primary_keys and turns the merged result into a result message.
+        service::storage_proxy& proxy,
+        std::vector<primary_key>&& primary_keys,
+        service::query_state& state,
+        const query_options& options,
+        gc_clock::time_point now,
+        ::shared_ptr<const service::pager::paging_state> paging_state) {
+    return do_execute_base_query(proxy, std::move(primary_keys), state, options, now, paging_state).then( // paging_state is copied into the call; the continuation below takes over the original
+            [this, &proxy, &state, &options, now, paging_state = std::move(paging_state)] (foreign_ptr<lw_shared_ptr<query::result>> result, lw_shared_ptr<query::read_command> cmd) {
+        return process_base_query_results(std::move(result), std::move(cmd), proxy, state, options, now, std::move(paging_state)); // the capture is const in this non-mutable lambda, so this std::move still copies
+    });
+}
+
 future<shared_ptr<cql_transport::messages::result_message>>
 select_statement::execute(service::storage_proxy& proxy,
                          lw_shared_ptr<query::read_command> cmd,
@@ -496,52 +716,21 @@ select_statement::execute(service::storage_proxy& proxy,
    }
 }

-// Function for fetching the selected columns from a list of clustering rows.
-// It is currently used only in our Secondary Index implementation - ordinary
-// CQL SELECT statements do not have the syntax to request a list of rows.
-// FIXME: The current implementation is very inefficient - it requests each
-// row separately (and all in parallel). Even multiple rows from a single
-// partition are requested separately. This last case can be easily improved,
-// but to implement the general case (multiple rows from multiple partitions)
-// efficiently, we will need more support from other layers.
-// Keys are ordered in token order (see #3423)
-future<shared_ptr<cql_transport::messages::result_message>>
-select_statement::execute(service::storage_proxy& proxy,
-                          lw_shared_ptr<query::read_command> cmd,
-                          std::vector<primary_key>&& primary_keys,
-                          service::query_state& state,
-                          const query_options& options,
-                          gc_clock::time_point now)
+shared_ptr<cql_transport::messages::result_message>
+indexed_table_select_statement::process_base_query_results(
+        foreign_ptr<lw_shared_ptr<query::result>> results,
+        lw_shared_ptr<query::read_command> cmd,
+        service::storage_proxy& proxy,
+        service::query_state& state,
+        const query_options& options,
+        gc_clock::time_point now,
+        ::shared_ptr<const service::pager::paging_state> paging_state)
 {
-    // FIXME: pass the timeout from caller. The query has already started
-    // earlier (with read_posting_list()), not now.
-    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
-    return do_with(std::move(primary_keys), [this, &proxy, &state, &options, cmd, timeout] (auto& keys) {
-        assert(cmd->partition_limit == query::max_partitions);
-        query::result_merger merger(cmd->row_limit, query::max_partitions);
-        // there is no point to produce rows beyond the first row_limit:
-        auto end = keys.size() <= cmd->row_limit ? keys.end() : keys.begin() + cmd->row_limit;
-        return map_reduce(keys.begin(), end, [this, &proxy, &state, &options, cmd, timeout] (auto& key) {
-            auto command = ::make_lw_shared<query::read_command>(*cmd);
-            // for each partition, read just one clustering row (TODO: can
-            // get all needed rows of one partition at once.)
-            command->slice._row_ranges.clear();
-            if (key.clustering) {
-                command->slice._row_ranges.push_back(query::clustering_range::make_singular(key.clustering));
-            }
-            return proxy.query(_schema,
-                    command,
-                    {dht::partition_range::make_singular(key.partition)},
-                    options.get_consistency(),
-                    {timeout, state.get_trace_state()}).then([] (service::storage_proxy::coordinator_query_result qr) {
-                return std::move(qr.query_result);
-            });
-        }, std::move(merger));
-    }).then([this, &options, now, cmd] (auto result) {
-        // note that cmd here still has the garbage clustering range in slice,
-        // but process_results() ignores this part of the slice setting.
-        return this->process_results(std::move(result), cmd, options, now);
-    });
+    if (paging_state) {
+        paging_state = generate_view_paging_state_from_base_query_results(paging_state, results, proxy, state, options);
+        _selection->get_result_metadata()->maybe_set_paging_state(std::move(paging_state));
+    }
+    return process_results(std::move(results), std::move(cmd), options, now);
 }

 shared_ptr<cql_transport::messages::result_message>
@@ -550,7 +739,8 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
                                  const query_options& options,
                                  gc_clock::time_point now)
 {
-    bool fast_path = !needs_post_query_ordering() && _selection->is_trivial();
+    const bool restrictions_need_filtering = _restrictions->need_filtering();
+    const bool fast_path = !needs_post_query_ordering() && _selection->is_trivial() && !restrictions_need_filtering;
    if (fast_path) {
        return make_shared<cql_transport::messages::result_message::rows>(result(
            result_generator(_schema, std::move(results), std::move(cmd), _selection, _stats),
@@ -560,9 +750,17 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu

    cql3::selection::result_set_builder builder(*_selection, now,
            options.get_cql_serialization_format());
-    query::result_view::consume(*results, cmd->slice,
-            cql3::selection::result_set_builder::visitor(builder, *_schema,
-                    *_selection));
+    if (restrictions_need_filtering) {
+        results->ensure_counts();
+        _stats.filtered_rows_read_total += *results->row_count();
+        query::result_view::consume(*results, cmd->slice,
+                cql3::selection::result_set_builder::visitor(builder, *_schema,
+                        *_selection, cql3::selection::result_set_builder::restrictions_filter(_restrictions, options, cmd->row_limit)));
+    } else {
+        query::result_view::consume(*results, cmd->slice,
+                cql3::selection::result_set_builder::visitor(builder, *_schema,
+                        *_selection));
+    }
    auto rs = builder.build();

    if (needs_post_query_ordering()) {
@@ -573,6 +771,7 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
        rs->trim(cmd->row_limit);
    }
    update_stats_rows_read(rs->size());
+    _stats.filtered_rows_matched_total += restrictions_need_filtering ? rs->size() : 0;
    return ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
 }

@@ -601,10 +800,16 @@ indexed_table_select_statement::prepare(database& db,
                                        ordering_comparator_type ordering_comparator,
                                        ::shared_ptr<term> limit, cql_stats &stats)
 {
-    auto index_opt = find_idx(db, schema, restrictions);
+    auto& sim = db.find_column_family(schema).get_index_manager();
+    auto index_opt = restrictions->find_idx(sim);
    if (!index_opt) {
        throw std::runtime_error("No index found.");
    }
+
+    const auto& im = index_opt->metadata();
+    sstring index_table_name = im.name() + "_index";
+    schema_ptr view_schema = db.find_schema(schema->ks_name(), index_table_name);
+
    return ::make_shared<cql3::statements::indexed_table_select_statement>(
            schema,
            bound_terms,
@@ -615,28 +820,11 @@ indexed_table_select_statement::prepare(database& db,
            std::move(ordering_comparator),
            limit,
            stats,
-            *index_opt);
+            *index_opt,
+            view_schema);

 }

-
-stdx::optional<secondary_index::index> indexed_table_select_statement::find_idx(database& db,
-                                                                                schema_ptr schema,
-                                                                                ::shared_ptr<restrictions::statement_restrictions> restrictions)
-{
-    auto& sim = db.find_column_family(schema).get_index_manager();
-    for (::shared_ptr<cql3::restrictions::restrictions> restriction : restrictions->index_restrictions()) {
-        for (const auto& cdef : restriction->get_column_defs()) {
-            for (auto index : sim.list_indexes()) {
-                if (index.depends_on(*cdef)) {
-                    return stdx::make_optional<secondary_index::index>(std::move(index));
-                }
-            }
-        }
-    }
-    return stdx::nullopt;
-}
-
 indexed_table_select_statement::indexed_table_select_statement(schema_ptr schema, uint32_t bound_terms,
                                                           ::shared_ptr<parameters> parameters,
                                                           ::shared_ptr<selection::selection> selection,
@@ -644,16 +832,74 @@ indexed_table_select_statement::indexed_table_select_statement(schema_ptr schema
                                                           bool is_reversed,
                                                           ordering_comparator_type ordering_comparator,
                                                           ::shared_ptr<term> limit, cql_stats &stats,
-                                                           const secondary_index::index& index)
+                                                           const secondary_index::index& index,
+                                                           schema_ptr view_schema)
    : select_statement{schema, bound_terms, parameters, selection, restrictions, is_reversed, ordering_comparator, limit, stats}
    , _index{index}
+    , _view_schema(view_schema)
 {}

+template<typename KeyType> // Appends the components of base_key to exploded_index_ck, leaving out the indexed column's own component when that column belongs to this kind of key.
+GCC6_CONCEPT(
+    requires (std::is_same_v<KeyType, partition_key> || std::is_same_v<KeyType, clustering_key_prefix>)
+)
+static void append_base_key_to_index_ck(std::vector<bytes_view>& exploded_index_ck, const KeyType& base_key, const column_definition& index_cdef) {
+    auto key_view = base_key.view();
+    auto begin = key_view.begin();
+    if ((std::is_same_v<KeyType, partition_key> && index_cdef.is_partition_key())
+            || (std::is_same_v<KeyType, clustering_key_prefix> && index_cdef.is_clustering_key())) {
+        auto key_position = std::next(begin, index_cdef.id); // the component at position index_cdef.id is the one being skipped
+        std::move(begin, key_position, std::back_inserter(exploded_index_ck)); // append everything before it
+        begin = std::next(key_position); // ...and resume right after it
+    }
+    std::move(begin, key_view.end(), std::back_inserter(exploded_index_ck)); // append the remaining components (all of them if nothing was skipped)
+}
+
+::shared_ptr<const service::pager::paging_state> indexed_table_select_statement::generate_view_paging_state_from_base_query_results(::shared_ptr<const service::pager::paging_state> paging_state,
+        const foreign_ptr<lw_shared_ptr<query::result>>& results, service::storage_proxy& proxy, service::query_state& state, const query_options& options) const { // Returns a paging state keyed on the index-view row built from the last base row in results; the input state is returned unchanged when results has no rows or the keys already match.
+    const column_definition* cdef = _schema->get_column_definition(to_bytes(_index.target_column()));
+    if (!cdef) {
+        throw exceptions::invalid_request_exception("Indexed column not found in schema");
+    }
+
+    //NOTICE(sarna): Executing indexed_table branch implies there was at least 1 index restriction present
+    bytes_opt index_pk_value = _restrictions->index_restrictions().front()->value_for(*cdef, options);
+    auto index_pk = partition_key::from_single_value(*_view_schema, *index_pk_value); // NOTE(review): index_pk_value is dereferenced without a check - confirm value_for() cannot return an empty optional here
+    auto result_view = query::result_view(*results);
+    if (!results->row_count() || *results->row_count() == 0) { // row count unset or zero: keep the caller's paging state
+        return std::move(paging_state);
+    }
+    auto [last_base_pk, last_base_ck] = result_view.get_last_partition_and_clustering_key();
+
+    std::vector<bytes_view> exploded_index_ck;
+    exploded_index_ck.reserve(_view_schema->clustering_key_size());
+
+    dht::i_partitioner& partitioner = dht::global_partitioner();
+    bytes token_bytes = partitioner.token_to_bytes(partitioner.get_token(*_schema, last_base_pk));
+    exploded_index_ck.push_back(bytes_view(token_bytes)); // the view clustering key built here starts with the token of the last base partition...
+    append_base_key_to_index_ck<partition_key>(exploded_index_ck, last_base_pk, *cdef); // ...followed by the base partition key components
+    if (last_base_ck) { // ...and the base clustering key components, when the last row has one
+        append_base_key_to_index_ck<clustering_key>(exploded_index_ck, *last_base_ck, *cdef);
+    }
+
+    auto index_ck = clustering_key::from_range(std::move(exploded_index_ck));
+    if (partition_key::tri_compare(*_view_schema)(paging_state->get_partition_key(), index_pk) == 0
+            && (!paging_state->get_clustering_key() || clustering_key::prefix_equal_tri_compare(*_view_schema)(*paging_state->get_clustering_key(), index_ck) == 0)) {
+        return std::move(paging_state); // the state already points at this view row: reuse it as-is
+    }
+
+    auto paging_state_copy = ::make_shared<service::pager::paging_state>(service::pager::paging_state(*paging_state)); // paging_state points to const, so the new keys go into a private copy
+    paging_state_copy->set_partition_key(std::move(index_pk));
+    paging_state_copy->set_clustering_key(std::move(index_ck));
+    return std::move(paging_state_copy);
+}
+
 future<shared_ptr<cql_transport::messages::result_message>>
 indexed_table_select_statement::do_execute(service::storage_proxy& proxy,
                             service::query_state& state,
                             const query_options& options)
 {
+    tracing::add_table_name(state.get_trace_state(), _view_schema->ks_name(), _view_schema->cf_name());
    tracing::add_table_name(state.get_trace_state(), keyspace(), column_family());

    auto cl = options.get_consistency();
@@ -668,6 +914,8 @@ indexed_table_select_statement::do_execute(service::storage_proxy& proxy,

    assert(_restrictions->uses_secondary_indexing());

+    _stats.unpaged_select_queries += options.get_page_size() <= 0;
+
    // Secondary index search has two steps: 1. use the index table to find a
    // list of primary keys matching the query. 2. read the rows matching
    // these primary keys from the base table and return the selected columns.
@@ -700,123 +948,199 @@ indexed_table_select_statement::do_execute(service::storage_proxy& proxy,
        }
    }

+    // Aggregated and paged filtering needs to aggregate the results from all pages
+    // in order to avoid returning partial per-page results (issue #4540).
+    // It's a little bit more complicated than regular aggregation, because each paging state
+    // needs to be translated between the base table and the underlying view.
+    // The routine below keeps fetching pages from the underlying view, which are then
+    // used to fetch base rows, which go straight to the result set builder.
+    // A local, internal copy of query_options is kept in order to keep updating
+    // the paging state between requesting data from replicas.
+    const bool aggregate = _selection->is_aggregate();
+    if (aggregate) {
+        const bool restrictions_need_filtering = _restrictions->need_filtering();
+        return do_with(cql3::selection::result_set_builder(*_selection, now, options.get_cql_serialization_format()), std::make_unique<cql3::query_options>(cql3::query_options(options)),
+                [this, &options, &proxy, &state, now, whole_partitions, partition_slices, restrictions_need_filtering] (cql3::selection::result_set_builder& builder, std::unique_ptr<cql3::query_options>& internal_options) {
+            // page size is set to the internal count page size, regardless of the user-provided value
+            internal_options.reset(new cql3::query_options(std::move(internal_options), options.get_paging_state(), DEFAULT_COUNT_PAGE_SIZE));
+            return repeat([this, &builder, &options, &internal_options, &proxy, &state, now, whole_partitions, partition_slices, restrictions_need_filtering] () {
+                auto consume_results = [this, &builder, &options, &internal_options, restrictions_need_filtering] (foreign_ptr<lw_shared_ptr<query::result>> results, lw_shared_ptr<query::read_command> cmd) {
+                    if (restrictions_need_filtering) {
+                        query::result_view::consume(*results, cmd->slice, cql3::selection::result_set_builder::visitor(builder, *_schema, *_selection,
+                                cql3::selection::result_set_builder::restrictions_filter(_restrictions, options, cmd->row_limit)));
+                    } else {
+                        query::result_view::consume(*results, cmd->slice, cql3::selection::result_set_builder::visitor(builder, *_schema, *_selection));
+                    }
+                };
+
+                if (whole_partitions || partition_slices) {
+                    return find_index_partition_ranges(proxy, state, *internal_options).then(
+                            [this, now, &state, &internal_options, &proxy, consume_results = std::move(consume_results)] (dht::partition_range_vector partition_ranges, ::shared_ptr<const service::pager::paging_state> paging_state) {
+                        bool has_more_pages = paging_state && paging_state->get_remaining() > 0;
+                        internal_options.reset(new cql3::query_options(std::move(internal_options), paging_state ? ::make_shared<service::pager::paging_state>(*paging_state) : nullptr));
+                        return do_execute_base_query(proxy, std::move(partition_ranges), state, *internal_options, now, std::move(paging_state)).then(consume_results).then([has_more_pages] {
+                            return stop_iteration(!has_more_pages);
+                        });
+                    });
+                } else {
+                    return find_index_clustering_rows(proxy, state, *internal_options).then(
+                            [this, now, &state, &internal_options, &proxy, consume_results = std::move(consume_results)] (std::vector<primary_key> primary_keys, ::shared_ptr<const service::pager::paging_state> paging_state) {
+                        bool has_more_pages = paging_state && paging_state->get_remaining() > 0;
+                        internal_options.reset(new cql3::query_options(std::move(internal_options), paging_state ? ::make_shared<service::pager::paging_state>(*paging_state) : nullptr));
+                        return this->do_execute_base_query(proxy, std::move(primary_keys), state, *internal_options, now, std::move(paging_state)).then(consume_results).then([has_more_pages] {
+                            return stop_iteration(!has_more_pages);
+                        });
+                    });
+                }
+            }).then([this, &builder, restrictions_need_filtering] () {
+                auto rs = builder.build();
+                update_stats_rows_read(rs->size());
+                _stats.filtered_rows_matched_total += restrictions_need_filtering ? rs->size() : 0;
+                auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
+                return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
+            });
+        });
+    }
+
    if (whole_partitions || partition_slices) {
        // In this case, can use our normal query machinery, which retrieves
        // entire partitions or the same slice for many partitions.
-        return find_index_partition_ranges(proxy, state, options).then([limit, now, &state, &options, &proxy, this] (dht::partition_range_vector partition_ranges) {
-            auto command = ::make_lw_shared<query::read_command>(
-                _schema->id(),
-                _schema->version(),
-                make_partition_slice(options),
-                limit,
-                now,
-                tracing::make_trace_info(state.get_trace_state()),
-                query::max_partitions,
-                utils::UUID(),
-                options.get_timestamp(state));
-            return this->execute(proxy, command, std::move(partition_ranges), state, options, now);
+        return find_index_partition_ranges(proxy, state, options).then([limit, now, &state, &options, &proxy, this] (dht::partition_range_vector partition_ranges, ::shared_ptr<const service::pager::paging_state> paging_state) {
+            return this->execute_base_query(proxy, std::move(partition_ranges), state, options, now, std::move(paging_state));
        });
    } else {
        // In this case, we need to retrieve a list of rows (not entire
        // partitions) and then retrieve those specific rows.
-        return find_index_clustering_rows(proxy, state, options).then([limit, now, &state, &options, &proxy, this] (std::vector<primary_key> primary_keys) {
-            auto command = ::make_lw_shared<query::read_command>(
-                _schema->id(),
-                _schema->version(),
-                // Note: the "clustering bounds" set in make_partition_slice()
-                // here is garbage, and will be overridden by execute() anyway
-                make_partition_slice(options),
-                limit,
-                now,
-                tracing::make_trace_info(state.get_trace_state()),
-                query::max_partitions,
-                utils::UUID(),
-                options.get_timestamp(state));
-            return this->execute(proxy, command, std::move(primary_keys), state, options, now);
+        return find_index_clustering_rows(proxy, state, options).then([limit, now, &state, &options, &proxy, this] (std::vector<primary_key> primary_keys, ::shared_ptr<const service::pager::paging_state> paging_state) {
+            return this->execute_base_query(proxy, std::move(primary_keys), state, options, now, std::move(paging_state));
        });
    }
 }

-// Utility function for getting the schema of the materialized view used for
-// the secondary index implementation.
-static schema_ptr
-get_index_schema(service::storage_proxy& proxy,
-                const secondary_index::index& index,
-                const schema_ptr& schema,
-                tracing::trace_state_ptr& trace_state)
-{
-    const auto& im = index.metadata();
-    sstring index_table_name = im.name() + "_index";
-    tracing::add_table_name(trace_state, schema->ks_name(), index_table_name);
-    return proxy.get_db().local().find_schema(schema->ks_name(), index_table_name);
-}
-
 // Utility function for reading from the index view (get_index_view()))
 // the posting-list for a particular value of the indexed column.
 // Remember a secondary index can only be created on a single column.
-static future<service::storage_proxy::coordinator_query_result>
+template<typename KeyType>
+GCC6_CONCEPT(
+    requires (std::is_same_v<KeyType, partition_key> || std::is_same_v<KeyType, clustering_key>)
+)
+static future<::shared_ptr<cql_transport::messages::result_message::rows>>
 read_posting_list(service::storage_proxy& proxy,
                  schema_ptr view_schema,
-                  const std::vector<::shared_ptr<restrictions::restrictions>>& index_restrictions,
+                  schema_ptr base_schema,
+                  const secondary_index::index& index,
+                  ::shared_ptr<restrictions::statement_restrictions> base_restrictions,
                  const query_options& options,
                  int32_t limit,
                  service::query_state& state,
                  gc_clock::time_point now,
-                  db::timeout_clock::time_point timeout)
+                  db::timeout_clock::time_point timeout,
+                  cql3::cql_stats& stats)
 {
    dht::partition_range_vector partition_ranges;
    // FIXME: there should be only one index restriction for this index!
    // Perhaps even one index restriction entirely (do we support
    // intersection queries?).
-    for (const auto& restriction : index_restrictions) {
-        auto pk = partition_key::from_optional_exploded(*view_schema, restriction->values(options));
-        auto dk = dht::global_partitioner().decorate_key(*view_schema, pk);
-        auto range = dht::partition_range::make_singular(dk);
-        partition_ranges.emplace_back(range);
+    for (const auto& restrictions : base_restrictions->index_restrictions()) {
+        const column_definition* cdef = base_schema->get_column_definition(to_bytes(index.target_column()));
+        if (!cdef) {
+            throw exceptions::invalid_request_exception("Indexed column not found in schema");
+        }
+
+        bytes_opt value = restrictions->value_for(*cdef, options);
+        if (value) {
+            auto pk = partition_key::from_single_value(*view_schema, *value);
+            auto dk = dht::global_partitioner().decorate_key(*view_schema, pk);
+            auto range = dht::partition_range::make_singular(dk);
+            partition_ranges.emplace_back(range);
+        }
    }
+
    partition_slice_builder partition_slice_builder{*view_schema};
+
+    if (!base_restrictions->has_partition_key_unrestricted_components()) {
+        auto single_pk_restrictions = dynamic_pointer_cast<restrictions::single_column_primary_key_restrictions<partition_key>>(base_restrictions->get_partition_key_restrictions());
+        // Only EQ restrictions on base partition key can be used in an index view query
+        if (single_pk_restrictions && single_pk_restrictions->is_all_eq()) {
+            auto clustering_restrictions = ::make_shared<restrictions::single_column_primary_key_restrictions<clustering_key_prefix>>(view_schema, *single_pk_restrictions);
+            // Computed token column needs to be added to index view restrictions
+            const column_definition& token_cdef = *view_schema->clustering_key_columns().begin();
+            auto base_pk = partition_key::from_optional_exploded(*base_schema, base_restrictions->get_partition_key_restrictions()->values(options));
+            bytes token_value = dht::global_partitioner().token_to_bytes(dht::global_partitioner().get_token(*base_schema, base_pk));
+            auto token_restriction = ::make_shared<restrictions::single_column_restriction::EQ>(token_cdef, ::make_shared<cql3::constants::value>(cql3::raw_value::make_value(token_value)));
+            clustering_restrictions->merge_with(token_restriction);
+
+            if (base_restrictions->get_clustering_columns_restrictions()->prefix_size() > 0) {
+                auto single_ck_restrictions = dynamic_pointer_cast<restrictions::single_column_primary_key_restrictions<clustering_key>>(base_restrictions->get_clustering_columns_restrictions());
+                if (single_ck_restrictions) {
+                    auto prefix_restrictions = single_ck_restrictions->get_longest_prefix_restrictions();
+                    auto clustering_restrictions_from_base = ::make_shared<restrictions::single_column_primary_key_restrictions<clustering_key_prefix>>(view_schema, *prefix_restrictions);
+                    for (auto restriction_it : clustering_restrictions_from_base->restrictions()) {
+                        clustering_restrictions->merge_with(restriction_it.second);
+                    }
+                }
+            }
+
+            partition_slice_builder.with_ranges(clustering_restrictions->bounds_ranges(options));
+        }
+    }
+
+    auto partition_slice = partition_slice_builder.build();
    auto cmd = ::make_lw_shared<query::read_command>(
            view_schema->id(),
            view_schema->version(),
-            partition_slice_builder.build(),
+            partition_slice,
            limit,
            now,
            tracing::make_trace_info(state.get_trace_state()),
            query::max_partitions,
            utils::UUID(),
            options.get_timestamp(state));
-    return proxy.query(view_schema,
-            cmd,
-            std::move(partition_ranges),
-            options.get_consistency(),
-            {timeout, state.get_trace_state()});
+
+    std::vector<const column_definition*> columns;
+    for (const column_definition& cdef : base_schema->partition_key_columns()) {
+        columns.emplace_back(view_schema->get_column_definition(cdef.name()));
+    }
+    if constexpr (std::is_same_v<KeyType, clustering_key>) {
+        for (const column_definition& cdef : base_schema->clustering_key_columns()) {
+            columns.emplace_back(view_schema->get_column_definition(cdef.name()));
+        }
+    }
+    auto selection = selection::selection::for_columns(view_schema, columns);
+
+    int32_t page_size = options.get_page_size();
+    if (page_size <= 0 || !service::pager::query_pagers::may_need_paging(*view_schema, page_size, *cmd, partition_ranges)) {
+        stats.unpaged_select_queries += 1;
+        return proxy.query(view_schema, cmd, std::move(partition_ranges), options.get_consistency(), {timeout, state.get_trace_state()})
+        .then([base_schema, view_schema, now, &options, selection = std::move(selection), partition_slice = std::move(partition_slice)] (service::storage_proxy::coordinator_query_result qr) {
+            cql3::selection::result_set_builder builder(*selection, now, options.get_cql_serialization_format());
+            query::result_view::consume(*qr.query_result,
+                                        std::move(partition_slice),
+                                        cql3::selection::result_set_builder::visitor(builder, *view_schema, *selection));
+            return ::make_shared<cql_transport::messages::result_message::rows>(std::move(result(builder.build())));
+        });
+    }
+
+    auto p = service::pager::query_pagers::pager(view_schema, selection,
+            state, options, cmd, std::move(partition_ranges), stats, nullptr);
+    return p->fetch_page(options.get_page_size(), now, timeout).then([p, &options, limit, now] (std::unique_ptr<cql3::result_set> rs) {
+        rs->get_metadata().set_paging_state(p->state());
+        return ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
+    });
 }

 // Note: the partitions keys returned by this function are sorted
 // in token order. See issue #3423.
-future<dht::partition_range_vector>
+future<dht::partition_range_vector, ::shared_ptr<const service::pager::paging_state>>
 indexed_table_select_statement::find_index_partition_ranges(service::storage_proxy& proxy,
                                             service::query_state& state,
                                             const query_options& options)
 {
-    schema_ptr view = get_index_schema(proxy, _index, _schema, state.get_trace_state());
    auto now = gc_clock::now();
    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
-    return read_posting_list(proxy, view, _restrictions->index_restrictions(), options, get_limit(options), state, now, timeout).then(
-            [this, now, &options, view] (service::storage_proxy::coordinator_query_result qr) {
-        std::vector<const column_definition*> columns;
-        for (const column_definition& cdef : _schema->partition_key_columns()) {
-            columns.emplace_back(view->get_column_definition(cdef.name()));
-        }
-        auto selection = selection::selection::for_columns(view, columns);
-        cql3::selection::result_set_builder builder(*selection, now, options.get_cql_serialization_format());
-        // FIXME: read_posting_list already asks to read primary keys only.
-        // why do we need to specify this again?
-        auto slice = partition_slice_builder(*view).build();
-        query::result_view::consume(*qr.query_result,
-                                    slice,
-                                    cql3::selection::result_set_builder::visitor(builder, *view, *selection));
-        auto rs = cql3::untyped_result_set(::make_shared<cql_transport::messages::result_message::rows>(std::move(result(builder.build()))));
+    return read_posting_list<partition_key>(proxy, _view_schema, _schema, _index, _restrictions, options, get_limit(options), state, now, timeout, _stats).then(
+            [this, now, &options] (::shared_ptr<cql_transport::messages::result_message::rows> rows) {
+        auto rs = cql3::untyped_result_set(rows);
        dht::partition_range_vector partition_ranges;
        partition_ranges.reserve(rs.size());
        // We are reading the list of primary keys as rows of a single
@@ -842,36 +1166,22 @@ indexed_table_select_statement::find_index_partition_ranges(service::storage_pro
            auto range = dht::partition_range::make_singular(dk);
            partition_ranges.emplace_back(range);
        }
-        return partition_ranges;
+        auto paging_state = rows->rs().get_metadata().paging_state();
+        return make_ready_future<dht::partition_range_vector, ::shared_ptr<const service::pager::paging_state>>(std::move(partition_ranges), std::move(paging_state));
    });
 }

 // Note: the partitions keys returned by this function are sorted
 // in token order. See issue #3423.
-future<std::vector<indexed_table_select_statement::primary_key>>
+future<std::vector<indexed_table_select_statement::primary_key>, ::shared_ptr<const service::pager::paging_state>>
 indexed_table_select_statement::find_index_clustering_rows(service::storage_proxy& proxy, service::query_state& state, const query_options& options)
 {
-    schema_ptr view = get_index_schema(proxy, _index, _schema, state.get_trace_state());
    auto now = gc_clock::now();
    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
-    return read_posting_list(proxy, view, _restrictions->index_restrictions(), options, get_limit(options), state, now, timeout).then(
-            [this, now, &options, view] (service::storage_proxy::coordinator_query_result qr) {
-        std::vector<const column_definition*> columns;
-        for (const column_definition& cdef : _schema->partition_key_columns()) {
-            columns.emplace_back(view->get_column_definition(cdef.name()));
-        }
-        for (const column_definition& cdef : _schema->clustering_key_columns()) {
-            columns.emplace_back(view->get_column_definition(cdef.name()));
-        }
-        auto selection = selection::selection::for_columns(view, columns);
-        cql3::selection::result_set_builder builder(*selection, now, options.get_cql_serialization_format());
-        // FIXME: read_posting_list already asks to read primary keys only.
-        // why do we need to specify this again?
-        auto slice = partition_slice_builder(*view).build();
-        query::result_view::consume(*qr.query_result,
-                                    slice,
-                                    cql3::selection::result_set_builder::visitor(builder, *view, *selection));
-        auto rs = cql3::untyped_result_set(::make_shared<cql_transport::messages::result_message::rows>(result(builder.build())));
+    return read_posting_list<clustering_key>(proxy, _view_schema, _schema, _index, _restrictions, options, get_limit(options), state, now, timeout, _stats).then(
+            [this, now, &options] (::shared_ptr<cql_transport::messages::result_message::rows> rows) {
+
+        auto rs = cql3::untyped_result_set(rows);
        std::vector<primary_key> primary_keys;
        primary_keys.reserve(rs.size());
        for (size_t i = 0; i < rs.size(); i++) {
@@ -887,7 +1197,8 @@ indexed_table_select_statement::find_index_clustering_rows(service::storage_prox
            auto ck = clustering_key::from_range(ck_columns);
            primary_keys.emplace_back(primary_key{std::move(dk), std::move(ck)});
        }
-        return primary_keys;
+        auto paging_state = rows->rs().get_metadata().paging_state();
+        return make_ready_future<std::vector<indexed_table_select_statement::primary_key>, ::shared_ptr<const service::pager::paging_state>>(std::move(primary_keys), std::move(paging_state));
    });
 }

@@ -953,7 +1264,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(database& db, cql_
                     ? selection::selection::wildcard(schema)
                     : selection::selection::from_selectors(db, schema, _select_clause);

-    auto restrictions = prepare_restrictions(db, schema, bound_names, selection, for_view);
+    auto restrictions = prepare_restrictions(db, schema, bound_names, selection, for_view, _parameters->allow_filtering());

    if (_parameters->is_distinct()) {
        validate_distinct_selection(schema, selection, restrictions);
@@ -970,10 +1281,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(database& db, cql_
    }

    check_needs_filtering(restrictions);
-    size_t restrictions_size = restrictions->get_partition_key_restrictions()->size() + restrictions->get_clustering_columns_restrictions()->size() + restrictions->get_non_pk_restriction().size();
-    if (restrictions->uses_secondary_indexing() && restrictions_size > 1) {
-        throw exceptions::invalid_request_exception("Indexed query may not contain multiple restrictions in 2.3");
-    }
+    ensure_filtering_columns_retrieval(db, selection, restrictions);

    ::shared_ptr<cql3::statements::select_statement> stmt;
    if (restrictions->uses_secondary_indexing()) {
@@ -1011,13 +1319,14 @@ select_statement::prepare_restrictions(database& db,
                                       schema_ptr schema,
                                       ::shared_ptr<variable_specifications> bound_names,
                                       ::shared_ptr<selection::selection> selection,
-                                       bool for_view)
+                                       bool for_view,
+                                       bool allow_filtering)
 {
    try {
        // FIXME: this method should take a separate allow_filtering parameter
        // and pass it on. Currently we pass "for_view" as allow_filtering.
        return ::make_shared<restrictions::statement_restrictions>(db, schema, statement_type::SELECT, std::move(_where_clause), bound_names,
-            selection->contains_only_static_columns(), selection->contains_a_collection(), for_view, for_view);
+            selection->contains_only_static_columns(), selection->contains_a_collection(), for_view, allow_filtering);
    } catch (const exceptions::unrecognized_entity_exception& e) {
        if (contains_alias(e.entity)) {
            throw exceptions::invalid_request_exception(sprint("Aliases aren't allowed in the where clause ('%s')", e.relation->to_string()));
@@ -1111,7 +1420,7 @@ select_statement::get_ordering_comparator(schema_ptr schema,
        }
        auto index = selection->index_of(*def);
        if (index < 0) {
-            index = selection->add_column_for_ordering(*def);
+            index = selection->add_column_for_post_processing(*def);
        }

        sorters.emplace_back(index, def->type);
@@ -1198,6 +1507,23 @@ void select_statement::check_needs_filtering(::shared_ptr<restrictions::statemen
    }
 }

+/**
+ * Adds columns that are needed for the purpose of filtering to the selection.
+ * The columns that are added to the selection are columns that
+ * are needed for filtering on the coordinator but are not part of the selection.
+ * The columns are added with metadata indicating they are not to be returned
+ * to the user.
+ */
+void select_statement::ensure_filtering_columns_retrieval(database& db,
+                                        ::shared_ptr<selection::selection> selection,
+                                        ::shared_ptr<restrictions::statement_restrictions> restrictions) {
+    for (auto&& cdef : restrictions->get_column_defs_for_filtering(db)) {
+        if (!selection->has_column(*cdef)) {
+            selection->add_column_for_post_processing(*cdef);
+        }
+    }
+}
+
 bool select_statement::contains_alias(::shared_ptr<column_identifier> name) {
    return std::any_of(_select_clause.begin(), _select_clause.end(), [name] (auto raw) {
        return raw->alias && *name == *raw->alias;
--- a/cql3/statements/select_statement.hh
+++ b/cql3/statements/select_statement.hh
@@ -67,8 +67,8 @@ class select_statement : public cql_statement {
 public:
    using parameters = raw::select_statement::parameters;
    using ordering_comparator_type = raw::select_statement::ordering_comparator_type;
-protected:
    static constexpr int DEFAULT_COUNT_PAGE_SIZE = 10000;
+protected:
    static thread_local const ::shared_ptr<parameters> _default_parameters;
    schema_ptr _schema;
    uint32_t _bound_terms;
@@ -126,14 +126,6 @@ public:
        clustering_key_prefix clustering;
    };

-    future<::shared_ptr<cql_transport::messages::result_message>> execute(
-            service::storage_proxy& proxy,
-            lw_shared_ptr<query::read_command> cmd,
-            std::vector<primary_key>&& primary_keys,
-            service::query_state& state,
-            const query_options& options,
-            gc_clock::time_point now);
-
    shared_ptr<cql_transport::messages::result_message> process_results(foreign_ptr<lw_shared_ptr<query::result>> results,
        lw_shared_ptr<query::read_command> cmd, const query_options& options, gc_clock::time_point now);

@@ -168,6 +160,7 @@ public:

 class indexed_table_select_statement : public select_statement {
    secondary_index::index _index;
+    schema_ptr _view_schema;
 public:
    static ::shared_ptr<cql3::statements::select_statement> prepare(database& db,
                                                                    schema_ptr schema,
@@ -189,24 +182,80 @@ public:
                                   ordering_comparator_type ordering_comparator,
                                   ::shared_ptr<term> limit,
                                   cql_stats &stats,
-                                   const secondary_index::index& index);
+                                   const secondary_index::index& index,
+                                   schema_ptr view_schema);

 private:
-    static stdx::optional<secondary_index::index> find_idx(database& db,
-                                                           schema_ptr schema,
-                                                           ::shared_ptr<restrictions::statement_restrictions> restrictions);
-
    virtual future<::shared_ptr<cql_transport::messages::result_message>> do_execute(service::storage_proxy& proxy,
                                                                                     service::query_state& state, const query_options& options) override;

-    future<dht::partition_range_vector> find_index_partition_ranges(service::storage_proxy& proxy,
+    ::shared_ptr<const service::pager::paging_state> generate_view_paging_state_from_base_query_results(::shared_ptr<const service::pager::paging_state> paging_state,
+            const foreign_ptr<lw_shared_ptr<query::result>>& results, service::storage_proxy& proxy, service::query_state& state, const query_options& options) const;
+
+    future<dht::partition_range_vector, ::shared_ptr<const service::pager::paging_state>> find_index_partition_ranges(service::storage_proxy& proxy,
                                                                    service::query_state& state,
                                                                    const query_options& options);

-    future<std::vector<primary_key>> find_index_clustering_rows(service::storage_proxy& proxy,
+    future<std::vector<primary_key>, ::shared_ptr<const service::pager::paging_state>> find_index_clustering_rows(service::storage_proxy& proxy,
                                                                service::query_state& state,
                                                                const query_options& options);

+    shared_ptr<cql_transport::messages::result_message>
+    process_base_query_results(
+            foreign_ptr<lw_shared_ptr<query::result>> results,
+            lw_shared_ptr<query::read_command> cmd,
+            service::storage_proxy& proxy,
+            service::query_state& state,
+            const query_options& options,
+            gc_clock::time_point now,
+            ::shared_ptr<const service::pager::paging_state> paging_state);
+
+    lw_shared_ptr<query::read_command>
+    prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging);
+
+    future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>
+    do_execute_base_query(
+            service::storage_proxy& proxy,
+            dht::partition_range_vector&& partition_ranges,
+            service::query_state& state,
+            const query_options& options,
+            gc_clock::time_point now,
+            ::shared_ptr<const service::pager::paging_state> paging_state);
+    future<shared_ptr<cql_transport::messages::result_message>>
+    execute_base_query(
+            service::storage_proxy& proxy,
+            dht::partition_range_vector&& partition_ranges,
+            service::query_state& state,
+            const query_options& options,
+            gc_clock::time_point now,
+            ::shared_ptr<const service::pager::paging_state> paging_state);
+
+    // Function for fetching the selected columns from a list of clustering rows.
+    // It is currently used only in our Secondary Index implementation - ordinary
+    // CQL SELECT statements do not have the syntax to request a list of rows.
+    // FIXME: The current implementation is very inefficient - it requests each
+    // row separately (and, incrementally, in parallel). Even multiple rows from a single
+    // partition are requested separately. This last case can be easily improved,
+    // but to implement the general case (multiple rows from multiple partitions)
+    // efficiently, we will need more support from other layers.
+    // Keys are ordered in token order (see #3423)
+    future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>
+    do_execute_base_query(
+            service::storage_proxy& proxy,
+            std::vector<primary_key>&& primary_keys,
+            service::query_state& state,
+            const query_options& options,
+            gc_clock::time_point now,
+            ::shared_ptr<const service::pager::paging_state> paging_state);
+    future<shared_ptr<cql_transport::messages::result_message>>
+    execute_base_query(
+            service::storage_proxy& proxy,
+            std::vector<primary_key>&& primary_keys,
+            service::query_state& state,
+            const query_options& options,
+            gc_clock::time_point now,
+            ::shared_ptr<const service::pager::paging_state> paging_state);
+
    virtual void update_stats_rows_read(int64_t rows_read) override {
        _stats.rows_read += rows_read;
        _stats.secondary_index_rows_read += rows_read;
--- a/cql3/statements/update_statement.cc
+++ b/cql3/statements/update_statement.cc
@@ -84,8 +84,11 @@ parse(const sstring& json_string, const std::vector<column_definition>& expected
    for (const auto& def : expected_receivers) {
        sstring cql_name = def.name_as_text();
        auto value_it = prepared_map.find(cql_name);
-        if (value_it == prepared_map.end() || value_it->second.isNull()) {
+        if (value_it == prepared_map.end()) {
+            continue;
+        } else if (value_it->second.isNull()) {
            json_map.emplace(std::move(cql_name), bytes_opt{});
+            prepared_map.erase(value_it);
        } else {
            json_map.emplace(std::move(cql_name), def.type->from_json_object(value_it->second, sf));
            prepared_map.erase(value_it);
@@ -172,7 +175,9 @@ void update_statement::add_update_for_key(mutation& m, const query::clustering_r
 }

 modification_statement::json_cache_opt insert_prepared_json_statement::maybe_prepare_json_cache(const query_options& options) {
-    sstring json_string = utf8_type->to_string(_term->bind_and_get(options).data().value().to_string());
+    sstring json_string = with_linearized(_term->bind_and_get(options).data().value(), [&] (bytes_view value) { // the bound value is a fragmented view; hand the decoder one contiguous bytes_view
+        return utf8_type->to_string(value.to_string());
+    });
    return json_helpers::parse(std::move(json_string), s->all_columns(), options.get_cql_serialization_format());
 }

@@ -195,20 +200,20 @@ insert_prepared_json_statement::execute_set_value(mutation& m, const clustering_
        m.set_cell(prefix, column, std::move(operation::make_dead_cell(params)));
        return;
    } else if (!column.type->is_collection()) {
-        constants::setter::execute(m, prefix, params, column, raw_value_view::make_value(bytes_view(*value)));
+        constants::setter::execute(m, prefix, params, column, raw_value_view::make_value(fragmented_temporary_buffer::view(*value)));
        return;
    }

    auto& k = static_pointer_cast<const collection_type_impl>(column.type)->_kind;
    cql_serialization_format sf = params._options.get_cql_serialization_format();
    if (&k == &collection_type_impl::kind::list) {
-        auto list_terminal = make_shared<lists::value>(lists::value::from_serialized(*value, dynamic_pointer_cast<const list_type_impl>(column.type), sf));
+        auto list_terminal = make_shared<lists::value>(lists::value::from_serialized(fragmented_temporary_buffer::view(*value), dynamic_pointer_cast<const list_type_impl>(column.type), sf));
        lists::setter::execute(m, prefix, params, column, std::move(list_terminal));
    } else if (&k == &collection_type_impl::kind::set) {
-        auto set_terminal = make_shared<sets::value>(sets::value::from_serialized(*value, dynamic_pointer_cast<const set_type_impl>(column.type), sf));
+        auto set_terminal = make_shared<sets::value>(sets::value::from_serialized(fragmented_temporary_buffer::view(*value), dynamic_pointer_cast<const set_type_impl>(column.type), sf));
        sets::setter::execute(m, prefix, params, column, std::move(set_terminal));
    } else if (&k == &collection_type_impl::kind::map) {
-        auto map_terminal = make_shared<maps::value>(maps::value::from_serialized(*value, dynamic_pointer_cast<const map_type_impl>(column.type), sf));
+        auto map_terminal = make_shared<maps::value>(maps::value::from_serialized(fragmented_temporary_buffer::view(*value), dynamic_pointer_cast<const map_type_impl>(column.type), sf));
        maps::setter::execute(m, prefix, params, column, std::move(map_terminal));
    } else {
        throw exceptions::invalid_request_exception("Incorrect value kind in JSON INSERT statement");
@@ -253,8 +258,12 @@ void insert_prepared_json_statement::execute_operations_for_key(mutation& m, con
            throw exceptions::invalid_request_exception(sprint("Cannot set the value of counter column %s in JSON", def.name_as_text()));
        }

-        auto value = json_cache->at(def.name_as_text());
-        execute_set_value(m, prefix, params, def, value);
+        auto it = json_cache->find(def.name_as_text());
+        if (it != json_cache->end()) {
+            execute_set_value(m, prefix, params, def, it->second);
+        } else if (!_default_unset) {
+            execute_set_value(m, prefix, params, def, bytes_opt{});
+        }
    }
 }

@@ -320,12 +329,14 @@ insert_statement::prepare_internal(database& db, schema_ptr schema,
 insert_json_statement::insert_json_statement(  ::shared_ptr<cf_name> name,
                                               ::shared_ptr<attributes::raw> attrs,
                                               ::shared_ptr<term::raw> json_value,
-                                               bool if_not_exists)
+                                               bool if_not_exists,
+                                               bool default_unset)
    : raw::modification_statement{name, attrs, conditions_vector{}, if_not_exists, false}
    , _name(name)
    , _attrs(attrs)
    , _json_value(json_value)
-    , _if_not_exists(if_not_exists) { }
+    , _if_not_exists(if_not_exists)
+    , _default_unset(default_unset) { }

 ::shared_ptr<cql3::statements::modification_statement>
 insert_json_statement::prepare_internal(database& db, schema_ptr schema,
@@ -335,7 +346,7 @@ insert_json_statement::prepare_internal(database& db, schema_ptr schema,
    auto json_column_placeholder = ::make_shared<column_identifier>("", true);
    auto prepared_json_value = _json_value->prepare(db, "", ::make_shared<column_specification>("", "", json_column_placeholder, utf8_type));
    prepared_json_value->collect_marker_specification(bound_names);
-    return ::make_shared<cql3::statements::insert_prepared_json_statement>(bound_names->size(), schema, std::move(attrs), &stats.inserts, std::move(prepared_json_value));
+    return ::make_shared<cql3::statements::insert_prepared_json_statement>(bound_names->size(), schema, std::move(attrs), &stats.inserts, std::move(prepared_json_value), _default_unset);
 }

 update_statement::update_statement(            ::shared_ptr<cf_name> name,
--- a/cql3/statements/update_statement.hh
+++ b/cql3/statements/update_statement.hh
@@ -82,9 +82,10 @@ private:
 */
 class insert_prepared_json_statement : public update_statement {
    ::shared_ptr<term> _term;
+    bool _default_unset; // when false, columns missing from the JSON cache still get execute_set_value() with an empty value; when true they are skipped
 public:
-    insert_prepared_json_statement(uint32_t bound_terms, schema_ptr s, std::unique_ptr<attributes> attrs, uint64_t* cql_stats_counter_ptr, ::shared_ptr<term> t)
-        : update_statement(statement_type::INSERT, bound_terms, s, std::move(attrs), cql_stats_counter_ptr), _term(t) {
+    insert_prepared_json_statement(uint32_t bound_terms, schema_ptr s, std::unique_ptr<attributes> attrs, uint64_t* cql_stats_counter_ptr, ::shared_ptr<term> t, bool default_unset)
+        : update_statement(statement_type::INSERT, bound_terms, s, std::move(attrs), cql_stats_counter_ptr), _term(t), _default_unset(default_unset) {
        _restrictions = ::make_shared<restrictions::statement_restrictions>(s, false);
    }
 private:
--- a/cql3/stats.hh
+++ b/cql3/stats.hh
@@ -36,11 +36,17 @@ struct cql_stats {
    uint64_t batches_pure_unlogged = 0;
    uint64_t batches_unlogged_from_logged = 0;
    uint64_t rows_read = 0;
+    uint64_t reverse_queries = 0;
+    uint64_t unpaged_select_queries = 0;

    int64_t secondary_index_creates = 0;
    int64_t secondary_index_drops = 0;
    int64_t secondary_index_reads = 0;
    int64_t secondary_index_rows_read = 0;
+
+    int64_t filtered_reads = 0;
+    int64_t filtered_rows_matched_total = 0;
+    int64_t filtered_rows_read_total = 0;
 };

 }
--- a/cql3/tuples.hh
+++ b/cql3/tuples.hh
@@ -54,7 +54,7 @@ public:
                column->ks_name,
                column->cf_name,
                ::make_shared<column_identifier>(sprint("%s[%d]", column->name, component), true),
-                static_pointer_cast<const tuple_type_impl>(column->type)->type(component));
+                static_pointer_cast<const tuple_type_impl>(column->type->underlying_type())->type(component));
    }

    /**
@@ -112,7 +112,7 @@ public:

    private:
        void validate_assignable_to(database& db, const sstring& keyspace, shared_ptr<column_specification> receiver) {
-            auto tt = dynamic_pointer_cast<const tuple_type_impl>(receiver->type);
+            auto tt = dynamic_pointer_cast<const tuple_type_impl>(receiver->type->underlying_type());
            if (!tt) {
                throw exceptions::invalid_request_exception(sprint("Invalid tuple type literal for %s of type %s", receiver->name, receiver->type->as_cql3_type()));
            }
@@ -159,8 +159,10 @@ public:
                _elements.push_back(e ? bytes_opt(bytes(e->begin(), e->size())) : bytes_opt());
            }
        }
-        static value from_serialized(bytes_view buffer, tuple_type type) {
-            return value(type->split(buffer));
+        static value from_serialized(const fragmented_temporary_buffer::view& buffer, tuple_type type) {
+          return with_linearized(buffer, [&] (bytes_view view) { // split() is given a contiguous bytes_view, so the fragmented buffer is linearized first
+            return value(type->split(view));
+          });
        }
        virtual cql3::raw_value get(const query_options& options) override {
            return cql3::raw_value::make_value(tuple_type_impl::build_value(_elements));
@@ -251,20 +253,29 @@ public:
            }
        }

-        static in_value from_serialized(bytes_view value, list_type type, const query_options& options) {
+        static in_value from_serialized(const fragmented_temporary_buffer::view& value_view, list_type type, const query_options& options) {
            try {
                // Collections have this small hack that validate cannot be called on a serialized object,
                // but the deserialization does the validation (so we're fine).
+              return with_linearized(value_view, [&] (bytes_view value) {
                auto l = value_cast<list_type_impl::native_type>(type->deserialize(value, options.get_cql_serialization_format()));
                auto ttype = dynamic_pointer_cast<const tuple_type_impl>(type->get_elements_type());
                assert(ttype);

-                std::vector<std::vector<bytes_view_opt>> elements;
+                std::vector<std::vector<bytes_opt>> elements;
                elements.reserve(l.size());
                for (auto&& element : l) {
-                    elements.emplace_back(ttype->split(ttype->decompose(element)));
+                    auto tuple_buff = ttype->decompose(element);
+                    auto tuple = ttype->split(tuple_buff);
+                    std::vector<bytes_opt> elems;
+                    elems.reserve(tuple.size());
+                    for (auto&& e : tuple) {
+                        elems.emplace_back(to_bytes_opt(e));
+                    }
+                    elements.emplace_back(std::move(elems));
                }
                return in_value(elements);
+              });
            } catch (marshal_exception& e) {
                throw exceptions::invalid_request_exception(e.what());
            }
--- a/cql3/update_parameters.hh
+++ b/cql3/update_parameters.hh
@@ -142,7 +142,7 @@ public:
        return atomic_cell::make_dead(_timestamp, _local_deletion_time);
    }

-    atomic_cell make_cell(const abstract_type& type, bytes_view value, atomic_cell::collection_member cm = atomic_cell::collection_member::no) const {
+    atomic_cell make_cell(const abstract_type& type, const fragmented_temporary_buffer::view& value, atomic_cell::collection_member cm = atomic_cell::collection_member::no) const {
        auto ttl = _ttl;

        if (ttl.count() <= 0) {
@@ -156,6 +156,10 @@ public:
        }
    };

+    atomic_cell make_cell(const abstract_type& type, bytes_view value, atomic_cell::collection_member cm = atomic_cell::collection_member::no) const {
+        return make_cell(type, fragmented_temporary_buffer::view(value), cm); // convenience overload: wrap the contiguous bytes_view and delegate to the fragmented-view overload
+    }
+
    atomic_cell make_counter_update_cell(int64_t delta) const {
        return atomic_cell::make_live_counter_update(_timestamp, delta);
    }
--- a/cql3/values.hh
+++ b/cql3/values.hh
@@ -28,6 +28,10 @@

 #include <experimental/optional>

+#include <seastar/util/variant_utils.hh>
+
+#include "utils/fragmented_temporary_buffer.hh"
+
 namespace cql3 {

 struct null_value {
@@ -40,7 +44,7 @@ struct unset_value {
 ///
 /// \see raw_value
 struct raw_value_view {
-    boost::variant<bytes_view, null_value, unset_value> _data;
+    boost::variant<fragmented_temporary_buffer::view, null_value, unset_value> _data;

    raw_value_view(null_value&& data)
        : _data{std::move(data)}
@@ -48,10 +52,7 @@ struct raw_value_view {
    raw_value_view(unset_value&& data)
        : _data{std::move(data)}
    {}
-    raw_value_view(bytes_view&& data)
-        : _data{std::move(data)}
-    {}
-    raw_value_view(const bytes_view& data)
+    raw_value_view(fragmented_temporary_buffer::view data)
        : _data{data}
    {}
 public:
@@ -61,10 +62,7 @@ public:
    static raw_value_view make_unset_value() {
        return raw_value_view{std::move(unset_value{})};
    }
-    static raw_value_view make_value(bytes_view &&view) {
-        return raw_value_view{std::move(view)};
-    }
-    static raw_value_view make_value(const bytes_view& view) {
+    static raw_value_view make_value(fragmented_temporary_buffer::view view) {
        return raw_value_view{view};
    }
    bool is_null() const {
@@ -76,20 +74,47 @@ public:
    bool is_value() const {
        return _data.which() == 0;
    }
-    bytes_view_opt data() const {
+    std::optional<fragmented_temporary_buffer::view> data() const {
        if (_data.which() == 0) {
-            return boost::get<bytes_view>(_data);
+            return boost::get<fragmented_temporary_buffer::view>(_data);
        }
        return {};
    }
    explicit operator bool() const {
        return _data.which() == 0;
    }
-    const bytes_view* operator->() const {
-        return &boost::get<bytes_view>(_data);
+    const fragmented_temporary_buffer::view* operator->() const { // unchecked access: callers are expected to have verified is_value() first
+        return &boost::get<fragmented_temporary_buffer::view>(_data);
    }
-    const bytes_view& operator*() const {
-        return boost::get<bytes_view>(_data);
+    const fragmented_temporary_buffer::view& operator*() const { // unchecked access: callers are expected to have verified is_value() first
+        return boost::get<fragmented_temporary_buffer::view>(_data);
+    }
+
+    // Equal iff both hold the same alternative and, for values, equal buffers.
+    bool operator==(const raw_value_view& other) const {
+        const bool same_kind = _data.which() == other._data.which();
+        if (!same_kind) {
+            return false;
+        }
+        // Null and unset carry no payload, so matching kinds suffices for them.
+        return !is_value() || !(**this != *other);
+    }
+    bool operator!=(const raw_value_view& other) const { // defined purely as the negation of operator==
+        return !(*this == other);
+    }
+
+    friend std::ostream& operator<<(std::ostream& os, const raw_value_view& value) { // prints "{ value: ... }", "{ null }" or "{ unset }" depending on the held alternative
+        seastar::visit(value._data, [&] (fragmented_temporary_buffer::view v) {
+            os << "{ value: ";
+            using boost::range::for_each;
+            for_each(v, [&os] (bytes_view bv) { os << bv; }); // stream each fragment in turn, with no separator between fragments
+            os << " }";
+        }, [&] (null_value) {
+            os << "{ null }";
+        }, [&] (unset_value) {
+            os << "{ unset }";
+        });
+        return os;
    }
 };

@@ -127,7 +152,7 @@ public:
        if (view.is_unset_value()) {
            return make_unset_value();
        }
-        return make_value(to_bytes(*view));
+        return make_value(linearized(*view));
    }
    static raw_value make_value(bytes&& bytes) {
        return raw_value{std::move(bytes)};
@@ -167,7 +192,7 @@ public:
    }
    raw_value_view to_view() const {
        switch (_data.which()) {
-        case 0:  return raw_value_view::make_value(bytes_view{boost::get<bytes>(_data)});
+        case 0:  return raw_value_view::make_value(fragmented_temporary_buffer::view(bytes_view{boost::get<bytes>(_data)}));
        case 1:  return raw_value_view::make_null();
        default: return raw_value_view::make_unset_value();
        }
@@ -176,10 +201,19 @@ public:

 }

+inline bytes to_bytes(const cql3::raw_value_view& view) // returns the viewed value as a single bytes object
+{
+    return linearized(*view); // dereferences without checking, so the view must hold a value (not null/unset)
+}
+
 inline bytes_opt to_bytes_opt(const cql3::raw_value_view& view) {
-    return to_bytes_opt(view.data());
+    const auto fragments = view.data();
+    if (!fragments) {
+        return bytes_opt(); // null and unset views carry no bytes
+    }
+    return bytes_opt(linearized(*fragments));
 }

 inline bytes_opt to_bytes_opt(const cql3::raw_value& value) {
-    return value.data();
+    return to_bytes_opt(value.to_view()); // go through the view overload instead of reading the owned data directly
 }
--- a/database.cc
+++ b/database.cc
--- a/database.hh
+++ b/database.hh
@@ -77,6 +77,7 @@
 #include <seastar/core/metrics_registration.hh>
 #include "tracing/trace_state.hh"
 #include "db/view/view.hh"
+#include "db/view/view_update_backlog.hh"
 #include "db/view/row_locking.hh"
 #include "lister.hh"
 #include "utils/phased_barrier.hh"
@@ -164,29 +165,33 @@ private:
    std::function<schema_ptr()> _current_schema;
    dirty_memory_manager* _dirty_memory_manager;
    std::experimental::optional<shared_promise<>> _flush_coalescing;
+    seastar::scheduling_group _compaction_scheduling_group;
 public:
    memtable_list(
            seal_immediate_fn_type seal_immediate_fn,
            seal_delayed_fn_type seal_delayed_fn,
            std::function<schema_ptr()> cs,
-            dirty_memory_manager* dirty_memory_manager)
+            dirty_memory_manager* dirty_memory_manager,
+            seastar::scheduling_group compaction_scheduling_group = seastar::current_scheduling_group()) // defaults to whichever scheduling group is current where the constructor is called
        : _memtables({})
        , _seal_immediate_fn(seal_immediate_fn)
        , _seal_delayed_fn(seal_delayed_fn)
        , _current_schema(cs)
-        , _dirty_memory_manager(dirty_memory_manager) {
+        , _dirty_memory_manager(dirty_memory_manager)
+        , _compaction_scheduling_group(compaction_scheduling_group) {
        add_memtable();
    }

    memtable_list(
            seal_immediate_fn_type seal_immediate_fn,
            std::function<schema_ptr()> cs,
-            dirty_memory_manager* dirty_memory_manager)
-        : memtable_list(std::move(seal_immediate_fn), {}, std::move(cs), dirty_memory_manager) {
+            dirty_memory_manager* dirty_memory_manager,
+            seastar::scheduling_group compaction_scheduling_group = seastar::current_scheduling_group())
+        : memtable_list(std::move(seal_immediate_fn), {}, std::move(cs), dirty_memory_manager, compaction_scheduling_group) { // delegates with an empty delayed-seal function
    }

-    memtable_list(std::function<schema_ptr()> cs, dirty_memory_manager* dirty_memory_manager)
-        : memtable_list({}, {}, std::move(cs), dirty_memory_manager) {
+    memtable_list(std::function<schema_ptr()> cs, dirty_memory_manager* dirty_memory_manager, seastar::scheduling_group compaction_scheduling_group = seastar::current_scheduling_group())
+        : memtable_list({}, {}, std::move(cs), dirty_memory_manager, compaction_scheduling_group) { // delegates with both seal functions left empty
    }

    bool may_flush() const {
@@ -275,6 +280,9 @@ struct cf_stats {
    int64_t clustering_filter_fast_path_count = 0;
    // how many sstables survived the clustering key checks
    int64_t surviving_sstables_after_clustering_filter = 0;
+
+    // How many view updates were dropped due to overload.
+    int64_t dropped_view_updates = 0;
 };

 class cache_temperature {
@@ -299,6 +307,7 @@ class database_sstable_write_monitor;
 class table : public enable_lw_shared_from_this<table> {
 public:
    struct config {
+        std::vector<sstring> all_datadirs;
        sstring datadir;
        bool enable_disk_writes = true;
        bool enable_disk_reads = true;
@@ -314,11 +323,13 @@ public:
        seastar::scheduling_group memtable_scheduling_group;
        seastar::scheduling_group memtable_to_cache_scheduling_group;
        seastar::scheduling_group compaction_scheduling_group;
+        seastar::scheduling_group memory_compaction_scheduling_group;
        seastar::scheduling_group statement_scheduling_group;
        seastar::scheduling_group streaming_scheduling_group;
        bool enable_metrics_reporting = false;
        db::large_partition_handler* large_partition_handler;
        db::timeout_semaphore* view_update_concurrency_semaphore;
+        size_t view_update_concurrency_semaphore_limit;
    };
    struct no_commitlog {};
    struct stats {
@@ -428,11 +439,15 @@ private:
    // but for correct compaction we need to start the compaction only after
    // reading all sstables.
    std::unordered_map<uint64_t, sstables::shared_sstable> _sstables_need_rewrite;
+    // sstables that should not be compacted (e.g. because they need to be used
+    // to generate view updates later)
+    std::unordered_map<uint64_t, sstables::shared_sstable> _sstables_staging;
    // Control background fibers waiting for sstables to be deleted
    seastar::gate _sstable_deletion_gate;
    // This semaphore ensures that an operation like snapshot won't have its selected
    // sstables deleted by compaction in parallel, a race condition which could
    // easily result in failure.
+    // Locking order: must be acquired either independently or after _sstables_lock
    seastar::semaphore _sstable_deletion_sem = {1};
    // There are situations in which we need to stop writing sstables. Flushers will take
    // the read lock, and the ones that wish to stop that process will take the write lock.
@@ -483,6 +498,13 @@ private:
    utils::phased_barrier _pending_writes_phaser;
    // Corresponding phaser for in-progress reads.
    utils::phased_barrier _pending_reads_phaser;
+public:
+    future<> add_sstable_and_update_cache(sstables::shared_sstable sst);
+    void move_sstable_from_staging_in_thread(sstables::shared_sstable sst);
+    sstables::shared_sstable get_staging_sstable(uint64_t generation) {
+        const auto found = _sstables_staging.find(generation); // _sstables_staging is keyed by generation
+        return found == _sstables_staging.end() ? nullptr : found->second; // nullptr when that generation is not staged
+    }
 private:
    void update_stats_for_new_sstable(uint64_t disk_space_used_by_sstable, const std::vector<unsigned>& shards_for_the_sstable) noexcept;
    // Adds new sstable to the set of sstables
@@ -535,7 +557,7 @@ private:
    void rebuild_statistics();

    // This function replaces new sstables by their ancestors, which are sstables that needed resharding.
-    void replace_ancestors_needed_rewrite(std::vector<sstables::shared_sstable> new_sstables);
+    void replace_ancestors_needed_rewrite(std::unordered_set<uint64_t> ancestors, std::vector<sstables::shared_sstable> new_sstables);
    void remove_ancestors_needed_rewrite(std::unordered_set<uint64_t> ancestors);
 private:
    mutation_source_opt _virtual_reader;
@@ -616,6 +638,14 @@ public:
            tracing::trace_state_ptr trace_state = nullptr,
            streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
            mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const;
+    flat_mutation_reader make_reader_excluding_sstable(schema_ptr schema,
+            sstables::shared_sstable sst,
+            const dht::partition_range& range,
+            const query::partition_slice& slice,
+            const io_priority_class& pc = default_priority_class(),
+            tracing::trace_state_ptr trace_state = nullptr,
+            streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
+            mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const;

    flat_mutation_reader make_reader(schema_ptr schema, const dht::partition_range& range = query::full_partition_range) const {
        auto& full_slice = schema->full_slice();
@@ -630,7 +660,13 @@ public:
    flat_mutation_reader make_streaming_reader(schema_ptr schema,
            const dht::partition_range_vector& ranges) const;

+    sstables::shared_sstable make_streaming_sstable_for_write(std::optional<sstring> subdir = {});
+    sstables::shared_sstable make_streaming_staging_sstable() {
+        return make_streaming_sstable_for_write("staging"); // same as make_streaming_sstable_for_write, with subdir fixed to "staging"
+    }
+
    mutation_source as_mutation_source() const;
+    mutation_source as_mutation_source_excluding(sstables::shared_sstable sst) const;

    void set_virtual_reader(mutation_source virtual_reader) {
        _virtual_reader = std::move(virtual_reader);
@@ -684,7 +720,7 @@ public:
        query::result_memory_limiter& memory_limiter,
        uint64_t max_result_size,
        db::timeout_clock::time_point timeout = db::no_timeout,
-        querier_cache_context cache_ctx = { });
+        query::querier_cache_context cache_ctx = { });

    void start();
    future<> stop();
@@ -702,13 +738,7 @@ public:

    // SSTable writes are now allowed again, and generation is updated to new_generation if != -1
    // returns the amount of microseconds elapsed since we disabled writes.
-    std::chrono::steady_clock::duration enable_sstable_write(int64_t new_generation) {
-        if (new_generation != -1) {
-            update_sstables_known_generation(new_generation);
-        }
-        _sstables_lock.write_unlock();
-        return std::chrono::steady_clock::now() - _sstable_writes_disabled_at;
-    }
+    std::chrono::steady_clock::duration enable_sstable_write(int64_t new_generation);

    // Make sure the generation numbers are sequential, starting from "start".
    // Generations before "start" are left untouched.
@@ -838,6 +868,8 @@ public:
    void clear_views();
    const std::vector<view_ptr>& views() const;
    future<row_locker::lock_holder> push_view_replica_updates(const schema_ptr& s, const frozen_mutation& fm, db::timeout_clock::time_point timeout) const;
+    future<row_locker::lock_holder> push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout) const;
+    future<row_locker::lock_holder> stream_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, sstables::shared_sstable excluded_sstable) const;
    void add_coordinator_read_latency(utils::estimated_histogram::duration latency);
    std::chrono::milliseconds get_coordinator_read_latency_percentile(double percentile);

@@ -855,13 +887,17 @@ public:
            dht::token base_token,
            flat_mutation_reader&&);

+    reader_concurrency_semaphore& read_concurrency_semaphore() {
+        return *_config.read_concurrency_semaphore; // dereferenced without a null check; the config pointer must be set
+    }
+
 private:
+    future<row_locker::lock_holder> do_push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, mutation_source&& source, const io_priority_class& io_priority) const;
    std::vector<view_ptr> affected_views(const schema_ptr& base, const mutation& update) const;
    future<> generate_and_propagate_view_updates(const schema_ptr& base,
            std::vector<view_ptr>&& views,
            mutation&& m,
-            flat_mutation_reader_opt existings,
-            db::timeout_clock::time_point timeout) const;
+            flat_mutation_reader_opt existings) const;

    mutable row_locker _row_locker;
    future<row_locker::lock_holder> local_base_lock(
@@ -1030,6 +1066,7 @@ public:
 class keyspace {
 public:
    struct config {
+        std::vector<sstring> all_datadirs;
        sstring datadir;
        bool enable_commitlog = true;
        bool enable_disk_reads = true;
@@ -1045,10 +1082,12 @@ public:
        seastar::scheduling_group memtable_scheduling_group;
        seastar::scheduling_group memtable_to_cache_scheduling_group;
        seastar::scheduling_group compaction_scheduling_group;
+        seastar::scheduling_group memory_compaction_scheduling_group;
        seastar::scheduling_group statement_scheduling_group;
        seastar::scheduling_group streaming_scheduling_group;
        bool enable_metrics_reporting = false;
        db::timeout_semaphore* view_update_concurrency_semaphore = nullptr;
+        size_t view_update_concurrency_semaphore_limit;
    };
 private:
    std::unique_ptr<locator::abstract_replication_strategy> _replication_strategy;
@@ -1106,6 +1145,7 @@ public:
        return _config.datadir;
    }

+    sstring column_family_directory(const sstring& base_path, const sstring& name, utils::UUID uuid) const;
    sstring column_family_directory(const sstring& name, utils::UUID uuid) const;
 };

@@ -1125,6 +1165,7 @@ struct database_config {
    seastar::scheduling_group memtable_scheduling_group;
    seastar::scheduling_group memtable_to_cache_scheduling_group; // FIXME: merge with memtable_scheduling_group
    seastar::scheduling_group compaction_scheduling_group;
+    seastar::scheduling_group memory_compaction_scheduling_group;
    seastar::scheduling_group statement_scheduling_group;
    seastar::scheduling_group streaming_scheduling_group;
    size_t available_memory;
@@ -1148,6 +1189,7 @@ private:
    static const size_t max_count_system_concurrent_reads{10};
    size_t max_memory_system_concurrent_reads() { return _dbcfg.available_memory * 0.02; };
    static constexpr size_t max_concurrent_sstable_loads() { return 3; }
+    size_t max_memory_pending_view_updates() const { return _dbcfg.available_memory * 0.1; } // 10% of available memory; the floating-point product is truncated to size_t

    struct db_stats {
        uint64_t total_writes = 0;
@@ -1159,6 +1201,11 @@ private:

        uint64_t short_data_queries = 0;
        uint64_t short_mutation_queries = 0;
+
+        uint64_t multishard_query_unpopped_fragments = 0;
+        uint64_t multishard_query_unpopped_bytes = 0;
+        uint64_t multishard_query_failed_reader_stops = 0;
+        uint64_t multishard_query_failed_reader_saves = 0;
    };

    lw_shared_ptr<db_stats> _stats;
@@ -1179,11 +1226,11 @@ private:

    semaphore _sstable_load_concurrency_sem{max_concurrent_sstable_loads()};

-    db::timeout_semaphore _view_update_concurrency_sem{100}; // Stand-in hack for #2538
+    db::timeout_semaphore _view_update_concurrency_sem{max_memory_pending_view_updates()};

    cache_tracker _row_cache_tracker;

-    concrete_execution_stage<future<lw_shared_ptr<query::result>>,
+    inheriting_concrete_execution_stage<future<lw_shared_ptr<query::result>>,
        column_family*,
        schema_ptr,
        const query::read_command&,
@@ -1193,10 +1240,17 @@ private:
        query::result_memory_limiter&,
        uint64_t,
        db::timeout_clock::time_point,
-        querier_cache_context> _data_query_stage;
+        query::querier_cache_context> _data_query_stage;

    mutation_query_stage _mutation_query_stage;

+    inheriting_concrete_execution_stage<
+            future<>,
+            database*,
+            schema_ptr,
+            const frozen_mutation&,
+            db::timeout_clock::time_point> _apply_stage;
+
    std::unordered_map<sstring, keyspace> _keyspaces;
    std::unordered_map<utils::UUID, lw_shared_ptr<column_family>> _column_families;
    std::unordered_map<std::pair<sstring, sstring>, utils::UUID, utils::tuple_hash> _ks_cf_to_uuid;
@@ -1207,7 +1261,7 @@ private:
    seastar::metrics::metric_groups _metrics;
    bool _enable_incremental_backups = false;

-    querier_cache _querier_cache;
+    query::querier_cache _querier_cache;

    std::unique_ptr<db::large_partition_handler> _large_partition_handler;

@@ -1379,6 +1433,12 @@ public:
    std::unordered_set<sstring> get_initial_tokens();
    std::experimental::optional<gms::inet_address> get_replace_address();
    bool is_replacing();
+    reader_concurrency_semaphore& user_read_concurrency_sem() { // accessor for _read_concurrency_sem
+        return _read_concurrency_sem;
+    }
+    reader_concurrency_semaphore& streaming_read_concurrency_sem() { // accessor for _streaming_concurrency_sem
+        return _streaming_concurrency_sem;
+    }
    reader_concurrency_semaphore& system_keyspace_read_concurrency_sem() {
        return _system_read_concurrency_sem;
    }
@@ -1395,15 +1455,25 @@ public:
        _querier_cache.set_entry_ttl(entry_ttl);
    }

-    const querier_cache::stats& get_querier_cache_stats() const {
+    const query::querier_cache::stats& get_querier_cache_stats() const {
        return _querier_cache.get_stats();
    }

+    query::querier_cache& get_querier_cache() { // mutable access to the database's querier cache
+        return _querier_cache;
+    }
+
+    db::view::update_backlog get_view_update_backlog() const { // NOTE(review): presumably {units in use, maximum} - confirm against update_backlog's field order
+        return {max_memory_pending_view_updates() - _view_update_concurrency_sem.current(), max_memory_pending_view_updates()}; // first: the budget minus the semaphore's current count; second: the full budget
+    }
+
    friend class distributed_loader;
 };

 future<> update_schema_version_and_announce(distributed<service::storage_proxy>& proxy);

+bool is_internal_keyspace(const sstring& name);
+
 class distributed_loader {
 public:
    static void reshard(distributed<database>& db, sstring ks_name, sstring cf_name);
--- a/db/batchlog_manager.cc
+++ b/db/batchlog_manager.cc
@@ -76,8 +76,7 @@ const uint32_t db::batchlog_manager::replay_interval;
 const uint32_t db::batchlog_manager::page_size;

 db::batchlog_manager::batchlog_manager(cql3::query_processor& qp)
-        : _qp(qp)
-        , _e1(_rd()) {
+        : _qp(qp) {
    namespace sm = seastar::metrics;

    _metrics.add_group("batchlog_manager", {
@@ -117,7 +116,7 @@ future<> db::batchlog_manager::start() {
    // round-robin scheduling.
    if (engine().cpu_id() == 0) {
        _timer.set_callback([this] {
-            return do_batch_log_replay().handle_exception([] (auto ep) {
+            do_batch_log_replay().handle_exception([] (auto ep) {
                blogger.error("Exception in batch replay: {}", ep);
            }).finally([this] {
                _timer.arm(lowres_clock::now() + std::chrono::milliseconds(replay_interval));
@@ -268,7 +267,7 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
                // send to partially or wholly fail in actually sending stuff. Since we don't
                // have hints (yet), send with CL=ALL, and hope we can re-do this soon.
                // See below, we use retry on write failure.
-                return _qp.proxy().mutate(mutations, db::consistency_level::ALL, nullptr);
+                return _qp.proxy().mutate(mutations, db::consistency_level::ALL, db::no_timeout, nullptr);
            });
        }).then_wrapped([this, id](future<> batch_result) {
            try {
@@ -396,10 +395,8 @@ std::unordered_set<gms::inet_address> db::batchlog_manager::endpoint_filter(cons

    // grab a random member of up to two racks
    for (auto& rack : racks) {
-        auto rack_members = validated.bucket(rack);
-        auto n = validated.bucket_size(rack_members);
        auto cpy = boost::copy_range<std::vector<gms::inet_address>>(validated.equal_range(rack) | boost::adaptors::map_values);
-        std::uniform_int_distribution<size_t> rdist(0, n - 1);
+        std::uniform_int_distribution<size_t> rdist(0, cpy.size() - 1);
        result.emplace(cpy[rdist(_e1)]);
    }

--- a/db/batchlog_manager.hh
+++ b/db/batchlog_manager.hh
@@ -75,8 +75,7 @@ private:
    unsigned _cpu = 0;
    bool _stop = false;

-    std::random_device _rd;
-    std::default_random_engine _e1;
+    std::default_random_engine _e1{std::random_device{}()};

    future<> replay_all_failed_batches();
 public:
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -107,6 +107,11 @@ public:
    void process_bytes(const char* data, size_t size) {
        return _c.process(reinterpret_cast<const uint8_t*>(data), size);
    }
+    template<typename FragmentedBuffer>
+    GCC6_CONCEPT(requires FragmentRange<FragmentedBuffer>)
+    void process_fragmented(const FragmentedBuffer& buffer) {
+        return _c.process_fragmented(buffer);
+    }
 };

 class db::cf_holder {
@@ -308,10 +313,9 @@ public:
    uint64_t get_num_dirty_segments() const;
    uint64_t get_num_active_segments() const;

-    using buffer_type = temporary_buffer<char>;
+    using buffer_type = fragmented_temporary_buffer;

    buffer_type acquire_buffer(size_t s);
-    void release_buffer(buffer_type&&);

    future<std::vector<descriptor>> list_descriptors(sstring dir);

@@ -333,7 +337,6 @@ private:
    segment_id_type _ids = 0;
    std::vector<sseg_ptr> _segments;
    queue<sseg_ptr> _reserve_segments;
-    std::vector<buffer_type> _temp_buffers;
    std::unordered_map<flush_handler_id, flush_handler> _flush_handlers;
    flush_handler_id _flush_ids = 0;
    replay_position _flush_position;
@@ -344,6 +347,12 @@ private:
    uint64_t _new_counter = 0;
 };

+template<typename T, typename Output>
+static void write(Output& out, T value) {
+    auto v = net::hton(value);
+    out.write(reinterpret_cast<const char*>(&v), sizeof(v));
+}
+
 /*
 * A single commit log file on disk. Manages creation of the file and writing mutations to disk,
 * as well as tracking the last mutation position of any "dirty" CFs covered by the segment file. Segment
@@ -398,7 +407,6 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c

    uint64_t _file_pos = 0;
    uint64_t _flush_pos = 0;
-    uint64_t _buf_pos = 0;
    bool _closed = false;

    using buffer_type = segment_manager::buffer_type;
@@ -407,6 +415,7 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
    using time_point = segment_manager::time_point;

    buffer_type _buffer;
+    fragmented_temporary_buffer::ostream _buffer_ostream;
    std::unordered_map<cf_id_type, uint64_t> _cf_dirty;
    time_point _sync_time;
    seastar::gate _gate;
@@ -420,6 +429,10 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
    friend std::ostream& operator<<(std::ostream&, const segment&);
    friend class segment_manager;

+    size_t buffer_position() const {
+        return _buffer.size_bytes() - _buffer_ostream.size();
+    }
+
    future<> begin_flush() {
        // This is maintaining the semantica of only using the write-lock
        // as a gate for flushing, i.e. once we've begun a flush for position X
@@ -466,7 +479,7 @@ public:
            clogger.debug("Segment {} is no longer active and will submitted for delete now", *this);
            ++_segment_manager->totals.segments_destroyed;
            _segment_manager->totals.total_size_on_disk -= size_on_disk();
-            _segment_manager->totals.total_size -= (size_on_disk() + _buffer.size());
+            _segment_manager->totals.total_size -= (size_on_disk() + _buffer.size_bytes());
            _segment_manager->add_file_to_delete(_file_name, _desc);
        } else {
            clogger.warn("Segment {} is dirty and is left on disk.", *this);
@@ -607,29 +620,16 @@ public:
        auto a = align_up(s + overhead, alignment);
        auto k = std::max(a, default_size);

-        for (;;) {
-            try {
-                _buffer = _segment_manager->acquire_buffer(k);
-                break;
-            } catch (std::bad_alloc&) {
-                clogger.warn("Could not allocate {} k bytes output buffer ({} k required)", k / 1024, a / 1024);
-                if (k > a) {
-                    k = std::max(a, k / 2);
-                    clogger.debug("Trying reduced size: {} k", k / 1024);
-                    continue;
-                }
-                throw;
-            }
-        }
-        _buf_pos = overhead;
-        auto * p = reinterpret_cast<uint32_t *>(_buffer.get_write());
-        std::fill(p, p + overhead, 0);
+        _buffer = _segment_manager->acquire_buffer(k);
+        _buffer_ostream = _buffer.get_ostream();
+        auto out = _buffer_ostream.write_substream(overhead);
+        out.fill('\0', overhead);
        _segment_manager->totals.total_size += k;
    }

    bool buffer_is_empty() const {
-        return _buf_pos <= segment_overhead_size
-                        || (_file_pos == 0 && _buf_pos <= (segment_overhead_size + descriptor_header_size));
+        return buffer_position() <= segment_overhead_size
+                        || (_file_pos == 0 && buffer_position() <= (segment_overhead_size + descriptor_header_size));
    }
    /**
     * Send any buffer contents to disk and get a new tmp buffer
@@ -641,35 +641,32 @@ public:
        }

        auto size = clear_buffer_slack();
-        auto buf = std::move(_buffer);
+        auto buf = std::exchange(_buffer, { });
        auto off = _file_pos;
        auto top = off + size;
        auto num = _num_allocs;

        _file_pos = top;
-        _buf_pos = 0;
+        _buffer_ostream = { };
        _num_allocs = 0;

        auto me = shared_from_this();
        assert(me.use_count() > 1);

-        auto * p = buf.get_write();
-        assert(std::count(p, p + 2 * sizeof(uint32_t), 0) == 2 * sizeof(uint32_t));
-
-        data_output out(p, p + buf.size());
+        auto out = buf.get_ostream();

        auto header_size = 0;

        if (off == 0) {
            // first block. write file header.
-            out.write(segment_magic);
-            out.write(_desc.ver);
-            out.write(_desc.id);
+            write(out, segment_magic);
+            write(out, _desc.ver);
+            write(out, _desc.id);
            crc32_nbo crc;
            crc.process(_desc.ver);
            crc.process<int32_t>(_desc.id & 0xffffffff);
            crc.process<int32_t>(_desc.id >> 32);
-            out.write(crc.checksum());
+            write(out, crc.checksum());
            header_size = descriptor_header_size;
        }

@@ -679,8 +676,8 @@ public:
        crc.process<int32_t>(_desc.id >> 32);
        crc.process(uint32_t(off + header_size));

-        out.write(uint32_t(_file_pos));
-        out.write(crc.checksum());
+        write(out, uint32_t(_file_pos));
+        write(out, crc.checksum());

        forget_schema_versions();

@@ -690,25 +687,32 @@ public:

        // The write will be allowed to start now, but flush (below) must wait for not only this,
        // but all previous write/flush pairs.
-        return _pending_ops.run_with_ordered_post_op(rp, [this, size, off, buf = std::move(buf)]() mutable {
-                auto written = make_lw_shared<size_t>(0);
-                auto p = buf.get();
-                return repeat([this, size, off, written, p]() mutable {
+        return _pending_ops.run_with_ordered_post_op(rp, [this, size, off, buf = std::move(buf)]() mutable { ///////////////////////////////////////////////////
+            auto view = fragmented_temporary_buffer::view(buf);
+            view.remove_suffix(buf.size_bytes() - size);
+            assert(size == view.size_bytes());
+            return do_with(off, view, [&] (uint64_t& off, fragmented_temporary_buffer::view& view) {
+                if (view.empty()) {
+                    return make_ready_future<>();
+                }
+                return repeat([this, size, &off, &view] {
                    auto&& priority_class = service::get_local_commitlog_priority();
-                    return _file.dma_write(off + *written, p + *written, size - *written, priority_class).then_wrapped([this, size, written](future<size_t>&& f) {
+                    auto current = *view.begin();
+                    return _file.dma_write(off, current.data(), current.size(), priority_class).then_wrapped([this, size, &off, &view](future<size_t>&& f) {
                        try {
                            auto bytes = std::get<0>(f.get());
-                            *written += bytes;
                            _segment_manager->totals.bytes_written += bytes;
                            _segment_manager->totals.total_size_on_disk += bytes;
                            ++_segment_manager->totals.cycle_count;
-                            if (*written == size) {
+                            if (bytes == view.size_bytes()) {
                                return make_ready_future<stop_iteration>(stop_iteration::yes);
                            }
                            // gah, partial write. should always get here with dma chunk sized
                            // "bytes", but lets make sure...
-                            clogger.debug("Partial write {}: {}/{} bytes", *this, *written, size);
-                            *written = align_down(*written, alignment);
+                            bytes = align_down(bytes, alignment);
+                            off += bytes;
+                            view.remove_prefix(bytes);
+                            clogger.debug("Partial write {}: {}/{} bytes", *this, size - view.size_bytes(), size);
                            return make_ready_future<stop_iteration>(stop_iteration::no);
                            // TODO: retry/ignore/fail/stop - optional behaviour in origin.
                            // we fast-fail the whole commit.
@@ -717,10 +721,10 @@ public:
                            throw;
                        }
                    });
-                }).finally([this, buf = std::move(buf), size]() mutable {
-                    _segment_manager->release_buffer(std::move(buf));
-                    _segment_manager->notify_memory_written(size);
                });
+            }).finally([this, buf = std::move(buf), size] {
+                    _segment_manager->notify_memory_written(size);
+            });
        }, [me, flush_after, top, rp] { // lambda instead of bind, so we keep "me" alive.
            assert(me->_pending_ops.has_operation(rp));
            return flush_after ? me->do_flush(top) : make_ready_future<sseg_ptr>(me);
@@ -786,7 +790,7 @@ public:
            return finish_and_get_new(timeout).then([id, writer = std::move(writer), permit = std::move(permit), timeout] (auto new_seg) mutable {
                return new_seg->allocate(id, std::move(writer), std::move(permit), timeout);
            });
-        } else if (!_buffer.empty() && (s > (_buffer.size() - _buf_pos))) {  // enough data?
+        } else if (!_buffer.empty() && (s > _buffer_ostream.size())) {  // enough data?
            if (_segment_manager->cfg.mode == sync_mode::BATCH) {
                // TODO: this could cause starvation if we're really unlucky.
                // If we run batch mode and find ourselves not fit in a non-empty
@@ -805,7 +809,7 @@ public:
        size_t buf_memory = s;
        if (_buffer.empty()) {
            new_buffer(s);
-            buf_memory += _buf_pos;
+            buf_memory += buffer_position();
        }

        _gate.enter(); // this might throw. I guess we accept this?
@@ -813,29 +817,26 @@ public:
        _segment_manager->account_memory_usage(buf_memory);

        replay_position rp(_desc.id, position());
-        auto pos = _buf_pos;
-        _buf_pos += s;
        _cf_dirty[id]++; // increase use count for cf.

        rp_handle h(static_pointer_cast<cf_holder>(shared_from_this()), std::move(id), rp);

-        auto * p = _buffer.get_write() + pos;
-        auto * e = _buffer.get_write() + pos + s - sizeof(uint32_t);
-
-        data_output out(p, e);
+        auto out = _buffer_ostream.write_substream(s);
        crc32_nbo crc;

-        out.write(uint32_t(s));
+        write<uint32_t>(out, s);
        crc.process(uint32_t(s));
-        out.write(crc.checksum());
+        write<uint32_t>(out, crc.checksum());

        // actual data
-        writer->write(*this, out);
+        auto entry_out = out.write_substream(size);
+        auto entry_data = entry_out.to_input_stream();
+        writer->write(*this, entry_out);
+        entry_data.with_stream([&] (auto data_str) {
+            crc.process_fragmented(ser::buffer_view<typename std::vector<temporary_buffer<char>>::iterator>(data_str));
+        });

-        crc.process_bytes(p + 2 * sizeof(uint32_t), size);
-
-        out = data_output(e, sizeof(uint32_t));
-        out.write(crc.checksum());
+        write<uint32_t>(out, crc.checksum());

        ++_segment_manager->totals.allocation_count;
        ++_num_allocs;
@@ -850,7 +851,7 @@ public:
            // If this buffer alone is too big, potentially bigger than the maximum allowed size,
            // then no other request will be allowed in to force the cycle()ing of this buffer. We
            // have to do it ourselves.
-            if ((_buf_pos >= (db::commitlog::segment::default_size))) {
+            if ((buffer_position() >= (db::commitlog::segment::default_size))) {
                cycle().discard_result().handle_exception([] (auto ex) {
                    clogger.error("Failed to flush commits to disk: {}", ex);
                });
@@ -860,7 +861,7 @@ public:
    }

    position_type position() const {
-        return position_type(_file_pos + _buf_pos);
+        return position_type(_file_pos + buffer_position());
    }

    size_t size_on_disk() const {
@@ -870,11 +871,12 @@ public:
    // ensures no more of this segment is writeable, by allocating any unused section at the end and marking it discarded
    // a.k.a. zero the tail.
    size_t clear_buffer_slack() {
-        auto size = align_up(_buf_pos, alignment);
-        std::fill(_buffer.get_write() + _buf_pos, _buffer.get_write() + size,
-                0);
-        _segment_manager->totals.bytes_slack += (size - _buf_pos);
-        _segment_manager->account_memory_usage(size - _buf_pos);
+        auto buf_pos = buffer_position();
+        auto size = align_up(buf_pos, alignment);
+        auto fill_size = size - buf_pos;
+        _buffer_ostream.fill('\0', fill_size);
+        _segment_manager->totals.bytes_slack += fill_size;
+        _segment_manager->account_memory_usage(fill_size);
        return size;
    }
    void mark_clean(const cf_id_type& id, uint64_t count) {
@@ -1187,6 +1189,34 @@ void db::commitlog::segment_manager::flush_segments(bool force) {
    }
 }

+/// \brief Helper for ensuring a file is closed if an exception is thrown.
+///
+/// The file provided by the file_fut future is passed to func.
+/// * If func throws an exception E, the file is closed and we return
+///   a failed future with E.
+/// * If func returns a value V, the file is not closed and we return
+///   a future with V.
+/// Note that when an exception is not thrown, it is the
+/// responsibility of func to make sure the file will be closed. It
+/// can close the file itself, return it, or store it somewhere.
+///
+/// \tparam Func The type of function this wraps
+/// \param file_fut A future that produces a file
+/// \param func A function that uses a file
+/// \return A future that passes the file produced by file_fut to func
+///         and closes it if func fails
+template <typename Func>
+static auto close_on_failure(future<file> file_fut, Func func) {
+    return file_fut.then([func = std::move(func)](file f) {
+        return futurize_apply(func, f).handle_exception([f] (std::exception_ptr e) mutable {
+            return f.close().then_wrapped([f, e = std::move(e)] (future<> x) {
+                using futurator = futurize<std::result_of_t<Func(file)>>;
+                return futurator::make_exception_future(e);
+            });
+        });
+    });
+}
+
 future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::allocate_segment(bool active) {
    static const auto flags = open_flags::wo | open_flags::create;

@@ -1217,7 +1247,7 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
        return fut;
    });

-    return fut.then([this, d, active, filename](file f) {
+    return close_on_failure(std::move(fut), [this, d, active, filename] (file f) {
        f = make_checked_file(commit_error_handler, f);
        // xfs doesn't like files extended betond eof, so enlarge the file
        return f.truncate(max_size).then([this, d, active, f, filename] () mutable {
@@ -1514,41 +1544,20 @@ uint64_t db::commitlog::segment_manager::get_num_active_segments() const {


 db::commitlog::segment_manager::buffer_type db::commitlog::segment_manager::acquire_buffer(size_t s) {
-    auto i = _temp_buffers.begin();
-    auto e = _temp_buffers.end();
+    s = align_up(s, segment::default_size);
+    auto fragment_count = s / segment::default_size;

-    while (i != e) {
-        if (i->size() >= s) {
-            auto r = std::move(*i);
-            _temp_buffers.erase(i);
-            totals.buffer_list_bytes -= r.size();
-            return r;
+    std::vector<temporary_buffer<char>> buffers;
+    buffers.reserve(fragment_count);
+    while (buffers.size() < fragment_count) {
+        auto a = ::memalign(segment::alignment, segment::default_size);
+        if (a == nullptr) {
+            throw std::bad_alloc();
        }
-        ++i;
-    }
-    auto a = ::memalign(segment::alignment, s);
-    if (a == nullptr) {
-        throw std::bad_alloc();
+        buffers.emplace_back(static_cast<char*>(a), segment::default_size, make_free_deleter(a));
    }
    clogger.trace("Allocated {} k buffer", s / 1024);
-    return buffer_type(reinterpret_cast<char *>(a), s, make_free_deleter(a));
-}
-
-void db::commitlog::segment_manager::release_buffer(buffer_type&& b) {
-    _temp_buffers.emplace_back(std::move(b));
-    std::sort(_temp_buffers.begin(), _temp_buffers.end(), [](const buffer_type& b1, const buffer_type& b2) {
-        return b1.size() < b2.size();
-    });
-
-    constexpr const size_t max_temp_buffers = 4;
-
-    if (_temp_buffers.size() > max_temp_buffers) {
-        clogger.trace("Deleting {} buffers", _temp_buffers.size() - max_temp_buffers);
-        _temp_buffers.erase(_temp_buffers.begin() + max_temp_buffers, _temp_buffers.end());
-    }
-    totals.buffer_list_bytes = boost::accumulate(
-	    _temp_buffers | boost::adaptors::transformed(std::mem_fn(&buffer_type::size)),
-            size_t(0), std::plus<size_t>());
+    return fragmented_temporary_buffer(std::move(buffers), s);
 }

 /**
@@ -1694,14 +1703,14 @@ const db::commitlog::config& db::commitlog::active_config() const {
 // No commit_io_check needed in the log reader since the database will fail
 // on error at startup if required
 future<std::unique_ptr<subscription<temporary_buffer<char>, db::replay_position>>>
-db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func next, position_type off, const db::extensions* exts) {
+db::commitlog::read_log_file(const sstring& filename, seastar::io_priority_class read_io_prio_class, commit_load_reader_func next, position_type off, const db::extensions* exts) {
    struct work {
    private:
-        file_input_stream_options make_file_input_stream_options() {
+        file_input_stream_options make_file_input_stream_options(seastar::io_priority_class read_io_prio_class) {
            file_input_stream_options fo;
            fo.buffer_size = db::commitlog::segment::default_size;
            fo.read_ahead = 10;
-            fo.io_priority_class = service::get_local_commitlog_priority();
+            fo.io_priority_class = read_io_prio_class;
            return fo;
        }
    public:
@@ -1720,8 +1729,8 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne
        bool header = true;
        bool failed = false;

-        work(file f, position_type o = 0)
-                : f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options())), start_off(o) {
+        work(file f, seastar::io_priority_class read_io_prio_class, position_type o = 0)
+                : f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options(read_io_prio_class))), start_off(o) {
        }
        work(work&&) = default;

@@ -1776,7 +1785,7 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne
                }

                if (magic != segment::segment_magic) {
-                    throw std::invalid_argument("Not a scylla format commitlog file");
+                    throw invalid_segment_format();
                }
                crc32_nbo crc;
                crc.process(ver);
@@ -1785,7 +1794,7 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne

                auto cs = crc.checksum();
                if (cs != checksum) {
-                    throw std::runtime_error("Checksum error in file header");
+                    throw header_checksum_error();
                }

                this->id = id;
@@ -1939,9 +1948,9 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne
        return fut;
    });

-    return fut.then([off, next](file f) {
+    return fut.then([off, next, read_io_prio_class] (file f) {
        f = make_checked_file(commit_error_handler, std::move(f));
-        auto w = make_lw_shared<work>(std::move(f), off);
+        auto w = make_lw_shared<work>(std::move(f), read_io_prio_class, off);
        auto ret = w->s.listen(next);

        w->s.started().then(std::bind(&work::read_file, w.get())).then([w] {
--- a/db/commitlog/commitlog.hh
+++ b/db/commitlog/commitlog.hh
@@ -42,7 +42,6 @@

 #include <memory>

-#include "utils/data_output.hh"
 #include "core/future.hh"
 #include "core/shared_ptr.hh"
 #include "core/stream.hh"
@@ -176,7 +175,7 @@ public:
     * of data to be written. (See add).
     * Don't write less, absolutely don't write more...
     */
-    using output = data_output;
+    using output = fragmented_temporary_buffer::ostream;
    using serializer_func = std::function<void(output&)>;

    /**
@@ -343,20 +342,42 @@ public:

    typedef std::function<future<>(temporary_buffer<char>, replay_position)> commit_load_reader_func;

-    class segment_data_corruption_error: public std::runtime_error {
+    class segment_error : public std::exception {};
+
+    class segment_data_corruption_error: public segment_error {
+        std::string _msg;
    public:
        segment_data_corruption_error(std::string msg, uint64_t s)
-                : std::runtime_error(msg), _bytes(s) {
+                : _msg(std::move(msg)), _bytes(s) {
        }
        uint64_t bytes() const {
            return _bytes;
        }
+        virtual const char* what() const noexcept {
+            return _msg.c_str();
+        }
    private:
        uint64_t _bytes;
    };

+    class invalid_segment_format : public segment_error {
+        static constexpr const char* _msg = "Not a scylla format commitlog file";
+    public:
+        virtual const char* what() const noexcept {
+            return _msg;
+        }
+    };
+
+    class header_checksum_error : public segment_error {
+        static constexpr const char* _msg = "Checksum error in file header";
+    public:
+        virtual const char* what() const noexcept {
+            return _msg;
+        }
+    };
+
    static future<std::unique_ptr<subscription<temporary_buffer<char>, replay_position>>> read_log_file(
-            const sstring&, commit_load_reader_func, position_type = 0, const db::extensions* = nullptr);
+            const sstring&, seastar::io_priority_class read_io_prio_class, commit_load_reader_func, position_type = 0, const db::extensions* = nullptr);
 private:
    commitlog(config);

--- a/db/commitlog/commitlog_entry.cc
+++ b/db/commitlog/commitlog_entry.cc
@@ -51,9 +51,8 @@ void commitlog_entry_writer::compute_size() {
    _size = ms.size();
 }

-void commitlog_entry_writer::write(data_output& out) const {
-    seastar::simple_output_stream str(out.reserve(size()), size());
-    serialize(str);
+void commitlog_entry_writer::write(typename seastar::memory_output_stream<std::vector<temporary_buffer<char>>::iterator>& out) const {
+    serialize(out);
 }

 commitlog_entry_reader::commitlog_entry_reader(const temporary_buffer<char>& buffer)
--- a/db/commitlog/commitlog_entry.hh
+++ b/db/commitlog/commitlog_entry.hh
@@ -25,7 +25,6 @@

 #include "frozen_mutation.hh"
 #include "schema.hh"
-#include "utils/data_output.hh"
 #include "stdx.hh"

 class commitlog_entry {
@@ -35,7 +34,8 @@ public:
    commitlog_entry(stdx::optional<column_mapping> mapping, frozen_mutation&& mutation)
        : _mapping(std::move(mapping)), _mutation(std::move(mutation)) { }
    const stdx::optional<column_mapping>& mapping() const { return _mapping; }
-    const frozen_mutation& mutation() const { return _mutation; }
+    const frozen_mutation& mutation() const & { return _mutation; }
+    frozen_mutation&& mutation() && { return std::move(_mutation); }
 };

 class commitlog_entry_writer {
@@ -72,7 +72,7 @@ public:
        return _mutation.representation().size();
    }

-    void write(data_output& out) const;
+    void write(typename seastar::memory_output_stream<std::vector<temporary_buffer<char>>::iterator>& out) const;
 };

 class commitlog_entry_reader {
@@ -81,5 +81,6 @@ public:
    commitlog_entry_reader(const temporary_buffer<char>& buffer);

    const stdx::optional<column_mapping>& get_column_mapping() const { return _ce.mapping(); }
-    const frozen_mutation& mutation() const { return _ce.mutation(); }
+    const frozen_mutation& mutation() const & { return _ce.mutation(); }
+    frozen_mutation&& mutation() && { return std::move(_ce).mutation(); }
 };
--- a/db/commitlog/commitlog_replayer.cc
+++ b/db/commitlog/commitlog_replayer.cc
@@ -58,6 +58,7 @@
 #include "converting_mutation_partition_applier.hh"
 #include "schema_registry.hh"
 #include "commitlog_entry.hh"
+#include "service/priority_manager.hh"

 static logging::logger rlogger("commitlog_replayer");

@@ -223,7 +224,7 @@ db::commitlog_replayer::impl::recover(sstring file, const sstring& fname_prefix)
    auto s = make_lw_shared<stats>();
    auto& exts = _qp.local().db().local().get_config().extensions();

-    return db::commitlog::read_log_file(file,
+    return db::commitlog::read_log_file(file, service::get_local_commitlog_priority(),
            std::bind(&impl::process, this, s.get(), std::placeholders::_1,
                    std::placeholders::_2), p, &exts).then([](auto s) {
        auto f = s->done();
--- a/db/config.cc
+++ b/db/config.cc
@@ -102,6 +102,8 @@ db::config::config()
 db::config::~config()
 {}

+const sstring db::config::default_tls_priority("SECURE128:-VERS-TLS1.0");
+
 namespace utils {

 template<>
--- a/db/config.hh
+++ b/db/config.hh
@@ -155,6 +155,9 @@ public:
    val(hints_directory, sstring, "/var/lib/scylla/hints", Used,   \
            "The directory where hints files are stored if hinted handoff is enabled."   \
    )                                           \
+    val(view_hints_directory, sstring, "/var/lib/scylla/view_hints", Used,   \
+            "The directory where materialized-view updates are stored while a view replica is unreachable."   \
+    )                                           \
    val(saved_caches_directory, sstring, "/var/lib/scylla/saved_caches", Unused, \
            "The directory location where table key and row caches are stored."  \
    )                                                   \
@@ -453,7 +456,7 @@ public:
            "The maximum number of tombstones a query can scan before aborting."  \
    )   \
    /* Network timeout settings */  \
-    val(range_request_timeout_in_ms, uint32_t, 10000, Unused,     \
+    val(range_request_timeout_in_ms, uint32_t, 10000, Used,     \
            "The time in milliseconds that the coordinator waits for sequential or index scans to complete."  \
    )   \
    val(read_request_timeout_in_ms, uint32_t, 5000, Used,     \
@@ -472,7 +475,7 @@ public:
            "The time in milliseconds that the coordinator waits for write operations to complete.\n"  \
            "Related information: About hinted handoff writes"  \
    )   \
-    val(request_timeout_in_ms, uint32_t, 10000, Unused,     \
+    val(request_timeout_in_ms, uint32_t, 10000, Used,     \
            "The default timeout for other, miscellaneous operations.\n"  \
            "Related information: About hinted handoff writes"  \
    )   \
@@ -578,8 +581,8 @@ public:
    val(dynamic_snitch_update_interval_in_ms, uint32_t, 100, Unused,     \
            "The time interval for how often the snitch calculates node scores. Because score calculation is CPU intensive, be careful when reducing this interval."  \
    )   \
-    val(hinted_handoff_enabled, sstring, "false", Used,     \
-            "Experimental: enable or disable hinted handoff. To enable per data center, add data center list. For example: hinted_handoff_enabled: DC1,DC2. A hint indicates that the write needs to be replayed to an unavailable node. " \
+    val(hinted_handoff_enabled, sstring, "true", Used,     \
+            "Enable or disable hinted handoff. To enable per data center, add data center list. For example: hinted_handoff_enabled: DC1,DC2. A hint indicates that the write needs to be replayed to an unavailable node. " \
            "Related information: About hinted handoff writes"  \
    )   \
    val(hinted_handoff_throttle_in_kb, uint32_t, 1024, Unused,     \
@@ -621,7 +624,7 @@ public:
    val(thrift_framed_transport_size_in_mb, uint32_t, 15, Unused,     \
            "Frame size (maximum field length) for Thrift. The frame is the row or part of the row the application is inserting."  \
    )   \
-    val(thrift_max_message_length_in_mb, uint32_t, 16, Unused,     \
+    val(thrift_max_message_length_in_mb, uint32_t, 16, Used,     \
            "The maximum length of a Thrift message in megabytes, including all fields and internal Thrift overhead (1 byte of overhead for each frame). Message length is usually used in conjunction with batches. A frame length greater than or equal to 24 accommodates a batch with four inserts, each of which is 24 bytes. The required message length is greater than or equal to 24+24+24+24+4 (number of frames)."  \
    )   \
    /* Security properties */   \
@@ -728,7 +731,7 @@ public:
    val(prometheus_address, sstring, "0.0.0.0", Used, "Prometheus listening address") \
    val(prometheus_prefix, sstring, "scylla", Used, "Set the prefix of the exported Prometheus metrics. Changing this will break Scylla's dashboard compatibility, do not change unless you know what you are doing.") \
    val(abort_on_lsa_bad_alloc, bool, false, Used, "Abort when allocation in LSA region fails") \
-    val(murmur3_partitioner_ignore_msb_bits, unsigned, 0, Used, "Number of most siginificant token bits to ignore in murmur3 partitioner; increase for very large clusters") \
+    val(murmur3_partitioner_ignore_msb_bits, unsigned, 12, Used, "Number of most siginificant token bits to ignore in murmur3 partitioner; increase for very large clusters") \
    val(virtual_dirty_soft_limit, double, 0.6, Used, "Soft limit of virtual dirty memory expressed as a portion of the hard limit") \
    val(sstable_summary_ratio, double, 0.0005, Used, "Enforces that 1 byte of summary is written for every N (2000 by default) " \
        "bytes written to data file. Value must be between 0 and 1.") \
@@ -739,6 +742,8 @@ public:
        " Performance is affected to some extent as a result. Useful to help debugging problems that may arise at another layers.") \
    val(cpu_scheduler, bool, true, Used, "Enable cpu scheduling") \
    val(view_building, bool, true, Used, "Enable view building; should only be set to false when the node is experience issues due to view building") \
+    val(enable_sstables_mc_format, bool, false, Used, "Enable SSTables 'mc' format to be used as the default file format") \
+    val(abort_on_internal_error, bool, false, Used, "Abort the server instead of throwing exception when internal invariants are violated.") \
    /* done! */

 #define _make_value_member(name, type, deflt, status, desc, ...)    \
@@ -752,6 +757,8 @@ public:
    add_options(boost::program_options::options_description_easy_init&);

    const db::extensions& extensions() const;
+
+    static const sstring default_tls_priority;
 private:
    template<typename T>
    struct log_legacy_value : public named_value<T, value_status::Used> {
--- a/db/consistency_level.cc
+++ b/db/consistency_level.cc
@@ -253,8 +253,12 @@ filter_for_query(consistency_level cl,
    return selected_endpoints;
 }

-std::vector<gms::inet_address> filter_for_query(consistency_level cl, keyspace& ks, std::vector<gms::inet_address>& live_endpoints, column_family* cf) {
-    return filter_for_query(cl, ks, live_endpoints, {}, read_repair_decision::NONE, nullptr, cf);
+std::vector<gms::inet_address> filter_for_query(consistency_level cl,
+        keyspace& ks,
+        std::vector<gms::inet_address>& live_endpoints,
+        const std::vector<gms::inet_address>& preferred_endpoints,
+        column_family* cf) {
+    return filter_for_query(cl, ks, live_endpoints, preferred_endpoints, read_repair_decision::NONE, nullptr, cf);
 }

 bool
--- a/db/consistency_level.hh
+++ b/db/consistency_level.hh
@@ -84,7 +84,11 @@ filter_for_query(consistency_level cl,
                 gms::inet_address* extra,
                 column_family* cf);

-std::vector<gms::inet_address> filter_for_query(consistency_level cl, keyspace& ks, std::vector<gms::inet_address>& live_endpoints, column_family* cf);
+std::vector<gms::inet_address> filter_for_query(consistency_level cl,
+        keyspace& ks,
+        std::vector<gms::inet_address>& live_endpoints,
+        const std::vector<gms::inet_address>& preferred_endpoints,
+        column_family* cf);

 struct dc_node_count {
    size_t live = 0;
--- a/db/cql_type_parser.cc
+++ b/db/cql_type_parser.cc
@@ -49,7 +49,10 @@
 #include "types.hh"

 static ::shared_ptr<cql3::cql3_type::raw> parse_raw(const sstring& str) {
-    return cql3::util::do_with_parser(str,  std::mem_fn(&cql3_parser::CqlParser::comparatorType));
+    return cql3::util::do_with_parser(str,
+        [] (cql3_parser::CqlParser& parser) {
+            return parser.comparator_type(true);
+        });
 }

 data_type db::cql_type_parser::parse(const sstring& keyspace, const sstring& str, lw_shared_ptr<user_types_metadata> user_types) {
--- a/db/heat_load_balance.cc
+++ b/db/heat_load_balance.cc
@@ -28,8 +28,7 @@ logging::logger hr_logger("heat_load_balance");
 // Return a uniformly-distributed random number in [0,1)
 // We use per-thread state for thread safety.  We seed the random number generator
 // once with a real random value, if available,
-static thread_local std::random_device r;
-static thread_local std::default_random_engine random_engine(r());
+static thread_local std::default_random_engine random_engine{std::random_device{}()};
 float
 rand_float() {
    static thread_local std::uniform_real_distribution<float> u(0, 1);
--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -20,9 +20,11 @@
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

+#include <algorithm>
 #include <seastar/core/future.hh>
 #include <seastar/core/seastar.hh>
 #include <seastar/core/gate.hh>
+#include <boost/range/adaptors.hpp>
 #include "service/storage_service.hh"
 #include "utils/div_ceil.hh"
 #include "db/config.hh"
@@ -33,6 +35,9 @@
 #include "disk-error-handler.hh"
 #include "lister.hh"
 #include "db/timeout_clock.hh"
+#include "service/priority_manager.hh"
+
+using namespace std::literals::chrono_literals;

 namespace db {
 namespace hints {
@@ -74,6 +79,12 @@ void manager::register_metrics(const sstring& group_name) {

        sm::make_derive("sent", _stats.sent,
                        sm::description("Number of sent hints.")),
+
+        sm::make_derive("discarded", _stats.discarded,
+                        sm::description("Number of hints that were discarded during sending (too old, schema changed, etc.).")),
+
+        sm::make_derive("corrupted_files", _stats.corrupted_files,
+                        sm::description("Number of hints files that were discarded during sending because the file was corrupted.")),
    });
 }

@@ -91,6 +102,7 @@ future<> manager::start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr
        return compute_hints_dir_device_id();
    }).then([this] {
        _strorage_service_anchor->register_subscriber(this);
+        set_started();
    });
 }

@@ -101,12 +113,12 @@ future<> manager::stop() {
        _strorage_service_anchor->unregister_subscriber(this);
    }

-    _stopping = true;
+    set_stopping();

    return _draining_eps_gate.close().finally([this] {
        return parallel_for_each(_ep_managers, [] (auto& pair) {
-                return pair.second.stop();
-            }).finally([this] {
+            return pair.second.stop();
+        }).finally([this] {
            _ep_managers.clear();
            manager_logger.info("Stopped");
        }).discard_result();
@@ -227,6 +239,8 @@ future<> manager::end_point_hints_manager::stop(drain should_drain) noexcept {
 manager::end_point_hints_manager::end_point_hints_manager(const key_type& key, manager& shard_manager)
    : _key(key)
    , _shard_manager(shard_manager)
+    , _file_update_mutex_ptr(make_lw_shared<seastar::shared_mutex>())
+    , _file_update_mutex(*_file_update_mutex_ptr)
    , _state(state_set::of<state::stopped>())
    , _hints_dir(_shard_manager.hints_dir() / format("{}", _key).c_str())
    , _sender(*this, _shard_manager.local_storage_proxy(), _shard_manager.local_db(), _shard_manager.local_gossiper())
@@ -235,6 +249,8 @@ manager::end_point_hints_manager::end_point_hints_manager(const key_type& key, m
 manager::end_point_hints_manager::end_point_hints_manager(end_point_hints_manager&& other)
    : _key(other._key)
    , _shard_manager(other._shard_manager)
+    , _file_update_mutex_ptr(std::move(other._file_update_mutex_ptr))
+    , _file_update_mutex(*_file_update_mutex_ptr)
    , _state(other._state)
    , _hints_dir(std::move(other._hints_dir))
    , _sender(other._sender, *this)
@@ -273,7 +289,7 @@ inline bool manager::have_ep_manager(ep_key_type ep) const noexcept {
 }

 bool manager::store_hint(ep_key_type ep, schema_ptr s, lw_shared_ptr<const frozen_mutation> fm, tracing::trace_state_ptr tr_state) noexcept {
-    if (_stopping || !can_hint_for(ep)) {
+    if (stopping() || !started() || !can_hint_for(ep)) {
        manager_logger.trace("Can't store a hint to {}", ep);
        ++_stats.dropped;
        return false;
@@ -376,7 +392,7 @@ future<timespec> manager::end_point_hints_manager::sender::get_last_file_modific
    });
 }

-future<> manager::end_point_hints_manager::sender::do_send_one_mutation(mutation m, const std::vector<gms::inet_address>& natural_endpoints) noexcept {
+future<> manager::end_point_hints_manager::sender::do_send_one_mutation(frozen_mutation_and_schema m, const std::vector<gms::inet_address>& natural_endpoints) noexcept {
    return futurize_apply([this, m = std::move(m), &natural_endpoints] () mutable -> future<> {
        // The fact that we send with CL::ALL in both cases below ensures that new hints are not going
        // to be generated as a result of hints sending.
@@ -385,7 +401,11 @@ future<> manager::end_point_hints_manager::sender::do_send_one_mutation(mutation
            return _proxy.send_to_endpoint(std::move(m), end_point_key(), { }, write_type::SIMPLE);
        } else {
            manager_logger.trace("Endpoints set has changed and {} is no longer a replica. Mutating from scratch...", end_point_key());
-            return _proxy.mutate({std::move(m)}, consistency_level::ALL, nullptr);
+            // FIXME: using 1h as infinite timeout. If a node is down, we should get an
+            // unavailable exception.
+            auto timeout = db::timeout_clock::now() + 1h;
+            //FIXME: Add required frozen_mutation overloads
+            return _proxy.mutate({m.fm.unfreeze(m.s)}, consistency_level::ALL, timeout, nullptr);
        }
    });
 }
@@ -411,21 +431,19 @@ bool manager::end_point_hints_manager::sender::can_send() noexcept {
    }
 }

-mutation manager::end_point_hints_manager::sender::get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf) {
+frozen_mutation_and_schema manager::end_point_hints_manager::sender::get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf) {
    hint_entry_reader hr(buf);
    auto& fm = hr.mutation();
    auto& cm = get_column_mapping(std::move(ctx_ptr), fm, hr);
-    auto& cf = _db.find_column_family(fm.column_family_id());
+    auto schema = _db.find_schema(fm.column_family_id());

-    if (cf.schema()->version() != fm.schema_version()) {
-        mutation m(cf.schema(), fm.decorated_key(*cf.schema()));
-        converting_mutation_partition_applier v(cm, *cf.schema(), m.partition());
+    if (schema->version() != fm.schema_version()) {
+        mutation m(schema, fm.decorated_key(*schema));
+        converting_mutation_partition_applier v(cm, *schema, m.partition());
        fm.partition().accept(cm, v);
-
-        return std::move(m);
-    } else {
-        return fm.unfreeze(cf.schema());
+        return {freeze(m), std::move(schema)};
    }
+    return {std::move(hr).mutation(), std::move(schema)};
 }

 const column_mapping& manager::end_point_hints_manager::sender::get_column_mapping(lw_shared_ptr<send_one_file_ctx> ctx_ptr, const frozen_mutation& fm, const hint_entry_reader& hr) {
@@ -495,35 +513,42 @@ bool manager::check_dc_for(ep_key_type ep) const noexcept {
 }

 void manager::drain_for(gms::inet_address endpoint) {
-    if (_stopping) {
+    if (stopping()) {
        return;
    }

    manager_logger.trace("on_leave_cluster: {} is removed/decommissioned", endpoint);

    with_gate(_draining_eps_gate, [this, endpoint] {
-        return futurize_apply([this, endpoint] () {
-            if (utils::fb_utilities::is_me(endpoint)) {
-                return parallel_for_each(_ep_managers, [] (auto& pair) {
-                    return pair.second.stop(drain::yes).finally([&pair] {
-                        return remove_file(pair.second.hints_dir().c_str());
+        return with_semaphore(drain_lock(), 1, [this, endpoint] {
+            return futurize_apply([this, endpoint] () {
+                if (utils::fb_utilities::is_me(endpoint)) {
+                    return parallel_for_each(_ep_managers, [] (auto& pair) {
+                        return pair.second.stop(drain::yes).finally([&pair] {
+                            return with_file_update_mutex(pair.second, [&pair] {
+                                return remove_file(pair.second.hints_dir().c_str());
+                            });
+                        });
+                    }).finally([this] {
+                        _ep_managers.clear();
                    });
-                }).finally([this] {
-                    _ep_managers.clear();
-                });
-            } else {
-                ep_managers_map_type::iterator ep_manager_it = find_ep_manager(endpoint);
-                if (ep_manager_it != ep_managers_end()) {
-                    return ep_manager_it->second.stop(drain::yes).finally([this, endpoint, hints_dir = ep_manager_it->second.hints_dir()] {
-                        _ep_managers.erase(endpoint);
-                        return remove_file(hints_dir.c_str());
-                    });
-                }
+                } else {
+                    ep_managers_map_type::iterator ep_manager_it = find_ep_manager(endpoint);
+                    if (ep_manager_it != ep_managers_end()) {
+                        return ep_manager_it->second.stop(drain::yes).finally([this, endpoint, &ep_man = ep_manager_it->second] {
+                            return with_file_update_mutex(ep_man, [&ep_man] {
+                                return remove_file(ep_man.hints_dir().c_str());
+                            }).finally([this, endpoint] {
+                                _ep_managers.erase(endpoint);
+                            });
+                        });
+                    }

-                return make_ready_future<>();
-            }
-        }).handle_exception([endpoint] (auto eptr) {
-            manager_logger.error("Exception when draining {}: {}", endpoint, eptr);
+                    return make_ready_future<>();
+                }
+            }).handle_exception([endpoint] (auto eptr) {
+                manager_logger.error("Exception when draining {}: {}", endpoint, eptr);
+            });
        });
    });
 }
@@ -536,6 +561,7 @@ manager::end_point_hints_manager::sender::sender(end_point_hints_manager& parent
    , _resource_manager(_shard_manager._resource_manager)
    , _proxy(local_storage_proxy)
    , _db(local_db)
+    , _hints_cpu_sched_group(_db.get_streaming_scheduling_group())
    , _gossiper(local_gossiper)
    , _file_update_mutex(_ep_manager.file_update_mutex())
 {}
@@ -548,6 +574,7 @@ manager::end_point_hints_manager::sender::sender(const sender& other, end_point_
    , _resource_manager(_shard_manager._resource_manager)
    , _proxy(other._proxy)
    , _db(other._db)
+    , _hints_cpu_sched_group(other._hints_cpu_sched_group)
    , _gossiper(other._gossiper)
    , _file_update_mutex(_ep_manager.file_update_mutex())
 {}
@@ -603,7 +630,10 @@ manager::end_point_hints_manager::sender::clock::duration manager::end_point_hin
 }

 void manager::end_point_hints_manager::sender::start() {
-    _stopped = seastar::async([this] {
+    seastar::thread_attributes attr;
+
+    attr.sched_group = _hints_cpu_sched_group;
+    _stopped = seastar::async(std::move(attr), [this] {
        manager_logger.trace("ep_manager({})::sender: started", end_point_key());
        while (!stopping()) {
            try {
@@ -623,10 +653,11 @@ void manager::end_point_hints_manager::sender::start() {
    });
 }

-future<> manager::end_point_hints_manager::sender::send_one_mutation(mutation m) {
-    keyspace& ks = _db.find_keyspace(m.schema()->ks_name());
+future<> manager::end_point_hints_manager::sender::send_one_mutation(frozen_mutation_and_schema m) {
+    keyspace& ks = _db.find_keyspace(m.s->ks_name());
    auto& rs = ks.get_replication_strategy();
-    std::vector<gms::inet_address> natural_endpoints = rs.get_natural_endpoints(m.token());
+    auto token = dht::global_partitioner().get_token(*m.s, m.fm.key(*m.s));
+    std::vector<gms::inet_address> natural_endpoints = rs.get_natural_endpoints(std::move(token));

    return do_send_one_mutation(std::move(m), natural_endpoints);
 }
@@ -644,8 +675,8 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
                    return make_ready_future<>();
                }

-                mutation m = this->get_mutation(ctx_ptr, buf);
-                gc_clock::duration gc_grace_sec = m.schema()->gc_grace_seconds();
+                auto m = this->get_mutation(ctx_ptr, buf);
+                gc_clock::duration gc_grace_sec = m.s->gc_grace_seconds();

                // The hint is too old - drop it.
                //
@@ -666,10 +697,13 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
            // ignore these errors and move on - probably this hint is too old and the KS/CF has been deleted...
            } catch (no_such_column_family& e) {
                manager_logger.debug("send_hints(): no_such_column_family: {}", e.what());
+                ++this->shard_stats().discarded;
            } catch (no_such_keyspace& e) {
                manager_logger.debug("send_hints(): no_such_keyspace: {}", e.what());
+                ++this->shard_stats().discarded;
            } catch (no_column_mapping& e) {
-                manager_logger.debug("send_hints(): {}: {}", fname, e.what());
+                manager_logger.debug("send_hints(): {} at {}: {}", fname, rp, e.what());
+                ++this->shard_stats().discarded;
            }
            return make_ready_future<>();
        }).finally([units = std::move(units), ctx_ptr] {});
@@ -683,10 +717,10 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
 bool manager::end_point_hints_manager::sender::send_one_file(const sstring& fname) {
    timespec last_mod = get_last_file_modification(fname).get0();
    gc_clock::duration secs_since_file_mod = std::chrono::seconds(last_mod.tv_sec);
-    lw_shared_ptr<send_one_file_ctx> ctx_ptr = make_lw_shared<send_one_file_ctx>();
+    lw_shared_ptr<send_one_file_ctx> ctx_ptr = make_lw_shared<send_one_file_ctx>(_last_schema_ver_to_column_mapping);

    try {
-        auto s = commitlog::read_log_file(fname, [this, secs_since_file_mod, &fname, ctx_ptr] (temporary_buffer<char> buf, db::replay_position rp) mutable {
+        auto s = commitlog::read_log_file(fname, service::get_local_streaming_read_priority(), [this, secs_since_file_mod, &fname, ctx_ptr] (temporary_buffer<char> buf, db::replay_position rp) mutable {
            // Check that we can still send the next hint. Don't try to send it if the destination host
            // is DOWN or if we have already failed to send some of the previous hints.
            if (!draining() && ctx_ptr->state.contains(send_state::segment_replay_failed)) {
@@ -705,6 +739,10 @@ bool manager::end_point_hints_manager::sender::send_one_file(const sstring& fnam
        }, _last_not_complete_rp.pos, &_db.get_config().extensions()).get0();

        s->done().get();
+    } catch (db::commitlog::segment_error& ex) {
+        manager_logger.error("{}: {}. Dropping...", fname, ex.what());
+        ctx_ptr->state.remove(send_state::segment_replay_failed);
+        ++this->shard_stats().corrupted_files;
    } catch (...) {
        manager_logger.trace("sending of {} failed: {}", fname, std::current_exception());
        ctx_ptr->state.set(send_state::segment_replay_failed);
@@ -740,6 +778,7 @@ bool manager::end_point_hints_manager::sender::send_one_file(const sstring& fnam

    // clear the replay position - we are going to send the next segment...
    _last_not_complete_rp = replay_position();
+    _last_schema_ver_to_column_mapping.clear();
    manager_logger.trace("send_one_file(): segment {} was sent in full and deleted", fname);
    return true;
 }
@@ -752,7 +791,7 @@ void manager::end_point_hints_manager::sender::send_hints_maybe() noexcept {
    int replayed_segments_count = 0;

    try {
-        while (have_segments()) {
+        while (replay_allowed() && have_segments()) {
            if (!send_one_file(*_segments_to_replay.begin())) {
                break;
            }
@@ -777,5 +816,178 @@ void manager::end_point_hints_manager::sender::send_hints_maybe() noexcept {
    manager_logger.trace("send_hints(): we handled {} segments", replayed_segments_count);
 }

+// Calls f(dir, entry, shard_id) for each sub-directory of hints_directory, where shard_id is the
+// entry name parsed with std::stoi(). Entries whose name std::stoi() rejects are logged and skipped.
+template<typename Func>
+static future<> scan_for_hints_dirs(const sstring& hints_directory, Func&& f) {
+    return lister::scan_dir(hints_directory, { directory_entry_type::directory }, [f = std::forward<Func>(f)] (lister::path dir, directory_entry de) {
+        try {
+            // Parse the shard id before de is moved from: the evaluation order of call arguments is
+            // unspecified, so reading de.name inside the call below could observe a moved-from name.
+            const unsigned shard_id = std::stoi(de.name.c_str());
+            return f(std::move(dir), std::move(de), shard_id);
+        } catch (std::invalid_argument& ex) {
+            manager_logger.debug("Ignore invalid directory {}", de.name);
+            return make_ready_future<>();
+        }
+    });
+}
+
+// runs in seastar::async context
+manager::hints_segments_map manager::get_current_hints_segments(const sstring& hints_directory) {
+    hints_segments_map current_hints_segments;
+
+    // shards level
+    scan_for_hints_dirs(hints_directory, [&current_hints_segments] (lister::path dir, directory_entry de, unsigned shard_id) {
+        manager_logger.trace("shard_id = {}", shard_id);
+        // IPs level
+        return lister::scan_dir(dir / de.name.c_str(), { directory_entry_type::directory }, [&current_hints_segments, shard_id] (lister::path dir, directory_entry de) {
+            manager_logger.trace("\tIP: {}", de.name);
+            // hints files
+            return lister::scan_dir(dir / de.name.c_str(), { directory_entry_type::regular }, [&current_hints_segments, shard_id, ep_addr = de.name] (lister::path dir, directory_entry de) {
+                manager_logger.trace("\t\tfile: {}", de.name);
+                current_hints_segments[ep_addr][shard_id].emplace_back(dir / de.name.c_str());
+                return make_ready_future<>();
+            });
+        });
+    }).get();
+
+    return current_hints_segments;
+}
+
+// runs in seastar::async context
+void manager::rebalance_segments(const sstring& hints_directory, hints_segments_map& segments_map) {
+    // Count how many hints segments to each destination we have.
+    std::unordered_map<sstring, size_t> per_ep_hints;
+    for (auto& ep_info : segments_map) {
+        per_ep_hints[ep_info.first] = boost::accumulate(ep_info.second | boost::adaptors::map_values | boost::adaptors::transformed(std::mem_fn(&std::list<lister::path>::size)), 0);
+        manager_logger.trace("{}: total files: {}", ep_info.first, per_ep_hints[ep_info.first]);
+    }
+
+    // Create a map of lists of segments that we will move (for each destination end point): every current shard keeps
+    // up to q = int(N/S) of its own segments and only the excess is moved, where N is the total number of segments to
+    // the current destination and S is the current number of shards.
+    std::unordered_map<sstring, std::list<lister::path>> segments_to_move;
+    for (auto& [ep, ep_segments] : segments_map) {
+        size_t q = per_ep_hints[ep] / smp::count;
+        auto& current_segments_to_move = segments_to_move[ep];
+
+        for (auto& [shard_id, shard_segments] : ep_segments) {
+            // Move all segments from the shards that are no longer relevant (re-sharding to the lower number of shards)
+            if (shard_id >= smp::count) {
+                current_segments_to_move.splice(current_segments_to_move.end(), shard_segments);
+            } else if (shard_segments.size() > q) {
+                current_segments_to_move.splice(current_segments_to_move.end(), shard_segments, std::next(shard_segments.begin(), q), shard_segments.end());
+            }
+        }
+    }
+
+    // Since N (the total number of segments to a specific destination) may not be a multiple of S (the current number
+    // of shards) we will distribute files in two passes:
+    //    * if N = S * q + r, then
+    //       * one pass for segments_per_shard = q
+    //       * another one for segments_per_shard = q + 1.
+    //
+    // This way we will ensure as close to the perfect distribution as possible.
+    //
+    // Up to this point we haven't moved any segments. However, we have created a logical separation of segments
+    // into two groups:
+    //    * Segments that are not going to be moved: segments in the segments_map.
+    //    * Segments that are going to be moved: segments in the segments_to_move.
+    //
+    // rebalance_segments_for() is going to consume segments from segments_to_move and move them to corresponding
+    // lists in the segments_map AND actually move segments to the corresponding shard's sub-directory till the requested
+    // segments_per_shard level is reached (see more details in the description of rebalance_segments_for()).
+    for (auto& [ep, N] : per_ep_hints) {
+        size_t q = N / smp::count;
+        size_t r = N - q * smp::count;
+        auto& current_segments_to_move = segments_to_move[ep];
+        auto& current_segments_map = segments_map[ep];
+
+        if (q) {
+            rebalance_segments_for(ep, q, hints_directory, current_segments_map, current_segments_to_move);
+        }
+
+        if (r) {
+            rebalance_segments_for(ep, q + 1, hints_directory, current_segments_map, current_segments_to_move);
+        }
+    }
+}
+
+// runs in seastar::async context
+void manager::rebalance_segments_for(
+        const sstring& ep,
+        size_t segments_per_shard,
+        const sstring& hints_directory,
+        hints_ep_segments_map& ep_segments,
+        std::list<lister::path>& segments_to_move)
+{
+    manager_logger.trace("{}: segments_per_shard: {}, total number of segments to move: {}", ep, segments_per_shard, segments_to_move.size());
+
+    // sanity check
+    if (segments_to_move.empty() || !segments_per_shard) {
+        return;
+    }
+
+    for (unsigned i = 0; i < smp::count && !segments_to_move.empty(); ++i) {
+        lister::path shard_path_dir(lister::path(hints_directory.c_str()) / seastar::format("{:d}", i).c_str() / ep.c_str());
+        std::list<lister::path>& current_shard_segments = ep_segments[i];
+
+        // Make sure that the shard_path_dir exists and if not - create it
+        io_check(recursive_touch_directory, shard_path_dir.c_str()).get();
+
+        while (current_shard_segments.size() < segments_per_shard && !segments_to_move.empty()) {
+            auto seg_path_it = segments_to_move.begin();
+            lister::path new_path(shard_path_dir / seg_path_it->filename());
+
+            // Don't move the file to the same location - it's pointless.
+            if (*seg_path_it != new_path) {
+                manager_logger.trace("going to move: {} -> {}", *seg_path_it, new_path);
+                io_check(rename_file, seg_path_it->native(), new_path.native()).get();
+            } else {
+                manager_logger.trace("skipping: {}", *seg_path_it);
+            }
+            current_shard_segments.splice(current_shard_segments.end(), segments_to_move, seg_path_it, std::next(seg_path_it));
+        }
+    }
+}
+
+// runs in seastar::async context
+void manager::remove_irrelevant_shards_directories(const sstring& hints_directory) {
+    // shards level
+    scan_for_hints_dirs(hints_directory, [] (lister::path dir, directory_entry de, unsigned shard_id) {
+        if (shard_id >= smp::count) {
+            // IPs level
+            return lister::scan_dir(dir / de.name.c_str(), { directory_entry_type::directory, directory_entry_type::regular }, lister::show_hidden::yes, [] (lister::path dir, directory_entry de) {
+                return io_check(remove_file, (dir / de.name.c_str()).native());
+            }).then([shard_base_dir = dir, shard_entry = de] {
+                return io_check(remove_file, (shard_base_dir / shard_entry.name.c_str()).native());
+            });
+        }
+        return make_ready_future<>();
+    }).get();
+}
+
+future<> manager::rebalance(sstring hints_directory) {
+    return seastar::async([hints_directory = std::move(hints_directory)] {
+        // Scan currently present hints segments.
+        hints_segments_map current_hints_segments = get_current_hints_segments(hints_directory);
+
+        // Move segments to achieve an even distribution of files among all present shards.
+        rebalance_segments(hints_directory, current_hints_segments);
+
+        // Remove the directories of shards that are not present anymore - they should not have any segments by now
+        remove_irrelevant_shards_directories(hints_directory);
+    });
+}
+
+void manager::update_backlog(size_t backlog, size_t max_backlog) {
+    if (backlog < max_backlog) {
+        allow_hints();
+    } else {
+        forbid_hints_for_eps_with_pending_hints();
+    }
+}
+
 }
 }
--- a/db/hints/manager.hh
+++ b/db/hints/manager.hh
@@ -31,6 +31,7 @@
 #include <seastar/core/timer.hh>
 #include <seastar/core/lowres_clock.hh>
 #include <seastar/core/shared_mutex.hh>
+#include "lister.hh"
 #include "gms/gossiper.hh"
 #include "locator/snitch_base.hh"
 #include "service/endpoint_lifecycle_subscriber.hh"
@@ -58,11 +59,20 @@ private:
        uint64_t errors = 0;
        uint64_t dropped = 0;
        uint64_t sent = 0;
+        uint64_t discarded = 0;
+        uint64_t corrupted_files = 0;
    };

+    // map: shard -> segments
+    using hints_ep_segments_map = std::unordered_map<unsigned, std::list<lister::path>>;
+    // map: IP -> map: shard -> segments
+    using hints_segments_map = std::unordered_map<sstring, hints_ep_segments_map>;
+
    class drain_tag {};
    using drain = seastar::bool_class<drain_tag>;

+    friend class space_watchdog;
+
 public:
    class end_point_hints_manager {
    public:
@@ -94,7 +104,10 @@ public:
                send_state::restart_segment>>;

            struct send_one_file_ctx {
-                std::unordered_map<table_schema_version, column_mapping> schema_ver_to_column_mapping;
+                send_one_file_ctx(std::unordered_map<table_schema_version, column_mapping>& last_schema_ver_to_column_mapping)
+                    : schema_ver_to_column_mapping(last_schema_ver_to_column_mapping)
+                {}
+                std::unordered_map<table_schema_version, column_mapping>& schema_ver_to_column_mapping;
                seastar::gate file_send_gate;
                std::unordered_set<db::replay_position> rps_set; // number of elements in this set is never going to be greater than the maximum send queue length
                send_state_set state;
@@ -103,6 +116,7 @@ public:
        private:
            std::list<sstring> _segments_to_replay;
            replay_position _last_not_complete_rp;
+            std::unordered_map<table_schema_version, column_mapping> _last_schema_ver_to_column_mapping;
            state_set _state;
            future<> _stopped;
            clock::time_point _next_flush_tp;
@@ -113,6 +127,7 @@ public:
            resource_manager& _resource_manager;
            service::storage_proxy& _proxy;
            database& _db;
+            seastar::scheduling_group _hints_cpu_sched_group;
            gms::gossiper& _gossiper;
            seastar::shared_mutex& _file_update_mutex;

@@ -173,6 +188,10 @@ public:
                return _state.contains(state::stopping);
            }

+            bool replay_allowed() const noexcept {
+                return _ep_manager.replay_allowed();
+            }
+
            /// \brief Try to send one hint read from the file.
            ///  - Limit the maximum memory size of hints "in the air" and the maximum total number of hints "in the air".
            ///  - Discard the hints that are older than the grace seconds value of the corresponding table.
@@ -204,7 +223,7 @@ public:
            /// \param ctx_ptr pointer to the send context
            /// \param buf hints file entry
            /// \return The mutation object representing the original mutation stored in the hints file.
-            mutation get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf);
+            frozen_mutation_and_schema get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf);

            /// \brief Get a reference to the column_mapping object for a given frozen mutation.
            /// \param ctx_ptr pointer to the send context
@@ -221,13 +240,13 @@ public:
            /// \param m mutation to send
            /// \param natural_endpoints current replicas for the given mutation
            /// \return future that resolves when the operation is complete
-            future<> do_send_one_mutation(mutation m, const std::vector<gms::inet_address>& natural_endpoints) noexcept;
+            future<> do_send_one_mutation(frozen_mutation_and_schema m, const std::vector<gms::inet_address>& natural_endpoints) noexcept;

            /// \brief Send one mutation out.
            ///
            /// \param m mutation to send
            /// \return future that resolves when the mutation sending processing is complete.
-            future<> send_one_mutation(mutation m);
+            future<> send_one_mutation(frozen_mutation_and_schema m);

            /// \brief Get the last modification time stamp for a given file.
            /// \param fname File name
@@ -256,7 +275,8 @@ public:
        manager& _shard_manager;
        hints_store_ptr _hints_store_anchor;
        seastar::gate _store_gate;
-        seastar::shared_mutex _file_update_mutex;
+        lw_shared_ptr<seastar::shared_mutex> _file_update_mutex_ptr;
+        seastar::shared_mutex& _file_update_mutex;

        enum class state {
            can_hint,               // hinting is currently allowed (used by the space_watchdog)
@@ -322,6 +342,10 @@ public:
            return _hints_in_progress;
        }

+        bool replay_allowed() const noexcept {
+            return _shard_manager.replay_allowed();
+        }
+
        bool can_hint() const noexcept {
            return _state.contains(state::can_hint);
        }
@@ -354,8 +378,20 @@ public:
            return _state.contains(state::stopped);
        }

-        seastar::shared_mutex& file_update_mutex() {
-            return _file_update_mutex;
+        /// \brief Safely runs a given functor under the file_update_mutex of \ref ep_man
+        ///
+        /// Runs a given functor under the file_update_mutex of the given end_point_hints_manager instance.
+        /// This function is safe even if \ref ep_man gets destroyed before the future this function returns resolves
+        /// (as long as the \ref func call itself is safe).
+        ///
+        /// \tparam Func Functor type.
+        /// \param ep_man end_point_hints_manager instance whose file_update_mutex we want to lock.
+        /// \param func Functor to run under the lock.
+        /// \return Whatever \ref func returns.
+        template <typename Func>
+        friend inline auto with_file_update_mutex(end_point_hints_manager& ep_man, Func&& func) {
+            lw_shared_ptr<seastar::shared_mutex> lock_ptr = ep_man._file_update_mutex_ptr;
+            return with_lock(*lock_ptr, std::forward<Func>(func)).finally([lock_ptr] {});
        }

        const boost::filesystem::path& hints_dir() const noexcept {
@@ -363,6 +399,10 @@ public:
        }

    private:
+        seastar::shared_mutex& file_update_mutex() noexcept {
+            return _file_update_mutex;
+        }
+
        /// \brief Creates a new hints store object.
        ///
        /// - Creates a hints store directory if doesn't exist: <shard_hints_dir>/<ep_key>
@@ -387,6 +427,17 @@ public:
        }
    };

+    enum class state {
+        started,                // hinting is currently allowed (start() call is complete)
+        replay_allowed,         // replaying (hints sending) is allowed
+        stopping                // hinting is not allowed - stopping is in progress (stop() method has been called)
+    };
+
+    using state_set = enum_set<super_enum<state,
+        state::started,
+        state::replay_allowed,
+        state::stopping>>;
+
 private:
    using ep_key_type = typename end_point_hints_manager::key_type;
    using ep_managers_map_type = std::unordered_map<ep_key_type, end_point_hints_manager>;
@@ -397,6 +448,7 @@ public:
    static const std::chrono::seconds hint_file_write_timeout;

 private:
+    state_set _state;
    const boost::filesystem::path _hints_dir;
    dev_t _hints_dir_device_id = 0;

@@ -408,7 +460,7 @@ private:
    locator::snitch_ptr& _local_snitch_ptr;
    int64_t _max_hint_window_us = 0;
    database& _local_db;
-    bool _stopping = false;
+
    seastar::gate _draining_eps_gate; // gate used to control the progress of ep_managers stopping not in the context of manager::stop() call

    resource_manager& _resource_manager;
@@ -417,10 +469,13 @@ private:
    stats _stats;
    seastar::metrics::metric_groups _metrics;
    std::unordered_set<ep_key_type> _eps_with_pending_hints;
+    seastar::semaphore _drain_lock = {1};

 public:
    manager(sstring hints_directory, std::vector<sstring> hinted_dcs, int64_t max_hint_window_ms, resource_manager&res_manager, distributed<database>& db);
    virtual ~manager();
+    manager(manager&&) = delete;
+    manager& operator=(manager&&) = delete;
    void register_metrics(const sstring& group_name);
    future<> start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
    future<> stop();
@@ -493,27 +548,101 @@ public:
        return _hints_dir_device_id;
    }

+    seastar::semaphore& drain_lock() noexcept {
+        return _drain_lock;
+    }
+
    void allow_hints();
    void forbid_hints();
    void forbid_hints_for_eps_with_pending_hints();

-
-    static future<> rebalance() {
-        // TODO
-        return make_ready_future<>();
+    void allow_replaying() noexcept {
+        _state.set(state::replay_allowed);
    }

+    /// \brief Rebalance hints segments among all present shards.
+    ///
+    /// The difference between the number of segments on any two shards will not be greater than 1 after the
+    /// rebalancing.
+    ///
+    /// Removes the sub-directories of \ref hints_directory that correspond to shards that are not relevant any more
+    /// (re-sharding to a lower shards number case).
+    ///
+    /// Complexity: O(N+K), where N is a total number of present hints' segments and
+    ///                           K = <number of shards during the previous boot> * <number of end points for which hints were ever created>
+    ///
+    /// \param hints_directory A hints directory to rebalance
+    /// \return A future that resolves when the operation is complete.
+    static future<> rebalance(sstring hints_directory);
+
    virtual void on_join_cluster(const gms::inet_address& endpoint) override {}
    virtual void on_leave_cluster(const gms::inet_address& endpoint) override {
        drain_for(endpoint);
    };
    virtual void on_up(const gms::inet_address& endpoint) override {}
    virtual void on_down(const gms::inet_address& endpoint) override {}
-    virtual void on_move(const gms::inet_address& endpoint) override {}

 private:
    future<> compute_hints_dir_device_id();

+    /// \brief Scan the given hints directory and build the map of all present hints segments.
+    ///
+    /// Complexity: O(N+K), where N is a total number of present hints' segments and
+    ///                           K = <number of shards during the previous boot> * <number of end points for which hints were ever created>
+    ///
+    /// \note Should be called from a seastar::thread context.
+    ///
+    /// \param hints_directory directory to scan
+    /// \return a map: ep -> map: shard -> segments (full paths)
+    static hints_segments_map get_current_hints_segments(const sstring& hints_directory);
+
+    /// \brief Rebalance hints segments for a given (destination) end point
+    ///
+    /// This method is going to consume files from the \ref segments_to_move and distribute them between the present
+    /// shards (taking into account the \ref ep_segments state - there may be zero or more segments that belong to a
+    /// particular shard in it) until we either achieve the requested \ref segments_per_shard level on each shard
+    /// or until we are out of files to move.
+    ///
+    /// As a result (in addition to the actual state on the disk) both \ref ep_segments and \ref segments_to_move are going
+    /// to be modified.
+    ///
+    /// Complexity: O(N), where N is a total number of present hints' segments for the \ref ep end point (as a destination).
+    ///
+    /// \note Should be called from a seastar::thread context.
+    ///
+    /// \param ep destination end point ID (a string with its IP address)
+    /// \param segments_per_shard number of hints segments per-shard we want to achieve
+    /// \param hints_directory a root hints directory
+    /// \param ep_segments a map that was originally built by get_current_hints_segments() for this end point
+    /// \param segments_to_move a list of segments we are allowed to move
+    static void rebalance_segments_for(
+            const sstring& ep,
+            size_t segments_per_shard,
+            const sstring& hints_directory,
+            hints_ep_segments_map& ep_segments,
+            std::list<lister::path>& segments_to_move);
+
+    /// \brief Rebalance all present hints segments.
+    ///
+    /// The difference between the number of segments on any two shards will not be greater than 1 after the
+    /// rebalancing.
+    ///
+    /// Complexity: O(N), where N is a total number of present hints' segments.
+    ///
+    /// \note Should be called from a seastar::thread context.
+    ///
+    /// \param hints_directory a root hints directory
+    /// \param segments_map a map that was built by get_current_hints_segments()
+    static void rebalance_segments(const sstring& hints_directory, hints_segments_map& segments_map);
+
+    /// \brief Remove sub-directories of shards that are not relevant any more (re-sharding to a lower number of shards case).
+    ///
+    /// Complexity: O(S*E), where S is a number of shards during the previous boot and
+    ///                           E is a number of end points for which hints were ever created.
+    ///
+    /// \param hints_directory a root hints directory
+    static void remove_irrelevant_shards_directories(const sstring& hints_directory);
+
    node_to_hint_store_factory_type& store_factory() noexcept {
        return _store_factory;
    }
@@ -544,6 +673,28 @@ private:
    /// \param endpoint node that left the cluster
    void drain_for(gms::inet_address endpoint);

+    void update_backlog(size_t backlog, size_t max_backlog);
+
+    bool stopping() const noexcept {
+        return _state.contains(state::stopping);
+    }
+
+    void set_stopping() noexcept {
+        _state.set(state::stopping);
+    }
+
+    bool started() const noexcept {
+        return _state.contains(state::started);
+    }
+
+    void set_started() noexcept {
+        _state.set(state::started);
+    }
+
+    bool replay_allowed() const noexcept {
+        return _state.contains(state::replay_allowed);
+    }
+
 public:
    ep_managers_map_type::iterator find_ep_manager(ep_key_type ep_key) noexcept {
        return _ep_managers.find(ep_key);
--- a/db/hints/resource_manager.cc
+++ b/db/hints/resource_manager.cc
@@ -27,6 +27,7 @@
 #include "lister.hh"
 #include "disk-error-handler.hh"
 #include "seastarx.hh"
+#include <seastar/core/sleep.hh>

 namespace db {
 namespace hints {
@@ -65,112 +66,111 @@ const std::chrono::seconds space_watchdog::_watchdog_period = std::chrono::secon
 space_watchdog::space_watchdog(shard_managers_set& managers, per_device_limits_map& per_device_limits_map)
    : _shard_managers(managers)
    , _per_device_limits_map(per_device_limits_map)
-    , _timer([this] { on_timer(); })
 {}

 void space_watchdog::start() {
-    _timer.arm(timer_clock_type::now());
+    _started = seastar::async([this] {
+        while (!_as.abort_requested()) {
+            try {
+                on_timer();
+            } catch (...) {
+                resource_manager_logger.trace("space_watchdog: unexpected exception - stop all hints generators");
+                // Stop all hint generators if space_watchdog callback failed
+                for (manager& shard_manager : _shard_managers) {
+                    shard_manager.forbid_hints();
+                }
+            }
+            seastar::sleep_abortable(_watchdog_period, _as).get();
+        }
+    }).handle_exception_type([] (const seastar::sleep_aborted& ignored) { });
 }

 future<> space_watchdog::stop() noexcept {
-    try {
-        return _gate.close().finally([this] { _timer.cancel(); });
-    } catch (...) {
-        return make_exception_future<>(std::current_exception());
-    }
+    _as.request_abort();
+    return std::move(_started);
 }

+// Called under the end_point_hints_manager::file_update_mutex() of the corresponding end_point_hints_manager instance.
 future<> space_watchdog::scan_one_ep_dir(boost::filesystem::path path, manager& shard_manager, ep_key_type ep_key) {
-    return lister::scan_dir(path, { directory_entry_type::regular }, [this, ep_key, &shard_manager] (lister::path dir, directory_entry de) {
-        // Put the current end point ID to state.eps_with_pending_hints when we see the second hints file in its directory
-        if (_files_count == 1) {
-            shard_manager.add_ep_with_pending_hints(ep_key);
-        }
-        ++_files_count;
+    return do_with(std::move(path), [this, ep_key, &shard_manager] (boost::filesystem::path& path) {
+        // It may happen that we get here and the directory has already been deleted in the context of manager::drain_for().
+        // In this case simply bail out.
+        return engine().file_exists(path.native()).then([this, ep_key, &shard_manager, &path] (bool exists) {
+            if (!exists) {
+                return make_ready_future<>();
+            } else {
+                return lister::scan_dir(path, { directory_entry_type::regular }, [this, ep_key, &shard_manager] (lister::path dir, directory_entry de) {
+                    // Put the current end point ID to state.eps_with_pending_hints when we see the second hints file in its directory
+                    if (_files_count == 1) {
+                        shard_manager.add_ep_with_pending_hints(ep_key);
+                    }
+                    ++_files_count;

-        return io_check(file_size, (dir / de.name.c_str()).c_str()).then([this] (uint64_t fsize) {
-            _total_size += fsize;
+                    return io_check(file_size, (dir / de.name.c_str()).c_str()).then([this] (uint64_t fsize) {
+                        _total_size += fsize;
+                    });
+                });
+            }
        });
    });
 }

+// Sums the sizes of all hints files per device and reports the resulting backlog to every manager. Called from the context of a seastar::thread.
 void space_watchdog::on_timer() {
-    with_gate(_gate, [this] {
-        return futurize_apply([this] {
-            _total_size = 0;
+    // The hints directories are organized as follows:
+    // <hints root>
+    //    |- <shard1 ID>
+    //    |  |- <EP1 address>
+    //    |     |- <hints file1>
+    //    |     |- <hints file2>
+    //    |     |- ...
+    //    |  |- <EP2 address>
+    //    |     |- ...
+    //    |  |-...
+    //    |- <shard2 ID>
+    //    |  |- ...
+    //    ...
+    //    |- <shardN ID>
+    //    |  |- ...
+    //
 
-            return do_for_each(_shard_managers, [this] (manager& shard_manager) {
-                shard_manager.clear_eps_with_pending_hints();
-
-                // The hints directories are organized as follows:
-                // <hints root>
-                //    |- <shard1 ID>
-                //    |  |- <EP1 address>
-                //    |     |- <hints file1>
-                //    |     |- <hints file2>
-                //    |     |- ...
-                //    |  |- <EP2 address>
-                //    |     |- ...
-                //    |  |-...
-                //    |- <shard2 ID>
-                //    |  |- ...
-                //    ...
-                //    |- <shardN ID>
-                //    |  |- ...
+    for (auto& per_device_limits : _per_device_limits_map | boost::adaptors::map_values) {
+        _total_size = 0;
+        for (manager& shard_manager : per_device_limits.managers) {
+            shard_manager.clear_eps_with_pending_hints();
+            lister::scan_dir(shard_manager.hints_dir(), {directory_entry_type::directory}, [this, &shard_manager] (lister::path dir, directory_entry de) {
+                _files_count = 0;
+                // Let's scan per-end-point directories and enumerate hints files...
                 //
-                return lister::scan_dir(shard_manager.hints_dir(), {directory_entry_type::directory}, [this, &shard_manager] (lister::path dir, directory_entry de) {
-                    _files_count = 0;
-                    // Let's scan per-end-point directories and enumerate hints files...
-                    //
-                    // Let's check if there is a corresponding end point manager (may not exist if the corresponding DC is
-                    // not hintable).
-                    // If exists - let's take a file update lock so that files are not changed under our feet. Otherwise, simply
-                    // continue to enumeration - there is no one to change them.
-                    auto it = shard_manager.find_ep_manager(de.name);
-                    if (it != shard_manager.ep_managers_end()) {
-                        return with_lock(it->second.file_update_mutex(), [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)]() mutable {
-                             return scan_one_ep_dir(dir / ep_name.c_str(), shard_manager, ep_key_type(ep_name));
-                        });
-                    } else {
-                        return scan_one_ep_dir(dir / de.name.c_str(), shard_manager, ep_key_type(de.name));
-                    }
-                });
-            }).then([this] {
-                return do_for_each(_per_device_limits_map, [this](per_device_limits_map::value_type& per_device_limits_entry) {
-                    space_watchdog::per_device_limits& per_device_limits = per_device_limits_entry.second;
-
-                    size_t adjusted_quota = 0;
-                    size_t delta = boost::accumulate(per_device_limits.managers, 0, [] (size_t sum, manager& shard_manager) {
-                        return sum + shard_manager.ep_managers_size() * resource_manager::hint_segment_size_in_mb * 1024 * 1024;
+                // Let's check if there is a corresponding end point manager (may not exist if the corresponding DC is
+                // not hintable).
+                // If exists - let's take a file update lock so that files are not changed under our feet. Otherwise, simply
+                // continue to enumeration - there is no one to change them.
+                auto it = shard_manager.find_ep_manager(de.name);
+                if (it != shard_manager.ep_managers_end()) {
+                    return with_file_update_mutex(it->second, [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)] () mutable {
+                        return scan_one_ep_dir(dir / ep_name.c_str(), shard_manager, ep_key_type(ep_name));
                     });
-                    if (per_device_limits.max_shard_disk_space_size > delta) {
-                        adjusted_quota = per_device_limits.max_shard_disk_space_size - delta;
-                    }
+                } else {
+                    return scan_one_ep_dir(dir / de.name.c_str(), shard_manager, ep_key_type(de.name));
+                }
+            }).get();
+        }
 
-                    bool can_hint = _total_size < adjusted_quota;
-                    resource_manager_logger.trace("space_watchdog: total_size ({}) {} max_shard_disk_space_size ({})", _total_size, can_hint ? "<" : ">=", adjusted_quota);
-
-                    if (!can_hint) {
-                        for (manager& shard_manager : per_device_limits.managers) {
-                            shard_manager.forbid_hints_for_eps_with_pending_hints();
-                        }
-                    } else {
-                        for (manager& shard_manager : per_device_limits.managers) {
-                            shard_manager.allow_hints();
-                        }
-    }
-                });
-            });
-        }).handle_exception([this] (auto eptr) {
-            resource_manager_logger.trace("space_watchdog: unexpected exception - stop all hints generators");
-            // Stop all hint generators if space_watchdog callback failed
-            for (manager& shard_manager : _shard_managers) {
-                shard_manager.forbid_hints();
-            }
-        }).finally([this] {
-            _timer.arm(_watchdog_period);
+        // Adjust the quota by the space we guarantee to every end point manager (size_t seed: an int seed would truncate the byte count)
+        size_t adjusted_quota = 0;
+        size_t delta = boost::accumulate(per_device_limits.managers, size_t(0), [] (size_t sum, manager& shard_manager) {
+            return sum + shard_manager.ep_managers_size() * resource_manager::hint_segment_size_in_mb * 1024 * 1024;
         });
-    });
+        if (per_device_limits.max_shard_disk_space_size > delta) {
+            adjusted_quota = per_device_limits.max_shard_disk_space_size - delta;
+        }
+
+        resource_manager_logger.trace("space_watchdog: consuming {}/{} bytes", _total_size, adjusted_quota);
+        for (manager& shard_manager : per_device_limits.managers) {
+            shard_manager.update_backlog(_total_size, adjusted_quota);
+        }
+    }
 }

 future<> resource_manager::start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr) {
@@ -183,6 +183,10 @@ future<> resource_manager::start(shared_ptr<service::storage_proxy> proxy_ptr, s
    });
 }

+void resource_manager::allow_replaying() noexcept {
+    boost::for_each(_shard_managers, [] (manager& m) { m.allow_replaying(); });
+}
+
 future<> resource_manager::stop() noexcept {
    return parallel_for_each(_shard_managers, [](manager& m) {
        return m.stop();
@@ -201,14 +205,18 @@ future<> resource_manager::prepare_per_device_limits() {
        auto it = _per_device_limits_map.find(device_id);
        if (it == _per_device_limits_map.end()) {
            return is_mountpoint(shard_manager.hints_dir().parent_path()).then([this, device_id, &shard_manager](bool is_mountpoint) {
-                // By default, give each group of managers 10% of the available disk space. Give each shard an equal share of the available space.
-                size_t max_size = boost::filesystem::space(shard_manager.hints_dir().c_str()).capacity / (10 * smp::count);
-                // If hints directory is a mountpoint, we assume it's on dedicated (i.e. not shared with data/commitlog/etc) storage.
-                // Then, reserve 90% of all space instead of 10% above.
-                if (is_mountpoint) {
-                    max_size *= 9;
+                auto [it, inserted] = _per_device_limits_map.emplace(device_id, space_watchdog::per_device_limits{});
+                // Since we possibly deferred, we need to recheck the _per_device_limits_map.
+                if (inserted) {
+                    // By default, give each group of managers 10% of the available disk space. Give each shard an equal share of the available space.
+                    it->second.max_shard_disk_space_size = boost::filesystem::space(shard_manager.hints_dir().c_str()).capacity / (10 * smp::count);
+                    // If hints directory is a mountpoint, we assume it's on dedicated (i.e. not shared with data/commitlog/etc) storage.
+                    // Then, reserve 90% of all space instead of 10% above.
+                    if (is_mountpoint) {
+                        it->second.max_shard_disk_space_size *= 9;
+                    }
                }
-                _per_device_limits_map.emplace(device_id, space_watchdog::per_device_limits{{std::ref(shard_manager)}, max_size});
+                it->second.managers.emplace_back(std::ref(shard_manager));
            });
        } else {
            it->second.managers.emplace_back(std::ref(shard_manager));
--- a/db/hints/resource_manager.hh
+++ b/db/hints/resource_manager.hh
@@ -22,6 +22,7 @@
 #pragma once

 #include <cstdint>
+#include <seastar/core/abort_source.hh>
 #include <seastar/core/semaphore.hh>
 #include <seastar/core/gate.hh>
 #include <seastar/core/memory.hh>
@@ -78,8 +79,8 @@ private:
    shard_managers_set& _shard_managers;
    per_device_limits_map& _per_device_limits_map;

-    seastar::gate _gate;
-    seastar::timer<timer_clock_type> _timer;
+    future<> _started = make_ready_future<>();
+    seastar::abort_source _as;
    int _files_count = 0;

 public:
@@ -137,6 +138,9 @@ public:
        , _space_watchdog(_shard_managers, _per_device_limits_map)
    {}

+    resource_manager(resource_manager&&) = delete;
+    resource_manager& operator=(resource_manager&&) = delete;
+
    future<semaphore_units<semaphore_default_exception_factory>> get_send_units_for(size_t buf_size);

    bool too_many_hints_in_progress() const {
@@ -156,6 +160,7 @@ public:
    }

    future<> start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
+    void allow_replaying() noexcept;
    future<> stop() noexcept;
    void register_manager(manager& m);
    future<> prepare_per_device_limits();
--- a/Show More
+++ b/Show More