sstable/compaction: Use correct schema in the writing consumer

Introduced in 2a437ab427. regular_compaction::select_sstable_writer() creates the sstable writer when the first partition is consumed from the combined mutation fragment stream. It gets the schema directly from the table object. That may be a different schema than the one used by the readers if there was a concurrent schema alter during that small time window. As a result, the writing consumer attached to readers will interpret fragments using the wrong version of the schema. One effect of this is storing values of some columns under a different column. This patch replaces all column_family::schema() accesses with accesses to the _schema member which is obtained once per compaction and is the same schema which readers use. Fixes #4304. Tests: - manual tests with hard-coded schema change injection to reproduce the bug - build/dev/scylla boot - tests/sstable_mutation_test Message-Id: <1551698056-23386-1-git-send-email-tgrabiec@scylladb.com> (cherry picked from commit 58e7ad20eb)
Merge "Fix commitlog chunks overwriting each other" from Paweł
2019-03-04 18:16:43 +02:00 · 2019-03-04 17:58:46 +02:00 · 2019-03-04 10:14:33 +02:00 · 2019-02-27 22:17:44 +02:00 · 2019-02-25 23:22:09 +02:00 · 2019-02-24 15:45:32 +02:00
755 changed files with 27361 additions and 9533 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,14 +1,14 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
 	url = ../scylla-swagger-ui
 	ignore = dirty
-[submodule "dist/ami/files/scylla-ami"]
-	path = dist/ami/files/scylla-ami
-	url = ../scylla-ami
 [submodule "xxHash"]
 	path = xxHash
 	url = ../xxHash
+[submodule "libdeflate"]
+	path = libdeflate
+	url = ../libdeflate
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -138,4 +138,5 @@ target_include_directories(scylla PUBLIC
        ${SEASTAR_INCLUDE_DIRS}
        ${Boost_INCLUDE_DIRS}
        xxhash
+        libdeflate
        build/release/gen)
--- a/HACKING.md
+++ b/HACKING.md
@@ -20,7 +20,7 @@ $ git submodule update --init --recursive

 Scylla depends on the system package manager for its development dependencies.

-Running `./install_dependencies.sh` (as root) installs the appropriate packages based on your Linux distribution.
+Running `./install-dependencies.sh` (as root) installs the appropriate packages based on your Linux distribution.

 ### Build system

--- a/README.md
+++ b/README.md
@@ -50,12 +50,12 @@ Then, to build an RPM, run:
 ./dist/redhat/build_rpm.sh
 ```

-The built RPM is stored in ``/var/lib/mock/<configuration>/result`` directory.
+The built RPM is stored in the ``build/mock/<configuration>/result`` directory.
 For example, on Fedora 21 mock reports the following:

 ```
 INFO: Done(scylla-server-0.00-1.fc21.src.rpm) Config(default) 20 minutes 7 seconds
-INFO: Results and/or logs in: /var/lib/mock/fedora-21-x86_64/result
+INFO: Results and/or logs in: build/mock/fedora-21-x86_64/result
 ```

 ## Building Fedora-based Docker image
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 #!/bin/sh

-VERSION=666.development
+VERSION=3.0.4

 if test -f version
 then
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -2228,11 +2228,11 @@
               "description":"The column family"
            },
            "total":{
-               "type":"int",
+               "type":"long",
               "description":"The total snapshot size"
            },
            "live":{
-               "type":"int",
+               "type":"long",
               "description":"The live snapshot size"
            }
         }
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -78,15 +78,17 @@ void set_storage_service(http_context& ctx, routes& r) {
        });
    });

-    ss::get_tokens.set(r, [] (const_req req) {
-        auto tokens = service::get_local_storage_service().get_token_metadata().sorted_tokens();
-        return container_to_vec(tokens);
+    ss::get_tokens.set(r, [] (std::unique_ptr<request> req) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().get_token_metadata().sorted_tokens(), [](const dht::token& i) {
+           return boost::lexical_cast<std::string>(i);
+        }));
    });

-    ss::get_node_tokens.set(r, [] (const_req req) {
-        gms::inet_address addr(req.param["endpoint"]);
-        auto tokens = service::get_local_storage_service().get_token_metadata().get_tokens(addr);
-        return container_to_vec(tokens);
+    ss::get_node_tokens.set(r, [] (std::unique_ptr<request> req) {
+        gms::inet_address addr(req->param["endpoint"]);
+        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().get_token_metadata().get_tokens(addr), [](const dht::token& i) {
+           return boost::lexical_cast<std::string>(i);
+       }));
    });

    ss::get_commitlog.set(r, [&ctx](const_req req) {
@@ -107,11 +109,7 @@ void set_storage_service(http_context& ctx, routes& r) {
    });

    ss::get_moving_nodes.set(r, [](const_req req) {
-        auto points = service::get_local_storage_service().get_token_metadata().get_moving_endpoints();
        std::unordered_set<sstring> addr;
-        for (auto i: points) {
-            addr.insert(boost::lexical_cast<std::string>(i.second));
-        }
        return container_to_vec(addr);
    });

--- a/atomic_cell.cc
+++ b/atomic_cell.cc
@@ -55,6 +55,15 @@ atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_typ
    );
 }

+atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_type timestamp, const fragmented_temporary_buffer::view& value, collection_member cm)
+{
+    auto& imr_data = type.imr_state();
+    return atomic_cell(
+        imr_data.type_info(),
+        imr_object_type::make(data::cell::make_live(imr_data.type_info(), timestamp, value, bool(cm)), &imr_data.lsa_migrator())
+    );
+}
+
 atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_type timestamp, bytes_view value,
                             gc_clock::time_point expiry, gc_clock::duration ttl, atomic_cell::collection_member cm) {
    auto& imr_data = type.imr_state();
@@ -73,6 +82,16 @@ atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_typ
    );
 }

+atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_type timestamp, const fragmented_temporary_buffer::view& value,
+                                   gc_clock::time_point expiry, gc_clock::duration ttl, collection_member cm)
+{
+    auto& imr_data = type.imr_state();
+    return atomic_cell(
+        imr_data.type_info(),
+        imr_object_type::make(data::cell::make_live(imr_data.type_info(), timestamp, value, expiry, ttl, bool(cm)), &imr_data.lsa_migrator())
+    );
+}
+
 atomic_cell atomic_cell::make_live_counter_update(api::timestamp_type timestamp, int64_t value) {
    auto& imr_data = no_type_imr_descriptor();
    return atomic_cell(
--- a/atomic_cell.hh
+++ b/atomic_cell.hh
@@ -33,6 +33,7 @@
 #include "data/cell.hh"
 #include "data/schema_info.hh"
 #include "imr/utils.hh"
+#include "utils/fragmented_temporary_buffer.hh"

 #include "serializer.hh"

@@ -190,6 +191,8 @@ public:
                                 collection_member = collection_member::no);
    static atomic_cell make_live(const abstract_type& type, api::timestamp_type timestamp, ser::buffer_view<bytes_ostream::fragment_iterator> value,
                                 collection_member = collection_member::no);
+    static atomic_cell make_live(const abstract_type& type, api::timestamp_type timestamp, const fragmented_temporary_buffer::view& value,
+                                 collection_member = collection_member::no);
    static atomic_cell make_live(const abstract_type& type, api::timestamp_type timestamp, const bytes& value,
                                 collection_member cm = collection_member::no) {
        return make_live(type, timestamp, bytes_view(value), cm);
@@ -199,6 +202,8 @@ public:
        gc_clock::time_point expiry, gc_clock::duration ttl, collection_member = collection_member::no);
    static atomic_cell make_live(const abstract_type&, api::timestamp_type timestamp, ser::buffer_view<bytes_ostream::fragment_iterator> value,
        gc_clock::time_point expiry, gc_clock::duration ttl, collection_member = collection_member::no);
+    static atomic_cell make_live(const abstract_type&, api::timestamp_type timestamp, const fragmented_temporary_buffer::view& value,
+        gc_clock::time_point expiry, gc_clock::duration ttl, collection_member = collection_member::no);
    static atomic_cell make_live(const abstract_type& type, api::timestamp_type timestamp, const bytes& value,
                                 gc_clock::time_point expiry, gc_clock::duration ttl, collection_member cm = collection_member::no)
    {
--- a/auth/common.cc
+++ b/auth/common.cc
@@ -28,6 +28,7 @@
 #include "database.hh"
 #include "schema_builder.hh"
 #include "service/migration_manager.hh"
+#include "timeout_config.hh"

 namespace auth {

@@ -86,12 +87,24 @@ future<> create_metadata_table_if_missing(
    return mm.announce_new_column_family(b.build(), false);
 }

-future<> wait_for_schema_agreement(::service::migration_manager& mm, const database& db) {
+future<> wait_for_schema_agreement(::service::migration_manager& mm, const database& db, seastar::abort_source& as) {
    static const auto pause = [] { return sleep(std::chrono::milliseconds(500)); };

-    return do_until([&db] { return db.get_version() != database::empty_version; }, pause).then([&mm] {
-        return do_until([&mm] { return mm.have_schema_agreement(); }, pause);
+    return do_until([&db, &as] {
+        as.check();
+        return db.get_version() != database::empty_version;
+    }, pause).then([&mm, &as] {
+        return do_until([&mm, &as] {
+            as.check();
+            return mm.have_schema_agreement();
+        }, pause);
    });
 }

+const timeout_config& internal_distributed_timeout_config() noexcept {
+    static const auto t = 5s;
+    static const timeout_config tc{t, t, t, t, t, t, t};
+    return tc;
+}
+
 }
--- a/auth/common.hh
+++ b/auth/common.hh
@@ -38,6 +38,7 @@
 using namespace std::chrono_literals;

 class database;
+class timeout_config;

 namespace service {
 class migration_manager;
@@ -80,6 +81,11 @@ future<> create_metadata_table_if_missing(
        stdx::string_view cql,
        ::service::migration_manager&);

-future<> wait_for_schema_agreement(::service::migration_manager&, const database&);
+future<> wait_for_schema_agreement(::service::migration_manager&, const database&, seastar::abort_source&);
+
+///
+/// Time-outs for internal, non-local CQL queries.
+///
+const timeout_config& internal_distributed_timeout_config() noexcept;

 }
--- a/auth/default_authorizer.cc
+++ b/auth/default_authorizer.cc
@@ -160,7 +160,7 @@ future<> default_authorizer::start() {
                _migration_manager).then([this] {
            _finished = do_after_system_ready(_as, [this] {
                return async([this] {
-                    wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
+                    wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();

                    if (legacy_metadata_exists()) {
                        if (!any_granted().get0()) {
@@ -178,7 +178,7 @@ future<> default_authorizer::start() {

 future<> default_authorizer::stop() {
    _as.request_abort();
-    return _finished.handle_exception_type([](const sleep_aborted&) {});
+    return _finished.handle_exception_type([](const sleep_aborted&) {}).handle_exception_type([](const abort_requested_exception&) {});
 }

 future<permission_set>
@@ -228,7 +228,7 @@ default_authorizer::modify(
        return _qp.process(
                query,
                db::consistency_level::ONE,
-                infinite_timeout_config,
+                internal_distributed_timeout_config(),
                {permissions::to_strings(set), sstring(role_name), resource.name()}).discard_result();
    });
 }
@@ -254,7 +254,7 @@ future<std::vector<permission_details>> default_authorizer::list_all() const {
    return _qp.process(
            query,
            db::consistency_level::ONE,
-            infinite_timeout_config,
+            internal_distributed_timeout_config(),
            {},
            true).then([](::shared_ptr<cql3::untyped_result_set> results) {
        std::vector<permission_details> all_details;
@@ -282,7 +282,7 @@ future<> default_authorizer::revoke_all(stdx::string_view role_name) const {
    return _qp.process(
            query,
            db::consistency_level::ONE,
-            infinite_timeout_config,
+            internal_distributed_timeout_config(),
            {sstring(role_name)}).discard_result().handle_exception([role_name](auto ep) {
        try {
            std::rethrow_exception(ep);
--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -41,11 +41,6 @@

 #include "auth/password_authenticator.hh"

-extern "C" {
-#include <crypt.h>
-#include <unistd.h>
-}
-
 #include <algorithm>
 #include <chrono>
 #include <random>
@@ -55,6 +50,7 @@ extern "C" {

 #include "auth/authenticated_user.hh"
 #include "auth/common.hh"
+#include "auth/passwords.hh"
 #include "auth/roles-metadata.hh"
 #include "cql3/untyped_result_set.hh"
 #include "log.hh"
@@ -82,6 +78,8 @@ static const class_registrator<
        cql3::query_processor&,
        ::service::migration_manager&> password_auth_reg("org.apache.cassandra.auth.PasswordAuthenticator");

+static thread_local auto rng_for_salt = std::default_random_engine(std::random_device{}());
+
 password_authenticator::~password_authenticator() {
 }

@@ -91,76 +89,6 @@ password_authenticator::password_authenticator(cql3::query_processor& qp, ::serv
    , _stopped(make_ready_future<>()) {
 }

-// TODO: blowfish
-// Origin uses Java bcrypt library, i.e. blowfish salt
-// generation and hashing, which is arguably a "better"
-// password hash than sha/md5 versions usually available in
-// crypt_r. Otoh, glibc 2.7+ uses a modified sha512 algo
-// which should be the same order of safe, so the only
-// real issue should be salted hash compatibility with
-// origin if importing system tables from there.
-//
-// Since bcrypt/blowfish is _not_ (afaict) not available
-// as a dev package/lib on most linux distros, we'd have to
-// copy and compile for example OWL  crypto
-// (http://cvsweb.openwall.com/cgi/cvsweb.cgi/Owl/packages/glibc/crypt_blowfish/)
-// to be fully bit-compatible.
-//
-// Until we decide this is needed, let's just use crypt_r,
-// and some old-fashioned random salt generation.
-
-static constexpr size_t rand_bytes = 16;
-static thread_local crypt_data tlcrypt = { 0, };
-
-static sstring hashpw(const sstring& pass, const sstring& salt) {
-    auto res = crypt_r(pass.c_str(), salt.c_str(), &tlcrypt);
-    if (res == nullptr) {
-        throw std::system_error(errno, std::system_category());
-    }
-    return res;
-}
-
-static bool checkpw(const sstring& pass, const sstring& salted_hash) {
-    auto tmp = hashpw(pass, salted_hash);
-    return tmp == salted_hash;
-}
-
-static sstring gensalt() {
-    static sstring prefix;
-
-    std::random_device rd;
-    std::default_random_engine e1(rd());
-    std::uniform_int_distribution<char> dist;
-
-    sstring valid_salt = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789./";
-    sstring input(rand_bytes, 0);
-
-    for (char&c : input) {
-        c = valid_salt[dist(e1) % valid_salt.size()];
-    }
-
-    sstring salt;
-
-    if (!prefix.empty()) {
-        return prefix + input;
-    }
-
-    // Try in order:
-    // blowfish 2011 fix, blowfish, sha512, sha256, md5
-    for (sstring pfx : { "$2y$", "$2a$", "$6$", "$5$", "$1$" }) {
-        salt = pfx + input;
-        if (crypt_r("fisk", salt.c_str(), &tlcrypt)) {
-            prefix = pfx;
-            return salt;
-        }
-    }
-    throw std::runtime_error("Could not initialize hashing algorithm");
-}
-
-static sstring hashpw(const sstring& pass) {
-    return hashpw(pass, gensalt());
-}
-
 static bool has_salted_hash(const cql3::untyped_result_set_row& row) {
    return !row.get_or<sstring>(SALTED_HASH, "").empty();
 }
@@ -184,7 +112,7 @@ future<> password_authenticator::migrate_legacy_metadata() const {
    return _qp.process(
            query,
            db::consistency_level::QUORUM,
-            infinite_timeout_config).then([this](::shared_ptr<cql3::untyped_result_set> results) {
+            internal_distributed_timeout_config()).then([this](::shared_ptr<cql3::untyped_result_set> results) {
        return do_for_each(*results, [this](const cql3::untyped_result_set_row& row) {
            auto username = row.get_as<sstring>("username");
            auto salted_hash = row.get_as<sstring>(SALTED_HASH);
@@ -192,7 +120,7 @@ future<> password_authenticator::migrate_legacy_metadata() const {
            return _qp.process(
                    update_row_query,
                    consistency_for_user(username),
-                    infinite_timeout_config,
+                    internal_distributed_timeout_config(),
                    {std::move(salted_hash), username}).discard_result();
        }).finally([results] {});
    }).then([] {
@@ -209,8 +137,8 @@ future<> password_authenticator::create_default_if_missing() const {
            return _qp.process(
                    update_row_query,
                    db::consistency_level::QUORUM,
-                    infinite_timeout_config,
-                    {hashpw(DEFAULT_USER_PASSWORD), DEFAULT_USER_NAME}).then([](auto&&) {
+                    internal_distributed_timeout_config(),
+                    {passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt), DEFAULT_USER_NAME}).then([](auto&&) {
                plogger.info("Created default superuser authentication record.");
            });
        }
@@ -221,8 +149,6 @@ future<> password_authenticator::create_default_if_missing() const {

 future<> password_authenticator::start() {
     return once_among_shards([this] {
-         gensalt(); // do this once to determine usable hashing
-
         auto f = create_metadata_table_if_missing(
                 meta::roles_table::name,
                 _qp,
@@ -231,7 +157,7 @@ future<> password_authenticator::start() {

         _stopped = do_after_system_ready(_as, [this] {
             return async([this] {
-                 wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
+                 wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();

                 if (any_nondefault_role_row_satisfies(_qp, &has_salted_hash).get0()) {
                     if (legacy_metadata_exists()) {
@@ -256,7 +182,7 @@ future<> password_authenticator::start() {

 future<> password_authenticator::stop() {
    _as.request_abort();
-    return _stopped.handle_exception_type([] (const sleep_aborted&) { });
+    return _stopped.handle_exception_type([] (const sleep_aborted&) { }).handle_exception_type([](const abort_requested_exception&) {});
 }

 db::consistency_level password_authenticator::consistency_for_user(stdx::string_view role_name) {
@@ -309,13 +235,17 @@ future<authenticated_user> password_authenticator::authenticate(
        return _qp.process(
                query,
                consistency_for_user(username),
-                infinite_timeout_config,
+                internal_distributed_timeout_config(),
                {username},
                true);
    }).then_wrapped([=](future<::shared_ptr<cql3::untyped_result_set>> f) {
        try {
            auto res = f.get0();
-            if (res->empty() || !checkpw(password, res->one().get_as<sstring>(SALTED_HASH))) {
+            auto salted_hash = std::experimental::optional<sstring>();
+            if (!res->empty()) {
+                salted_hash = res->one().get_opt<sstring>(SALTED_HASH);
+            }
+            if (!salted_hash || !passwords::check(password, *salted_hash)) {
                throw exceptions::authentication_exception("Username and/or password are incorrect");
            }
            return make_ready_future<authenticated_user>(username);
@@ -337,8 +267,8 @@ future<> password_authenticator::create(stdx::string_view role_name, const authe
    return _qp.process(
            update_row_query,
            consistency_for_user(role_name),
-            infinite_timeout_config,
-            {hashpw(*options.password), sstring(role_name)}).discard_result();
+            internal_distributed_timeout_config(),
+            {passwords::hash(*options.password, rng_for_salt), sstring(role_name)}).discard_result();
 }

 future<> password_authenticator::alter(stdx::string_view role_name, const authentication_options& options) const {
@@ -355,8 +285,8 @@ future<> password_authenticator::alter(stdx::string_view role_name, const authen
    return _qp.process(
            query,
            consistency_for_user(role_name),
-            infinite_timeout_config,
-            {hashpw(*options.password), sstring(role_name)}).discard_result();
+            internal_distributed_timeout_config(),
+            {passwords::hash(*options.password, rng_for_salt), sstring(role_name)}).discard_result();
 }

 future<> password_authenticator::drop(stdx::string_view name) const {
@@ -366,7 +296,10 @@ future<> password_authenticator::drop(stdx::string_view name) const {
            meta::roles_table::qualified_name(),
            meta::roles_table::role_col_name);

-    return _qp.process(query, consistency_for_user(name), infinite_timeout_config, {sstring(name)}).discard_result();
+    return _qp.process(
+            query, consistency_for_user(name),
+            internal_distributed_timeout_config(),
+            {sstring(name)}).discard_result();
 }

 future<custom_options> password_authenticator::query_custom_options(stdx::string_view role_name) const {
--- a/auth/passwords.cc
+++ b/auth/passwords.cc
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2018 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "auth/passwords.hh"
+
+#include <cerrno>
+#include <optional>
+
+extern "C" {
+#include <crypt.h>
+#include <unistd.h>
+}
+
+namespace auth::passwords {
+
+static thread_local crypt_data tlcrypt = { 0, };
+
+namespace detail {
+
+scheme identify_best_supported_scheme() {
+    const auto all_schemes = { scheme::bcrypt_y, scheme::bcrypt_a, scheme::sha_512, scheme::sha_256, scheme::md5 };
+    // "Random", for testing schemes.
+    const sstring random_part_of_salt = "aaaabbbbccccdddd";
+
+    for (scheme c : all_schemes) {
+        const sstring salt = sstring(prefix_for_scheme(c)) + random_part_of_salt;
+        const char* e = crypt_r("fisk", salt.c_str(), &tlcrypt);
+
+        if (e && (e[0] != '*')) {
+            return c;
+        }
+    }
+
+    throw no_supported_schemes();
+}
+
+sstring hash_with_salt(const sstring& pass, const sstring& salt) {
+    auto res = crypt_r(pass.c_str(), salt.c_str(), &tlcrypt);
+    if (!res || (res[0] == '*')) {
+        throw std::system_error(errno, std::system_category());
+    }
+    return res;
+}
+
+const char* prefix_for_scheme(scheme c) noexcept {
+    switch (c) {
+    case scheme::bcrypt_y: return "$2y$";
+    case scheme::bcrypt_a: return "$2a$";
+    case scheme::sha_512: return "$6$";
+    case scheme::sha_256: return "$5$";
+    case scheme::md5: return "$1$";
+    default: return nullptr;
+    }
+}
+
+} // namespace detail
+
+no_supported_schemes::no_supported_schemes()
+        : std::runtime_error("No allowed hashing schemes are supported on this system") {
+}
+
+bool check(const sstring& pass, const sstring& salted_hash) {
+    return detail::hash_with_salt(pass, salted_hash) == salted_hash;
+}
+
+} // namespace auth::passwords
--- a/auth/passwords.hh
+++ b/auth/passwords.hh
@@ -0,0 +1,125 @@
+/*
+ * Copyright (C) 2018 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <random>
+#include <stdexcept>
+
+#include <seastar/core/sstring.hh>
+
+#include "seastarx.hh"
+
+namespace auth::passwords {
+
+class no_supported_schemes : public std::runtime_error {
+public:
+    no_supported_schemes();
+};
+
+///
+/// Apache Cassandra uses a library to provide the bcrypt scheme. Many Linux implementations do not support bcrypt, so
+/// we support alternatives. The cost is loss of direct compatibility with Apache Cassandra system tables.
+///
+enum class scheme {
+    bcrypt_y,
+    bcrypt_a,
+    sha_512,
+    sha_256,
+    md5
+};
+
+namespace detail {
+
+template <typename RandomNumberEngine>
+sstring generate_random_salt_bytes(RandomNumberEngine& g) {
+    static const sstring valid_bytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789./";
+    static constexpr std::size_t num_bytes = 16;
+    std::uniform_int_distribution<std::size_t> dist(0, valid_bytes.size() - 1);
+    sstring result(num_bytes, 0);
+
+    for (char& c : result) {
+        c = valid_bytes[dist(g)];
+    }
+
+    return result;
+}
+
+///
+/// Test each allowed hashing scheme and report the best supported one on the current system.
+///
+/// \throws \ref no_supported_schemes when none of the known schemes is supported.
+///
+scheme identify_best_supported_scheme();
+
+const char* prefix_for_scheme(scheme) noexcept;
+
+///
+/// Generate an implementation-specific salt string for hashing passwords.
+///
+/// The `RandomNumberEngine` is used to generate the string, which has an implementation-specific length.
+///
+/// \throws \ref no_supported_schemes when no known hashing schemes are supported on the system.
+///
+template <typename RandomNumberEngine>
+sstring generate_salt(RandomNumberEngine& g) {
+    static const scheme scheme = identify_best_supported_scheme();
+    static const sstring prefix = sstring(prefix_for_scheme(scheme));
+    return prefix + generate_random_salt_bytes(g);
+}
+
+///
+/// Hash a password combined with an implementation-specific salt string.
+///
+/// \throws \ref std::system_error when an unexpected implementation-specific error occurs.
+///
+sstring hash_with_salt(const sstring& pass, const sstring& salt);
+
+} // namespace detail
+
+///
+/// Run a one-way hashing function on cleartext to produce encrypted text.
+///
+/// Prior to applying the hashing function, random salt is added to the cleartext. The random salt bytes are generated
+/// according to the random number engine `g`.
+///
+/// The result is the encrypted ciphertext together with the salt used, in an implementation-specific format.
+///
+/// \throws \ref std::system_error when the underlying implementation fails to hash the cleartext.
+///
+template <typename RandomNumberEngine>
+sstring hash(const sstring& pass, RandomNumberEngine& g) {
+    return detail::hash_with_salt(pass, detail::generate_salt(g));
+}
+
+///
+/// Check that cleartext matches previously hashed cleartext with salt.
+///
+/// \ref salted_hash is the result of invoking \ref hash, which is the implementation-specific combination of the hashed
+/// password and the salt that was generated for it.
+///
+/// \returns `true` if the cleartext matches the salted hash.
+///
+/// \throws \ref std::system_error when an unexpected implementation-specific error occurs.
+///
+bool check(const sstring& pass, const sstring& salted_hash);
+
+} // namespace auth::passwords
--- a/auth/roles-metadata.cc
+++ b/auth/roles-metadata.cc
@@ -79,7 +79,7 @@ future<bool> default_role_row_satisfies(
                return qp.process(
                        query,
                        db::consistency_level::QUORUM,
-                        infinite_timeout_config,
+                        internal_distributed_timeout_config(),
                        {meta::DEFAULT_SUPERUSER_NAME},
                        true).then([&p](::shared_ptr<cql3::untyped_result_set> results) {
                    if (results->empty()) {
@@ -104,7 +104,7 @@ future<bool> any_nondefault_role_row_satisfies(
        return qp.process(
                query,
                db::consistency_level::QUORUM,
-                infinite_timeout_config).then([&p](::shared_ptr<cql3::untyped_result_set> results) {
+                internal_distributed_timeout_config()).then([&p](::shared_ptr<cql3::untyped_result_set> results) {
            if (results->empty()) {
                return false;
            }
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -196,6 +196,10 @@ future<> service::start() {
 }

 future<> service::stop() {
+    // Only one of the shards has the listener registered, but let's try to
+    // unregister on each one just to make sure.
+    _migration_manager.unregister_listener(_migration_listener.get());
+
    return _permissions_cache->stop().then([this] {
        return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop());
    });
--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -89,7 +89,7 @@ static future<stdx::optional<record>> find_record(cql3::query_processor& qp, std
    return qp.process(
            query,
            consistency_for_role(role_name),
-            infinite_timeout_config,
+            internal_distributed_timeout_config(),
            {sstring(role_name)},
            true).then([](::shared_ptr<cql3::untyped_result_set> results) {
        if (results->empty()) {
@@ -174,7 +174,7 @@ future<> standard_role_manager::create_default_role_if_missing() const {
            return _qp.process(
                    query,
                    db::consistency_level::QUORUM,
-                    infinite_timeout_config,
+                    internal_distributed_timeout_config(),
                    {meta::DEFAULT_SUPERUSER_NAME}).then([](auto&&) {
                log.info("Created default superuser role '{}'.", meta::DEFAULT_SUPERUSER_NAME);
                return make_ready_future<>();
@@ -201,7 +201,7 @@ future<> standard_role_manager::migrate_legacy_metadata() const {
    return _qp.process(
            query,
            db::consistency_level::QUORUM,
-            infinite_timeout_config).then([this](::shared_ptr<cql3::untyped_result_set> results) {
+            internal_distributed_timeout_config()).then([this](::shared_ptr<cql3::untyped_result_set> results) {
        return do_for_each(*results, [this](const cql3::untyped_result_set_row& row) {
            role_config config;
            config.is_superuser = row.get_as<bool>("super");
@@ -227,7 +227,7 @@ future<> standard_role_manager::start() {
        return this->create_metadata_tables_if_missing().then([this] {
            _stopped = auth::do_after_system_ready(_as, [this] {
                return seastar::async([this] {
-                    wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
+                    wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();

                    if (any_nondefault_role_row_satisfies(_qp, &has_can_login).get0()) {
                        if (this->legacy_metadata_exists()) {
@@ -251,7 +251,7 @@ future<> standard_role_manager::start() {

 future<> standard_role_manager::stop() {
    _as.request_abort();
-    return _stopped.handle_exception_type([] (const sleep_aborted&) { });
+    return _stopped.handle_exception_type([] (const sleep_aborted&) { }).handle_exception_type([](const abort_requested_exception&) {});
 }

 future<> standard_role_manager::create_or_replace(stdx::string_view role_name, const role_config& c) const {
@@ -263,7 +263,7 @@ future<> standard_role_manager::create_or_replace(stdx::string_view role_name, c
    return _qp.process(
            query,
            consistency_for_role(role_name),
-            infinite_timeout_config,
+            internal_distributed_timeout_config(),
            {sstring(role_name), c.is_superuser, c.can_login},
            true).discard_result();
 }
@@ -307,7 +307,7 @@ standard_role_manager::alter(stdx::string_view role_name, const role_config_upda
                        build_column_assignments(u),
                        meta::roles_table::role_col_name),
                consistency_for_role(role_name),
-                infinite_timeout_config,
+                internal_distributed_timeout_config(),
                {sstring(role_name)}).discard_result();
    });
 }
@@ -327,7 +327,7 @@ future<> standard_role_manager::drop(stdx::string_view role_name) const {
            return _qp.process(
                    query,
                    consistency_for_role(role_name),
-                    infinite_timeout_config,
+                    internal_distributed_timeout_config(),
                    {sstring(role_name)}).then([this, role_name](::shared_ptr<cql3::untyped_result_set> members) {
                return parallel_for_each(
                        members->begin(),
@@ -367,7 +367,7 @@ future<> standard_role_manager::drop(stdx::string_view role_name) const {
            return _qp.process(
                    query,
                    consistency_for_role(role_name),
-                    infinite_timeout_config,
+                    internal_distributed_timeout_config(),
                    {sstring(role_name)}).discard_result();
        };

@@ -394,7 +394,7 @@ standard_role_manager::modify_membership(
        return _qp.process(
                query,
                consistency_for_role(grantee_name),
-                infinite_timeout_config,
+                internal_distributed_timeout_config(),
                {role_set{sstring(role_name)}, sstring(grantee_name)}).discard_result();
    };

@@ -406,7 +406,7 @@ standard_role_manager::modify_membership(
                                "INSERT INTO %s (role, member) VALUES (?, ?)",
                                meta::role_members_table::qualified_name()),
                        consistency_for_role(role_name),
-                        infinite_timeout_config,
+                        internal_distributed_timeout_config(),
                        {sstring(role_name), sstring(grantee_name)}).discard_result();

            case membership_change::remove:
@@ -415,7 +415,7 @@ standard_role_manager::modify_membership(
                                "DELETE FROM %s WHERE role = ? AND member = ?",
                                meta::role_members_table::qualified_name()),
                        consistency_for_role(role_name),
-                        infinite_timeout_config,
+                        internal_distributed_timeout_config(),
                        {sstring(role_name), sstring(grantee_name)}).discard_result();
        }

@@ -516,7 +516,10 @@ future<role_set> standard_role_manager::query_all() const {
    // To avoid many copies of a view.
    static const auto role_col_name_string = sstring(meta::roles_table::role_col_name);

-    return _qp.process(query, db::consistency_level::QUORUM, infinite_timeout_config).then([](::shared_ptr<cql3::untyped_result_set> results) {
+    return _qp.process(
+            query,
+            db::consistency_level::QUORUM,
+            internal_distributed_timeout_config()).then([](::shared_ptr<cql3::untyped_result_set> results) {
        role_set roles;

        std::transform(
--- a/backlog_controller.hh
+++ b/backlog_controller.hh
@@ -77,7 +77,7 @@ protected:
        , _io_priority(iop)
        , _interval(interval)
        , _update_timer([this] { adjust(); })
-        , _control_points({{0,0}})
+        , _control_points()
        , _current_backlog(std::move(backlog))
        , _inflight_update(make_ready_future<>())
    {
@@ -125,7 +125,7 @@ public:
    flush_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, float static_shares) : backlog_controller(sg, iop, static_shares) {}
    flush_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, std::chrono::milliseconds interval, float soft_limit, std::function<float()> current_dirty)
        : backlog_controller(sg, iop, std::move(interval),
-          std::vector<backlog_controller::control_point>({{soft_limit, 10}, {soft_limit + (hard_dirty_limit - soft_limit) / 2, 200} , {hard_dirty_limit, 1000}}),
+          std::vector<backlog_controller::control_point>({{0.0, 0.0}, {soft_limit, 10}, {soft_limit + (hard_dirty_limit - soft_limit) / 2, 200} , {hard_dirty_limit, 1000}}),
          std::move(current_dirty)
        )
    {}
@@ -139,7 +139,7 @@ public:
    compaction_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, float static_shares) : backlog_controller(sg, iop, static_shares) {}
    compaction_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, std::chrono::milliseconds interval, std::function<float()> current_backlog)
        : backlog_controller(sg, iop, std::move(interval),
-          std::vector<backlog_controller::control_point>({{0.5, 10}, {1.5, 100} , {normalization_factor, 1000}}),
+          std::vector<backlog_controller::control_point>({{0.0, 50}, {1.5, 100} , {normalization_factor, 1000}}),
          std::move(current_backlog)
        )
    {}
--- a/bytes.hh
+++ b/bytes.hh
@@ -35,6 +35,10 @@ using bytes_mutable_view = basic_mutable_view<bytes_view::value_type>;
 using bytes_opt = std::experimental::optional<bytes>;
 using sstring_view = std::experimental::string_view;

+inline sstring_view to_sstring_view(bytes_view view) {
+    return sstring_view(reinterpret_cast<const char*>(view.data()), view.size());
+}
+
 namespace std {

 template <>
--- a/bytes_ostream.hh
+++ b/bytes_ostream.hh
@@ -38,7 +38,7 @@ class bytes_ostream {
 public:
    using size_type = bytes::size_type;
    using value_type = bytes::value_type;
-    static constexpr size_type max_chunk_size() { return 16 * 1024; }
+    static constexpr size_type max_chunk_size() { return 128 * 1024; }
 private:
    static_assert(sizeof(value_type) == 1, "value_type is assumed to be one byte long");
    struct chunk {
@@ -57,12 +57,12 @@ private:
        value_type data[0];
        void operator delete(void* ptr) { free(ptr); }
    };
-    // FIXME: consider increasing chunk size as the buffer grows
-    static constexpr size_type chunk_size{512};
+    static constexpr size_type default_chunk_size{512};
 private:
    std::unique_ptr<chunk> _begin;
    chunk* _current;
    size_type _size;
+    size_type _initial_chunk_size = default_chunk_size;
 public:
    class fragment_iterator : public std::iterator<std::input_iterator_tag, bytes_view> {
        chunk* _current = nullptr;
@@ -102,13 +102,13 @@ private:
    }
    // Figure out next chunk size.
    //   - must be enough for data_size
-    //   - must be at least chunk_size
+    //   - must be at least _initial_chunk_size
    //   - try to double each time to prevent too many allocations
    //   - do not exceed max_chunk_size
    size_type next_alloc_size(size_t data_size) const {
        auto next_size = _current
                ? _current->size * 2
-                : chunk_size;
+                : _initial_chunk_size;
        next_size = std::min(next_size, max_chunk_size());
        // FIXME: check for overflow?
        return std::max<size_type>(next_size, data_size + sizeof(chunk));
@@ -116,13 +116,19 @@ private:
    // Makes room for a contiguous region of given size.
    // The region is accounted for as already written.
    // size must not be zero.
+    [[gnu::always_inline]]
    value_type* alloc(size_type size) {
-        if (size <= current_space_left()) {
+        if (__builtin_expect(size <= current_space_left(), true)) {
            auto ret = _current->data + _current->offset;
            _current->offset += size;
            _size += size;
            return ret;
        } else {
+            return alloc_new(size);
+        }
+    }
+    [[gnu::noinline]]
+    value_type* alloc_new(size_type size) {
            auto alloc_size = next_alloc_size(size);
            auto space = malloc(alloc_size);
            if (!space) {
@@ -140,19 +146,22 @@ private:
            }
            _size += size;
            return _current->data;
-        };
    }
 public:
-    bytes_ostream() noexcept
+    explicit bytes_ostream(size_t initial_chunk_size) noexcept
        : _begin()
        , _current(nullptr)
        , _size(0)
+        , _initial_chunk_size(initial_chunk_size)
    { }

+    bytes_ostream() noexcept : bytes_ostream(default_chunk_size) {}
+
    bytes_ostream(bytes_ostream&& o) noexcept
        : _begin(std::move(o._begin))
        , _current(o._current)
        , _size(o._size)
+        , _initial_chunk_size(o._initial_chunk_size)
    {
        o._current = nullptr;
        o._size = 0;
@@ -162,6 +171,7 @@ public:
        : _begin()
        , _current(nullptr)
        , _size(0)
+        , _initial_chunk_size(o._initial_chunk_size)
    {
        append(o);
    }
@@ -199,18 +209,20 @@ public:
        return place_holder<T>{alloc(sizeof(T))};
    }

+    [[gnu::always_inline]]
    value_type* write_place_holder(size_type size) {
        return alloc(size);
    }

    // Writes given sequence of bytes
+    [[gnu::always_inline]]
    inline void write(bytes_view v) {
        if (v.empty()) {
            return;
        }

        auto this_size = std::min(v.size(), size_t(current_space_left()));
-        if (this_size) {
+        if (__builtin_expect(this_size, true)) {
            memcpy(_current->data + _current->offset, v.begin(), this_size);
            _current->offset += this_size;
            _size += this_size;
@@ -219,11 +231,12 @@ public:

        while (!v.empty()) {
            auto this_size = std::min(v.size(), size_t(max_chunk_size()));
-            std::copy_n(v.begin(), this_size, alloc(this_size));
+            std::copy_n(v.begin(), this_size, alloc_new(this_size));
            v.remove_prefix(this_size);
        }
    }

+    [[gnu::always_inline]]
    void write(const char* ptr, size_t size) {
        write(bytes_view(reinterpret_cast<const signed char*>(ptr), size));
    }
@@ -393,6 +406,21 @@ public:
    bool operator!=(const bytes_ostream& other) const {
        return !(*this == other);
    }
+
+    // Resets this instance to the empty state.
+    //
+    // The first buffer is kept (only rewound), while any buffers chained
+    // after it are released. Callers may therefore rely on writes of less
+    // than the initial chunk size between clear() calls not allocating,
+    // except for the first write made on this instance.
+    void clear() {
+        if (!_begin) { return; }
+        chunk* const head = _begin.get();
+        head->next.reset();
+        head->offset = 0;
+        _current = head;
+        _size = 0;
+    }
 };

 template<>
--- a/cache_flat_mutation_reader.hh
+++ b/cache_flat_mutation_reader.hh
@@ -60,6 +60,7 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
        // - _next_row_in_range = _next.position() < _upper_bound
        // - _last_row points at a direct predecessor of the next row which is going to be read.
        //   Used for populating continuity.
+        // - _population_range_starts_before_all_rows is set accordingly
        reading_from_underlying,

        end_of_stream
@@ -86,6 +87,13 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
    partition_snapshot_row_cursor _next_row;
    bool _next_row_in_range = false;

+    // True iff the current population interval, i.e. the one since the previous clustering row, starts before
+    // all clustered rows. We cannot just look at _lower_bound: emission of range tombstones changes _lower_bound,
+    // and since clustering intervals are marked as continuous when a clustering_row is consumed, relying on it
+    // would prevent us from marking the interval as continuous.
+    // Only meaningful when _state == reading_from_underlying; defaults to false so it is never read uninitialized.
+    bool _population_range_starts_before_all_rows = false;
+
    // Whether _lower_bound was changed within current fill_buffer().
    // If it did not then we cannot break out of it (e.g. on preemption) because
    // forward progress is not guaranteed in case iterators are getting constantly invalidated.
@@ -228,6 +236,7 @@ inline
 future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_point timeout) {
    if (_state == state::move_to_underlying) {
        _state = state::reading_from_underlying;
+        _population_range_starts_before_all_rows = _lower_bound.is_before_all_clustered_rows(*_schema);
        auto end = _next_row_in_range ? position_in_partition(_next_row.position())
                                      : position_in_partition(_upper_bound);
        return _read_context->fast_forward_to(position_range{_lower_bound, std::move(end)}, timeout).then([this, timeout] {
@@ -352,12 +361,12 @@ future<> cache_flat_mutation_reader::read_from_underlying(db::timeout_clock::tim
                }
            });
            return make_ready_future<>();
-        });
+        }, timeout);
 }

 inline
 bool cache_flat_mutation_reader::ensure_population_lower_bound() {
-    if (!_ck_ranges_curr->start()) {
+    if (_population_range_starts_before_all_rows) {
        return true;
    }
    if (!_last_row.refresh(*_snp)) {
@@ -412,6 +421,7 @@ inline
 void cache_flat_mutation_reader::maybe_add_to_cache(const clustering_row& cr) {
    if (!can_populate()) {
        _last_row = nullptr;
+        _population_range_starts_before_all_rows = false;
        _read_context->cache().on_mispopulate();
        return;
    }
@@ -445,6 +455,7 @@ void cache_flat_mutation_reader::maybe_add_to_cache(const clustering_row& cr) {
        with_allocator(standard_allocator(), [&] {
            _last_row = partition_snapshot_row_weakref(*_snp, it, true);
        });
+        _population_range_starts_before_all_rows = false;
    });
 }

--- a/cell_locking.hh
+++ b/cell_locking.hh
@@ -24,9 +24,9 @@
 #include <boost/intrusive/unordered_set.hpp>

 #include "utils/small_vector.hh"
-#include "fnv1a_hasher.hh"
 #include "mutation_fragment.hh"
 #include "mutation_partition.hh"
+#include "xx_hasher.hh"

 #include "db/timeout_clock.hh"

@@ -194,10 +194,10 @@ private:
            explicit hasher(const schema& s) : _schema(&s) { }

            size_t operator()(const cell_address& ca) const {
-                fnv1a_hasher hasher;
+                xx_hasher hasher;
                ca.position.feed_hash(hasher, *_schema);
                ::feed_hash(hasher, ca.id);
-                return hasher.finalize();
+                return static_cast<size_t>(hasher.finalize_uint64());
            }
            size_t operator()(const cell_entry& ce) const {
                return operator()(ce._address);
--- a/clustering_key_filter.hh
+++ b/clustering_key_filter.hh
@@ -30,7 +30,7 @@ namespace query {

 class clustering_key_filter_ranges {
    clustering_row_ranges _storage;
-    const clustering_row_ranges& _ref;
+    std::reference_wrapper<const clustering_row_ranges> _ref;
 public:
    clustering_key_filter_ranges(const clustering_row_ranges& ranges) : _ref(ranges) { }
    struct reversed { };
@@ -39,21 +39,21 @@ public:

    clustering_key_filter_ranges(clustering_key_filter_ranges&& other) noexcept
        : _storage(std::move(other._storage))
-        , _ref(&other._ref == &other._storage ? _storage : other._ref)
+        , _ref(&other._ref.get() == &other._storage ? _storage : other._ref.get())
    { }

    clustering_key_filter_ranges& operator=(clustering_key_filter_ranges&& other) noexcept {
        if (this != &other) {
-            this->~clustering_key_filter_ranges();
-            new (this) clustering_key_filter_ranges(std::move(other));
+            _storage = std::move(other._storage);
+            _ref = (&other._ref.get() == &other._storage) ? _storage : other._ref.get();
        }
        return *this;
    }

-    auto begin() const { return _ref.begin(); }
-    auto end() const { return _ref.end(); }
-    bool empty() const { return _ref.empty(); }
-    size_t size() const { return _ref.size(); }
+    auto begin() const { return _ref.get().begin(); }
+    auto end() const { return _ref.get().end(); }
+    bool empty() const { return _ref.get().empty(); }
+    size_t size() const { return _ref.get().size(); }
    const clustering_row_ranges& ranges() const { return _ref; }

    static clustering_key_filter_ranges get_ranges(const schema& schema, const query::partition_slice& slice, const partition_key& key) {
--- a/clustering_ranges_walker.hh
+++ b/clustering_ranges_walker.hh
@@ -31,72 +31,61 @@
 class clustering_ranges_walker {
    const schema& _schema;
    const query::clustering_row_ranges& _ranges;
-    query::clustering_row_ranges::const_iterator _current;
-    query::clustering_row_ranges::const_iterator _end;
+    boost::iterator_range<query::clustering_row_ranges::const_iterator> _current_range;
    bool _in_current; // next position is known to be >= _current_start
    bool _with_static_row;
    position_in_partition_view _current_start;
    position_in_partition_view _current_end;
-    stdx::optional<position_in_partition> _trim;
+    std::optional<position_in_partition> _trim;
    size_t _change_counter = 1;
 private:
    bool advance_to_next_range() {
        _in_current = false;
        if (!_current_start.is_static_row()) {
-            if (_current == _end) {
+            if (!_current_range) {
                return false;
            }
-            ++_current;
+            _current_range.advance_begin(1);
        }
        ++_change_counter;
-        if (_current == _end) {
+        if (!_current_range) {
            _current_end = _current_start = position_in_partition_view::after_all_clustered_rows();
            return false;
        }
-        _current_start = position_in_partition_view::for_range_start(*_current);
-        _current_end = position_in_partition_view::for_range_end(*_current);
+        _current_start = position_in_partition_view::for_range_start(_current_range.front());
+        _current_end = position_in_partition_view::for_range_end(_current_range.front());
        return true;
    }
-public:
-    clustering_ranges_walker(const schema& s, const query::clustering_row_ranges& ranges, bool with_static_row = true)
-        : _schema(s)
-        , _ranges(ranges)
-        , _current(ranges.begin())
-        , _end(ranges.end())
-        , _in_current(with_static_row)
-        , _with_static_row(with_static_row)
-        , _current_start(position_in_partition_view::for_static_row())
-        , _current_end(position_in_partition_view::before_all_clustered_rows())
-    {
-        if (!with_static_row) {
-            if (_current == _end) {
+
+    void set_current_positions() {
+         if (!_with_static_row) {
+            if (!_current_range) {
                _current_start = position_in_partition_view::before_all_clustered_rows();
            } else {
-                _current_start = position_in_partition_view::for_range_start(*_current);
-                _current_end = position_in_partition_view::for_range_end(*_current);
+                _current_start = position_in_partition_view::for_range_start(_current_range.front());
+                _current_end = position_in_partition_view::for_range_end(_current_range.front());
            }
        }
    }
-    clustering_ranges_walker(clustering_ranges_walker&& o) noexcept
-        : _schema(o._schema)
-        , _ranges(o._ranges)
-        , _current(o._current)
-        , _end(o._end)
-        , _in_current(o._in_current)
-        , _with_static_row(o._with_static_row)
-        , _current_start(o._current_start)
-        , _current_end(o._current_end)
-        , _trim(std::move(o._trim))
-        , _change_counter(o._change_counter)
-    { }
-    clustering_ranges_walker& operator=(clustering_ranges_walker&& o) {
-        if (this != &o) {
-            this->~clustering_ranges_walker();
-            new (this) clustering_ranges_walker(std::move(o));
-        }
-        return *this;
+
+public:
+    clustering_ranges_walker(const schema& s, const query::clustering_row_ranges& ranges, bool with_static_row = true)
+            : _schema(s)
+            , _ranges(ranges)
+            , _current_range(ranges)
+            , _in_current(with_static_row)
+            , _with_static_row(with_static_row)
+            , _current_start(position_in_partition_view::for_static_row())
+            , _current_end(position_in_partition_view::before_all_clustered_rows()) {
+        set_current_positions();
    }

+    clustering_ranges_walker(const clustering_ranges_walker&) = delete;
+    clustering_ranges_walker(clustering_ranges_walker&&) = delete;
+
+    clustering_ranges_walker& operator=(const clustering_ranges_walker&) = delete;
+    clustering_ranges_walker& operator=(clustering_ranges_walker&&) = delete;
+
    // Excludes positions smaller than pos from the ranges.
    // pos should be monotonic.
    // No constraints between pos and positions passed to advance_to().
@@ -173,17 +162,15 @@ public:
            return false;
        }

-        auto i = _current;
-        while (i != _end) {
-            auto range_start = position_in_partition_view::for_range_start(*i);
+        for (const auto& rng : _current_range) {
+            auto range_start = position_in_partition_view::for_range_start(rng);
            if (!less(range_start, end)) {
                return false;
            }
-            auto range_end = position_in_partition_view::for_range_end(*i);
+            auto range_end = position_in_partition_view::for_range_end(rng);
            if (less(start, range_end)) {
                return true;
            }
-            ++i;
        }

        return false;
@@ -191,18 +178,20 @@ public:

    // Returns true if advanced past all contained positions. Any later advance_to() until reset() will return false.
    bool out_of_range() const {
-        return !_in_current && _current == _end;
+        return !_in_current && !_current_range;
    }

    // Resets the state of the walker so that advance_to() can be now called for new sequence of positions.
    // Any range trimmings still hold after this.
    void reset() {
-        auto trim = std::move(_trim);
-        auto ctr = _change_counter;
-        *this = clustering_ranges_walker(_schema, _ranges, _with_static_row);
-        _change_counter = ctr + 1;
-        if (trim) {
-            trim_front(std::move(*trim));
+        _current_range = _ranges;
+        _in_current = _with_static_row;
+        _current_start = position_in_partition_view::for_static_row();
+        _current_end = position_in_partition_view::before_all_clustered_rows();
+        set_current_positions();
+        ++_change_counter;
+        if (_trim) {
+            trim_front(*std::exchange(_trim, {}));
        }
    }

@@ -211,6 +200,11 @@ public:
        return _current_start;
    }

+    // Returns the upper bound of the last range in the provided ranges set. NOTE(review): uses _ranges.back(), which presumably requires a non-empty ranges set - confirm callers guarantee it.
+    position_in_partition_view uppermost_bound() const {
+        return position_in_partition_view::for_range_end(_ranges.back());
+    }
+
    // When lower_bound() changes, this also does
    // Always > 0.
    size_t lower_bound_change_counter() const {
--- a/compress.cc
+++ b/compress.cc
@@ -112,7 +112,7 @@ const sstring compression_parameters::CHUNK_LENGTH_KB = "chunk_length_kb";
 const sstring compression_parameters::CRC_CHECK_CHANCE = "crc_check_chance";

 compression_parameters::compression_parameters()
-    : compression_parameters(nullptr)
+    : compression_parameters(compressor::lz4)
 {}

 compression_parameters::~compression_parameters()
--- a/compress.hh
+++ b/compress.hh
@@ -118,6 +118,10 @@ public:
    std::map<sstring, sstring> get_options() const;
    bool operator==(const compression_parameters& other) const;
    bool operator!=(const compression_parameters& other) const;
+
+    static compression_parameters no_compression() {
+        return compression_parameters(nullptr);
+    }
 private:
    void validate_options(const std::map<sstring, sstring>&);
 };
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -242,6 +242,9 @@ batch_size_fail_threshold_in_kb: 50

 # The directory where hints files are stored if hinted handoff is enabled.
 # hints_directory: /var/lib/scylla/hints
+
+# The directory where hints files are stored for materialized-view updates
+# view_hints_directory: /var/lib/scylla/view_hints

 # See http://wiki.apache.org/cassandra/HintedHandoff
 # May either be "true" or "false" to enable globally, or contain a list
--- a/configure.py
+++ b/configure.py
--- a/converting_mutation_partition_applier.hh
+++ b/converting_mutation_partition_applier.hh
@@ -38,44 +38,44 @@ private:
    static bool is_compatible(const column_definition& new_def, const data_type& old_type, column_kind kind) {
        return ::is_compatible(new_def.kind, kind) && new_def.type->is_value_compatible_with(*old_type);
    }
+    static atomic_cell upgrade_cell(const abstract_type& new_type, const abstract_type& old_type, atomic_cell_view cell,
+                                    atomic_cell::collection_member cm = atomic_cell::collection_member::no) {
+        // Cells that are not live, and cells of a counter type, are copied over under the new type.
+        if (!cell.is_live() || old_type.is_counter()) {
+            return atomic_cell(new_type, cell);
+        }
+        if (cell.is_live_and_has_ttl()) {
+            return atomic_cell::make_live(new_type, cell.timestamp(), cell.value().linearize(), cell.expiry(), cell.ttl(), cm);
+        }
+        return atomic_cell::make_live(new_type, cell.timestamp(), cell.value().linearize(), cm);
+    }
    static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, atomic_cell_view cell) {
        if (!is_compatible(new_def, old_type, kind) || cell.timestamp() <= new_def.dropped_at()) {
            return;
        }
-        auto new_cell = [&] {
-            if (cell.is_live() && !old_type->is_counter()) {
-                if (cell.is_live_and_has_ttl()) {
-                    return atomic_cell_or_collection(
-                        atomic_cell::make_live(*new_def.type, cell.timestamp(), cell.value().linearize(), cell.expiry(), cell.ttl())
-                    );
-                }
-                return atomic_cell_or_collection(
-                    atomic_cell::make_live(*new_def.type, cell.timestamp(), cell.value().linearize())
-                );
-            } else {
-                return atomic_cell_or_collection(*new_def.type, cell);
-            }
-        }();
-        dst.apply(new_def, std::move(new_cell));
+        dst.apply(new_def, upgrade_cell(*new_def.type, *old_type, cell));
    }
    static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, collection_mutation_view cell) {
        if (!is_compatible(new_def, old_type, kind)) {
            return;
        }
      cell.data.with_linearized([&] (bytes_view cell_bv) {
-        auto&& ctype = static_pointer_cast<const collection_type_impl>(old_type);
-        auto old_view = ctype->deserialize_mutation_form(cell_bv);
+        auto new_ctype = static_pointer_cast<const collection_type_impl>(new_def.type);
+        auto old_ctype = static_pointer_cast<const collection_type_impl>(old_type);
+        auto old_view = old_ctype->deserialize_mutation_form(cell_bv);

-        collection_type_impl::mutation_view new_view;
+        collection_type_impl::mutation new_view;
        if (old_view.tomb.timestamp > new_def.dropped_at()) {
            new_view.tomb = old_view.tomb;
        }
        for (auto& c : old_view.cells) {
            if (c.second.timestamp() > new_def.dropped_at()) {
-                new_view.cells.emplace_back(std::move(c));
+                new_view.cells.emplace_back(c.first, upgrade_cell(*new_ctype->value_comparator(), *old_ctype->value_comparator(), c.second, atomic_cell::collection_member::yes));
            }
        }
-        dst.apply(new_def, ctype->serialize_mutation_form(std::move(new_view)));
+        if (new_view.tomb || !new_view.cells.empty()) {
+            dst.apply(new_def, new_ctype->serialize_mutation_form(std::move(new_view)));
+        }
      });
    }
 public:
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -470,12 +470,13 @@ insertStatement returns [::shared_ptr<raw::modification_statement> expr]
        std::vector<::shared_ptr<cql3::column_identifier::raw>> column_names;
        std::vector<::shared_ptr<cql3::term::raw>> values;
        bool if_not_exists = false;
+        bool default_unset = false;
        ::shared_ptr<cql3::term::raw> json_value;
    }
    : K_INSERT K_INTO cf=columnFamilyName
-          '(' c1=cident { column_names.push_back(c1); }  ( ',' cn=cident { column_names.push_back(cn); } )* ')'
-        ( K_VALUES
-              '(' v1=term { values.push_back(v1); } ( ',' vn=term { values.push_back(vn); } )* ')'
+        ('(' c1=cident { column_names.push_back(c1); }  ( ',' cn=cident { column_names.push_back(cn); } )* ')'
+            K_VALUES
+            '(' v1=term { values.push_back(v1); } ( ',' vn=term { values.push_back(vn); } )* ')'
            ( K_IF K_NOT K_EXISTS { if_not_exists = true; } )?
            ( usingClause[attrs] )?
              {
@@ -487,13 +488,15 @@ insertStatement returns [::shared_ptr<raw::modification_statement> expr]
              }
        | K_JSON
          json_token=jsonValue { json_value = $json_token.value; }
+            ( K_DEFAULT K_UNSET { default_unset = true; } | K_DEFAULT K_NULL )?
            ( K_IF K_NOT K_EXISTS { if_not_exists = true; } )?
            ( usingClause[attrs] )?
              {
              $expr = ::make_shared<raw::insert_json_statement>(std::move(cf),
                                                       std::move(attrs),
                                                       std::move(json_value),
-                                                       if_not_exists);
+                                                       if_not_exists,
+                                                       default_unset);
              }
        )
    ;
@@ -1531,12 +1534,22 @@ inMarkerForTuple returns [shared_ptr<cql3::tuples::in_raw> marker]
    | ':' name=ident { $marker = new_tuple_in_bind_variables(name); }
    ;

-comparatorType returns [shared_ptr<cql3_type::raw> t]
-    : n=native_type     { $t = cql3_type::raw::from(n); }
-    | c=collection_type { $t = c; }
-    | tt=tuple_type     { $t = tt; }
+// The comparator_type rule is used for users' queries (internal=false)
+// and for internal calls from db::cql_type_parser::parse() (internal=true).
+// The latter is used for reading schemas stored in the system tables, and
+// may support additional column types that cannot be created through CQL,
+// but only internally through code. Today the only such type is "empty":
+// Scylla code internally creates columns with type "empty" or collections
+// of "empty" to represent unselected columns in materialized views.
+// If a user (internal=false) tries to use "empty" as a type, it is treated -
+// as are all unknown types - as an attempt to use a user-defined type, and
+// we report that this name is reserved (as for _reserved_type_names()).
+comparator_type [bool internal] returns [shared_ptr<cql3_type::raw> t]
+    : n=native_or_internal_type[internal]     { $t = cql3_type::raw::from(n); }
+    | c=collection_type[internal]   { $t = c; }
+    | tt=tuple_type[internal]       { $t = tt; }
    | id=userTypeName   { $t = cql3::cql3_type::raw::user_type(id); }
-    | K_FROZEN '<' f=comparatorType '>'
+    | K_FROZEN '<' f=comparator_type[internal] '>'
      {
        try {
            $t = cql3::cql3_type::raw::frozen(f);
@@ -1558,6 +1571,22 @@ comparatorType returns [shared_ptr<cql3_type::raw> t]
 #endif
    ;

+native_or_internal_type [bool internal] returns [shared_ptr<cql3_type> t]
+    : n=native_type     { $t = n; }
+    // The "internal" types, only supported when internal==true:
+    | K_EMPTY   {
+        if (internal) {
+            $t = cql3_type::empty;  // internal callers get the built-in empty type
+        } else {
+            add_recognition_error("Invalid (reserved) user type name empty");  // the result is left unassigned on this path
+        }
+      }
+    ;
+
+comparatorType returns [shared_ptr<cql3_type::raw> t]  // wrapper: parses a type via comparator_type with internal=false
+    : tt=comparator_type[false]    { $t = tt; }
+    ;
+
 native_type returns [shared_ptr<cql3_type> t]
    : K_ASCII     { $t = cql3_type::ascii; }
    | K_BIGINT    { $t = cql3_type::bigint; }
@@ -1582,24 +1611,24 @@ native_type returns [shared_ptr<cql3_type> t]
    | K_TIME      { $t = cql3_type::time; }
    ;

-collection_type returns [shared_ptr<cql3::cql3_type::raw> pt]
-    : K_MAP  '<' t1=comparatorType ',' t2=comparatorType '>'
+collection_type [bool internal] returns [shared_ptr<cql3::cql3_type::raw> pt]  // the internal flag is forwarded to every element type below
+    : K_MAP  '<' t1=comparator_type[internal] ',' t2=comparator_type[internal] '>'
        {
            // if we can't parse either t1 or t2, antlr will "recover" and we may have t1 or t2 null.
            if (t1 && t2) {
                $pt = cql3::cql3_type::raw::map(t1, t2);
            }
        }
-    | K_LIST '<' t=comparatorType '>'
+    | K_LIST '<' t=comparator_type[internal] '>'
        { if (t) { $pt = cql3::cql3_type::raw::list(t); } }
-    | K_SET  '<' t=comparatorType '>'
+    | K_SET  '<' t=comparator_type[internal] '>'
        { if (t) { $pt = cql3::cql3_type::raw::set(t); } }
    ;

-tuple_type returns [shared_ptr<cql3::cql3_type::raw> t]
+tuple_type [bool internal] returns [shared_ptr<cql3::cql3_type::raw> t]  // the internal flag is forwarded to every component type
        @init{ std::vector<shared_ptr<cql3::cql3_type::raw>> types; }
    : K_TUPLE '<'
-         t1=comparatorType { types.push_back(t1); } (',' tn=comparatorType { types.push_back(tn); })*
+         t1=comparator_type[internal] { types.push_back(t1); } (',' tn=comparator_type[internal] { types.push_back(tn); })*
      '>' { $t = cql3::cql3_type::raw::tuple(std::move(types)); }
    ;

@@ -1625,7 +1654,7 @@ unreserved_keyword returns [sstring str]

 unreserved_function_keyword returns [sstring str]
    : u=basic_unreserved_keyword { $str = u; }
-    | t=native_type              { $str = t->to_string(); }
+    | t=native_or_internal_type[true]   { $str = t->to_string(); }  // internal=true: K_EMPTY is accepted here without a recognition error
    ;

 basic_unreserved_keyword returns [sstring str]
@@ -1809,6 +1838,10 @@ K_OR:          O R;
 K_REPLACE:     R E P L A C E;
 K_DETERMINISTIC: D E T E R M I N I S T I C;
 K_JSON:        J S O N;
+K_DEFAULT:     D E F A U L T;
+K_UNSET:       U N S E T;
+
+K_EMPTY:       E M P T Y;

 K_SCYLLA_TIMEUUID_LIST_INDEX: S C Y L L A '_' T I M E U U I D '_' L I S T '_' I N D E X;
 K_SCYLLA_COUNTER_SHARD_LIST: S C Y L L A '_' C O U N T E R '_' S H A R D '_' L I S T; 
--- a/cql3/attributes.cc
+++ b/cql3/attributes.cc
@@ -77,12 +77,14 @@ int64_t attributes::get_timestamp(int64_t now, const query_options& options) {
    if (tval.is_unset_value()) {
        return now;
    }
+  return with_linearized(*tval, [] (bytes_view val) {
    try {
-        data_type_for<int64_t>()->validate(*tval);
+        data_type_for<int64_t>()->validate(val);
    } catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception("Invalid timestamp value");
    }
-    return value_cast<int64_t>(data_type_for<int64_t>()->deserialize(*tval));
+    return value_cast<int64_t>(data_type_for<int64_t>()->deserialize(val));
+  });
 }

 int32_t attributes::get_time_to_live(const query_options& options) {
@@ -96,14 +98,16 @@ int32_t attributes::get_time_to_live(const query_options& options) {
    if (tval.is_unset_value()) {
        return 0;
    }
+  auto ttl = with_linearized(*tval, [] (bytes_view val) {
    try {
-        data_type_for<int32_t>()->validate(*tval);
+        data_type_for<int32_t>()->validate(val);
    }
    catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception("Invalid TTL value");
    }

-    auto ttl = value_cast<int32_t>(data_type_for<int32_t>()->deserialize(*tval));
+    return value_cast<int32_t>(data_type_for<int32_t>()->deserialize(val));
+  });
    if (ttl < 0) {
        throw exceptions::invalid_request_exception("A TTL must be greater or equal to 0");
    }
--- a/cql3/column_identifier.cc
+++ b/cql3/column_identifier.cc
@@ -127,7 +127,11 @@ column_identifier::new_selector_factory(database& db, schema_ptr schema, std::ve
    if (!def) {
        throw exceptions::invalid_request_exception(sprint("Undefined name %s in selection clause", _text));
    }
-
+    // Do not allow explicitly selecting hidden columns. We also skip them on
+    // "SELECT *" (see selection::wildcard()).
+    if (def->is_view_virtual()) {
+        throw exceptions::invalid_request_exception(sprint("Undefined name %s in selection clause", _text));
+    }
    return selection::simple_selector::new_factory(def->name_as_text(), add_and_get_index(*def, defs), def->type);
 }

--- a/cql3/constants.hh
+++ b/cql3/constants.hh
@@ -225,7 +225,9 @@ public:
            } else if (value.is_unset_value()) {
                return;
            }
-            auto increment = value_cast<int64_t>(long_type->deserialize_value(*value));
+            auto increment = with_linearized(*value, [] (bytes_view value_view) {
+                return value_cast<int64_t>(long_type->deserialize_value(value_view));
+            });
            m.set_cell(prefix, column, make_counter_update_cell(increment, params));
        }
    };
@@ -240,7 +242,9 @@ public:
            } else if (value.is_unset_value()) {
                return;
            }
-            auto increment = value_cast<int64_t>(long_type->deserialize_value(*value));
+            auto increment = with_linearized(*value, [] (bytes_view value_view) {
+                return value_cast<int64_t>(long_type->deserialize_value(value_view));
+            });
            if (increment == std::numeric_limits<int64_t>::min()) {
                throw exceptions::invalid_request_exception(sprint("The negation of %d overflows supported counter precision (signed 8 bytes integer)", increment));
            }
--- a/cql3/error_collector.hh
+++ b/cql3/error_collector.hh
@@ -67,6 +67,12 @@ class error_collector : public error_listener<RecognizerType, ExceptionBaseType>
     */
    const sstring_view _query;

+    /**
+     * Empty bitset list installed as an exception's expecting-set when that set is
+     * null, working around an ANTLR null-dereference bug (antlr/antlr3 issue 191).
+     */
+    static typename ExceptionBaseType::BitsetListType _empty_bit_list;
+
 public:

    /**
@@ -144,6 +150,14 @@ private:
            break;
        }
        default:
+            // AntLR Exception class has a bug of dereferencing a null
+            // pointer in the displayRecognitionError. The following
+            // if statement makes sure it will not be null before the
+            // call to that function (displayRecognitionError).
+            // bug reference: https://github.com/antlr/antlr3/issues/191
+            if (!ex->get_expectingSet()) {
+                ex->set_expectingSet(&_empty_bit_list);
+            }
            ex->displayRecognitionError(token_names, msg);
        }
        return msg.str();
@@ -345,4 +359,8 @@ private:
 #endif
 };

+template<typename RecognizerType, typename TokenType, typename ExceptionBaseType>
+typename ExceptionBaseType::BitsetListType
+error_collector<RecognizerType,TokenType,ExceptionBaseType>::_empty_bit_list = typename ExceptionBaseType::BitsetListType();  // out-of-class definition of the static member, value-initialized
+
 }
--- a/cql3/functions/functions.cc
+++ b/cql3/functions/functions.cc
@@ -177,7 +177,7 @@ shared_ptr<function>
 make_to_json_function(data_type t) {
    return make_native_scalar_function<true>("tojson", utf8_type, {t},
            [t](cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
-        return utf8_type->decompose(t->to_json_string(parameters[0].value()));
+        return utf8_type->decompose(t->to_json_string(parameters[0]));  // the optional argument is forwarded as-is instead of being unwrapped with .value()
    });
 }

@@ -461,9 +461,9 @@ function_call::make_terminal(shared_ptr<function> fun, cql3::raw_value result, c
    }

    auto ctype = static_pointer_cast<const collection_type_impl>(fun->return_type());
-    bytes_view res;
+    fragmented_temporary_buffer::view res;
    if (result) {
-        res = *result;
+        res = fragmented_temporary_buffer::view(bytes_view(*result));
    }
    if (&ctype->_kind == &collection_type_impl::kind::list) {
        return make_shared(lists::value::from_serialized(std::move(res), static_pointer_cast<const list_type_impl>(ctype), sf));
--- a/cql3/lists.cc
+++ b/cql3/lists.cc
@@ -115,11 +115,12 @@ lists::literal::to_string() const {
 }

 lists::value
-lists::value::from_serialized(bytes_view v, list_type type, cql_serialization_format sf) {
+lists::value::from_serialized(const fragmented_temporary_buffer::view& val, list_type type, cql_serialization_format sf) {
    try {
        // Collections have this small hack that validate cannot be called on a serialized object,
        // but compose does the validation (so we're fine).
        // FIXME: deserializeForNativeProtocol()?!
+      return with_linearized(val, [&] (bytes_view v) {
        auto l = value_cast<list_type_impl::native_type>(type->deserialize(v, sf));
        std::vector<bytes_opt> elements;
        elements.reserve(l.size());
@@ -128,6 +129,7 @@ lists::value::from_serialized(bytes_view v, list_type type, cql_serialization_fo
            elements.push_back(element.is_null() ? bytes_opt() : bytes_opt(type->get_elements_type()->decompose(element)));
        }
        return value(std::move(elements));
+      });
    } catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception(e.what());
    }
@@ -285,7 +287,9 @@ lists::setter_by_index::execute(mutation& m, const clustering_key_prefix& prefix
        return;
    }

-    auto idx = net::ntoh(int32_t(*unaligned_cast<int32_t>(index->begin())));
+    auto idx = with_linearized(*index, [] (bytes_view v) {
+        return value_cast<int32_t>(data_type_for<int32_t>()->deserialize(v));
+    });
    auto&& existing_list_opt = params.get_prefetched_list(m.key().view(), prefix.view(), column);
    if (!existing_list_opt) {
        throw exceptions::invalid_request_exception("Attempted to set an element on a list which is null");
--- a/cql3/lists.hh
+++ b/cql3/lists.hh
@@ -79,7 +79,7 @@ public:
        explicit value(std::vector<bytes_opt> elements)
            : _elements(std::move(elements)) {
        }
-        static value from_serialized(bytes_view v, list_type type, cql_serialization_format sf);
+        static value from_serialized(const fragmented_temporary_buffer::view& v, list_type type, cql_serialization_format sf);
        virtual cql3::raw_value get(const query_options& options) override;
        virtual bytes get_with_protocol_version(cql_serialization_format sf) override;
        bool equals(shared_ptr<list_type_impl> lt, const value& v);
--- a/cql3/maps.cc
+++ b/cql3/maps.cc
@@ -152,18 +152,20 @@ maps::literal::to_string() const {
 }

 maps::value
-maps::value::from_serialized(bytes_view value, map_type type, cql_serialization_format sf) {
+maps::value::from_serialized(const fragmented_temporary_buffer::view& fragmented_value, map_type type, cql_serialization_format sf) {
    try {
        // Collections have this small hack that validate cannot be called on a serialized object,
        // but compose does the validation (so we're fine).
        // FIXME: deserialize_for_native_protocol?!
+      return with_linearized(fragmented_value, [&] (bytes_view value) {
        auto m = value_cast<map_type_impl::native_type>(type->deserialize(value, sf));
        std::map<bytes, bytes, serialized_compare> map(type->get_keys_type()->as_less_comparator());
        for (auto&& e : m) {
            map.emplace(type->get_keys_type()->decompose(e.first),
                        type->get_values_type()->decompose(e.second));
        }
-        return { std::move(map) };
+        return maps::value { std::move(map) };
+      });
    } catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception(e.what());
    }
@@ -233,10 +235,10 @@ maps::delayed_value::bind(const query_options& options) {
        if (key_bytes.is_unset_value()) {
            throw exceptions::invalid_request_exception("unset value is not supported inside collections");
        }
-        if (key_bytes->size() > std::numeric_limits<uint16_t>::max()) {
+        if (key_bytes->size_bytes() > std::numeric_limits<uint16_t>::max()) {
            throw exceptions::invalid_request_exception(sprint("Map key is too long. Map keys are limited to %d bytes but %d bytes keys provided",
                                                   std::numeric_limits<uint16_t>::max(),
-                                                   key_bytes->size()));
+                                                   key_bytes->size_bytes()));
        }
        auto value_bytes = value->bind_and_get(options);
        if (value_bytes.is_null()) {
@@ -331,7 +333,7 @@ maps::do_put(mutation& m, const clustering_key_prefix& prefix, const update_para

        auto ctype = static_pointer_cast<const map_type_impl>(column.type);
        for (auto&& e : map_value->map) {
-            mut.cells.emplace_back(e.first, params.make_cell(*ctype->get_values_type(), e.second, atomic_cell::collection_member::yes));
+            mut.cells.emplace_back(e.first, params.make_cell(*ctype->get_values_type(), fragmented_temporary_buffer::view(e.second), atomic_cell::collection_member::yes));
        }
        auto col_mut = ctype->serialize_mutation_form(std::move(mut));
        m.set_cell(prefix, column, std::move(col_mut));
@@ -342,7 +344,7 @@ maps::do_put(mutation& m, const clustering_key_prefix& prefix, const update_para
        } else {
            auto v = map_type_impl::serialize_partially_deserialized_form({map_value->map.begin(), map_value->map.end()},
                    cql_serialization_format::internal());
-            m.set_cell(prefix, column, params.make_cell(*column.type, std::move(v)));
+            m.set_cell(prefix, column, params.make_cell(*column.type, fragmented_temporary_buffer::view(std::move(v))));
        }
    }
 }
--- a/cql3/maps.hh
+++ b/cql3/maps.hh
@@ -81,7 +81,7 @@ public:
        value(std::map<bytes, bytes, serialized_compare> map)
            : map(std::move(map)) {
        }
-        static value from_serialized(bytes_view value, map_type type, cql_serialization_format sf);
+        static value from_serialized(const fragmented_temporary_buffer::view& value, map_type type, cql_serialization_format sf);
        virtual cql3::raw_value get(const query_options& options) override;
        virtual bytes get_with_protocol_version(cql_serialization_format sf);
        bool equals(map_type mt, const value& v);
--- a/cql3/operation.hh
+++ b/cql3/operation.hh
@@ -92,6 +92,10 @@ public:
    }

     static atomic_cell make_cell(const abstract_type& type, bytes_view value, const update_parameters& params) {
+        return params.make_cell(type, fragmented_temporary_buffer::view(value));  // wrap the contiguous bytes in a fragmented view before delegating
+    }
+
+    static atomic_cell make_cell(const abstract_type& type, const fragmented_temporary_buffer::view& value, const update_parameters& params) {  // overload for values already held as a fragmented view
        return params.make_cell(type, value);
    }

--- a/cql3/query_options.cc
+++ b/cql3/query_options.cc
@@ -135,79 +135,32 @@ query_options::query_options(std::vector<cql3::raw_value> values)
          db::consistency_level::ONE, infinite_timeout_config, std::move(values))
 {}

-db::consistency_level query_options::get_consistency() const
-{
-    return _consistency;
-}
-
-cql3::raw_value_view query_options::get_value_at(size_t idx) const
-{
-    return _value_views.at(idx);
-}
-
-size_t query_options::get_values_count() const
-{
-    return _value_views.size();
-}
-
 cql3::raw_value_view query_options::make_temporary(cql3::raw_value value) const
 {
    if (value) {
-        _temporaries.emplace_back(value->begin(), value->end());
-        auto& temporary = _temporaries.back();
-        return cql3::raw_value_view::make_value(bytes_view{temporary.data(), temporary.size()});
+        auto value_view = *value;
+        auto ptr = _temporaries.write_place_holder(value_view.size());  // NOTE(review): presumably reserves value_view.size() contiguous bytes inside _temporaries - confirm
+        std::copy_n(value_view.data(), value_view.size(), ptr);  // copy the value's bytes into the reserved space
+        return cql3::raw_value_view::make_value(fragmented_temporary_buffer::view(bytes_view{ptr, value_view.size()}));  // the view points at the copy in _temporaries, not at the by-value parameter
    }
    return cql3::raw_value_view::make_null();
 }

-bool query_options::skip_metadata() const
+bytes_view query_options::linearize(fragmented_temporary_buffer::view view) const  // returns one contiguous bytes_view covering the bytes of a possibly fragmented view
 {
-    return _skip_metadata;
-}
-
-int32_t query_options::get_page_size() const
-{
-    return get_specific_options().page_size;
-}
-
-::shared_ptr<service::pager::paging_state> query_options::get_paging_state() const
-{
-    return get_specific_options().state;
-}
-
-std::experimental::optional<db::consistency_level> query_options::get_serial_consistency() const
-{
-    return get_specific_options().serial_consistency;
-}
-
-api::timestamp_type query_options::get_timestamp(service::query_state& state) const
-{
-    auto tstamp = get_specific_options().timestamp;
-    return tstamp != api::missing_timestamp ? tstamp : state.get_timestamp();
-}
-
-int query_options::get_protocol_version() const
-{
-    return _cql_serialization_format.protocol_version();
-}
-
-cql_serialization_format query_options::get_cql_serialization_format() const
-{
-    return _cql_serialization_format;
-}
-
-const query_options::specific_options& query_options::get_specific_options() const
-{
-    return _options;
-}
-
-const query_options& query_options::for_statement(size_t i) const
-{
-    if (!_batch_options) {
-        // No per-statement options supplied, so use the "global" options
-        return *this;
+    if (view.empty()) {
+        return { };  // nothing to linearize
+    } else if (std::next(view.begin()) == view.end()) {  // exactly one fragment
+        return *view.begin();  // already contiguous: return that fragment itself, no copy
+    } else {
+        auto ptr = _temporaries.write_place_holder(view.size_bytes());  // NOTE(review): presumably reserves size_bytes() contiguous bytes in _temporaries - confirm
+        auto dst = ptr;
+        using boost::range::for_each;
+        for_each(view, [&] (bytes_view bv) {
+            dst = std::copy(bv.begin(), bv.end(), dst);  // append each fragment in iteration order
+        });
+        return bytes_view(ptr, view.size_bytes());  // view over the copy stored in _temporaries
    }
-    return _batch_options->at(i);
 }

 void query_options::prepare(const std::vector<::shared_ptr<column_specification>>& specs)
@@ -217,29 +170,24 @@ void query_options::prepare(const std::vector<::shared_ptr<column_specification>
    }

    auto& names = *_names;
-    std::vector<cql3::raw_value> ordered_values;
+    std::vector<cql3::raw_value_view> ordered_values;
    ordered_values.reserve(specs.size());
    for (auto&& spec : specs) {
        auto& spec_name = spec->name->text();
        for (size_t j = 0; j < names.size(); j++) {
            if (names[j] == spec_name) {
-                ordered_values.emplace_back(_values[j]);
+                ordered_values.emplace_back(_value_views[j]);
                break;
            }
        }
    }
-    _values = std::move(ordered_values);
-    fill_value_views();
+    _value_views = std::move(ordered_values);
 }

 void query_options::fill_value_views()
 {
    for (auto&& value : _values) {
-        if (value) {
-            _value_views.emplace_back(cql3::raw_value_view::make_value(bytes_view{*value}));
-        } else {
-            _value_views.emplace_back(cql3::raw_value_view::make_null());
-        }
+        _value_views.emplace_back(value.to_view());  // one view per entry of _values, appended in order; the conversion is delegated to raw_value::to_view()
    }
 }

--- a/cql3/query_options.hh
+++ b/cql3/query_options.hh
@@ -75,7 +75,7 @@ private:
    const std::experimental::optional<std::vector<sstring_view>> _names;
    std::vector<cql3::raw_value> _values;
    std::vector<cql3::raw_value_view> _value_views;
-    mutable std::vector<std::vector<int8_t>> _temporaries;
+    mutable bytes_ostream _temporaries;  // backing storage for the views handed out by make_temporary() and linearize()
    const bool _skip_metadata;
    const specific_options _options;
    cql_serialization_format _cql_serialization_format;
@@ -156,33 +156,76 @@ public:
            std::vector<cql3::raw_value> values, specific_options options = specific_options::DEFAULT);
    explicit query_options(std::unique_ptr<query_options>, ::shared_ptr<service::pager::paging_state> paging_state);

-    db::consistency_level get_consistency() const;
    const timeout_config& get_timeout_config() const { return _timeout_config; }
-    cql3::raw_value_view get_value_at(size_t idx) const;
+
+    db::consistency_level get_consistency() const {  // returns the stored _consistency member
+        return _consistency;
+    }
+
+    cql3::raw_value_view get_value_at(size_t idx) const {
+        return _value_views.at(idx);  // std::vector::at(): an out-of-range idx throws std::out_of_range
+    }
+
+    size_t get_values_count() const {
+        return _value_views.size();  // counts the value views, the same container get_value_at() indexes
+    }
+
    cql3::raw_value_view make_temporary(cql3::raw_value value) const;
-    size_t get_values_count() const;
-    bool skip_metadata() const;
-    /**  The pageSize for this query. Will be <= 0 if not relevant for the query.  */
-    int32_t get_page_size() const;
+    bytes_view linearize(fragmented_temporary_buffer::view) const;
+
+    bool skip_metadata() const {  // returns the const _skip_metadata member
+        return _skip_metadata;
+    }
+
+    int32_t get_page_size() const {  // The page size for this query. Will be <= 0 if not relevant for the query.
+        return get_specific_options().page_size;
+    }
+
     /** The paging state for this query, or null if not relevant. */
-    ::shared_ptr<service::pager::paging_state> get_paging_state() const;
+    ::shared_ptr<service::pager::paging_state> get_paging_state() const {
+        return get_specific_options().state;  // read from the options returned by get_specific_options()
+    }
+
     /**  Serial consistency for conditional updates. */
-    std::experimental::optional<db::consistency_level> get_serial_consistency() const;
+    std::experimental::optional<db::consistency_level> get_serial_consistency() const {
+        return get_specific_options().serial_consistency;  // read from the options returned by get_specific_options()
+    }
+
+    api::timestamp_type get_timestamp(service::query_state& state) const {
+        auto tstamp = get_specific_options().timestamp;
+        return tstamp != api::missing_timestamp ? tstamp : state.get_timestamp();  // use the options' timestamp unless it is the missing marker, else ask the query state
+    }
+
+    /**
+     * The protocol version for the query. Will be 3 if the object doesn't come from
+     * a native protocol request (i.e. it's been allocated locally or by CQL-over-thrift).
+     */
+    int get_protocol_version() const {
+        return _cql_serialization_format.protocol_version();
+    }
+
+    cql_serialization_format get_cql_serialization_format() const {  // returns a copy of the _cql_serialization_format member
+        return _cql_serialization_format;
+    }
+
+    const query_options::specific_options& get_specific_options() const {  // returns a reference to the const _options member
+        return _options;
+    }
+
+    // Options to use for statement i; mainly for the sake of BatchQueryOptions
+    const query_options& for_statement(size_t i) const {
+        if (!_batch_options) {
+            // No per-statement options supplied, so use the "global" options
+            return *this;
+        }
+        return _batch_options->at(i);  // NOTE(review): at() presumably bounds-checks i; the type of _batch_options is not visible here
+    }
+

    const std::experimental::optional<std::vector<sstring_view>>& get_names() const noexcept {
        return _names;
    }

-    api::timestamp_type get_timestamp(service::query_state& state) const;
-    /**
-     * The protocol version for the query. Will be 3 if the object don't come from
-     * a native protocol request (i.e. it's been allocated locally or by CQL-over-thrift).
-     */
-    int get_protocol_version() const;
-    cql_serialization_format get_cql_serialization_format() const;
-    // Mainly for the sake of BatchQueryOptions
-    const specific_options& get_specific_options() const;
-    const query_options& for_statement(size_t i) const;
    void prepare(const std::vector<::shared_ptr<column_specification>>& specs);
 private:
    void fill_value_views();
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -243,7 +243,17 @@ query_processor::query_processor(service::storage_proxy& proxy, distributed<data
                    sm::make_gauge(
                            "user_prepared_auth_cache_footprint",
                            [this] { return _authorized_prepared_cache.memory_footprint(); },
-                            sm::description("Size (in bytes) of the authenticated prepared statements cache."))
+                            sm::description("Size (in bytes) of the authenticated prepared statements cache.")),
+
+                    sm::make_counter(
+                            "reverse_queries",
+                            _cql_stats.reverse_queries,
+                            sm::description("Counts number of CQL SELECT requests with ORDER BY DESC.")),
+
+                    sm::make_counter(
+                            "unpaged_select_queries",
+                            _cql_stats.unpaged_select_queries,
+                            sm::description("Counts number of unpaged CQL SELECT requests.")),

            });

@@ -263,11 +273,11 @@ query_processor::process(const sstring_view& query_string, service::query_state&
    log.trace("process: \"{}\"", query_string);
    tracing::trace(query_state.get_trace_state(), "Parsing a statement");
    auto p = get_statement(query_string, query_state.get_client_state());
-    options.prepare(p->bound_names);
    auto cql_statement = p->statement;
    if (cql_statement->get_bound_terms() != options.get_values_count()) {
        throw exceptions::invalid_request_exception("Invalid amount of bind variables");
    }
+    options.prepare(p->bound_names);

    warn(unimplemented::cause::METRICS);
 #if 0
--- a/cql3/restrictions/multi_column_restriction.hh
+++ b/cql3/restrictions/multi_column_restriction.hh
@@ -45,12 +45,16 @@
 #include "cql3/statements/request_validations.hh"
 #include "cql3/restrictions/primary_key_restrictions.hh"
 #include "cql3/statements/request_validations.hh"
+#include "cql3/restrictions/single_column_primary_key_restrictions.hh"

 namespace cql3 {

 namespace restrictions {

 class multi_column_restriction : public primary_key_restrictions<clustering_key_prefix> {
+private:
+    bool _has_only_asc_columns;  // set by update_asc_desc_existence(): no restricted column has a reversed type
+    bool _has_only_desc_columns;  // set by update_asc_desc_existence(): every restricted column has a reversed type
 protected:
    schema_ptr _schema;
    std::vector<const column_definition*> _column_defs;
@@ -58,7 +62,9 @@ public:
     multi_column_restriction(schema_ptr schema, std::vector<const column_definition*>&& defs)
        : _schema(schema)
        , _column_defs(std::move(defs))
-    { }
+    {
+        update_asc_desc_existence();  // initialise the asc/desc flags from the columns just stored in _column_defs
+    }

    virtual bool is_multi_column() const override {
        return true;
@@ -84,6 +90,7 @@ public:
            "Mixing single column relations and multi column relations on clustering columns is not allowed");
        auto as_pkr = static_pointer_cast<primary_key_restrictions<clustering_key_prefix>>(other);
        do_merge_with(as_pkr);
+        update_asc_desc_existence();
    }

    bool is_satisfied_by(const schema& schema,
@@ -140,6 +147,40 @@ protected:

    virtual bool is_supported_by(const secondary_index::index& index) const = 0;

+    /**
+     * @return true if the restricted columns include both ascending and
+     * descending (reversed-type) columns, false otherwise.
+     */
+    bool is_mixed_order() const {
+        return !is_desc_order() && !is_asc_order();  // neither all-descending nor all-ascending
+    }
+
+    /**
+     * @return true if all the restricted columns are ordered in descending
+     * order (every column type is reversed), false otherwise
+     */
+    bool is_desc_order() const {
+        return _has_only_desc_columns;
+    }
+
+    /**
+     * @return true if all the restricted columns are ordered in ascending
+     * order (no column type is reversed), false otherwise
+     */
+    bool is_asc_order() const {
+        return _has_only_asc_columns;
+    }
+
+private:
+    /**
+     * Recomputes _has_only_asc_columns and _has_only_desc_columns from _column_defs.
+     */
+    void update_asc_desc_existence() {
+        std::size_t num_of_desc =
+                std::count_if(_column_defs.begin(), _column_defs.end(),  [] (const column_definition* cd) { return cd->type->is_reversed(); });  // a column counts as descending when its type is reversed
+        _has_only_asc_columns = num_of_desc == 0;
+        _has_only_desc_columns = num_of_desc == _column_defs.size();  // with no columns at all, both flags end up true
+    }
 #if 0
    /**
     * Check if this type of restriction is supported for the specified column by the specified index.
@@ -385,6 +426,7 @@ protected:
 };

 class multi_column_restriction::slice final : public multi_column_restriction {
+    using restriction_shared_ptr = ::shared_ptr<primary_key_restrictions<clustering_key_prefix>>;
 private:
    term_slice _slice;

@@ -422,24 +464,11 @@ public:
    }

     virtual std::vector<bounds_range_type> bounds_ranges(const query_options& options) const override {
-        // FIXME: doesn't work properly with mixed CLUSTERING ORDER (CASSANDRA-7281)
-        auto read_bound = [&] (statements::bound b) -> std::experimental::optional<bounds_range_type::bound> {
-            if (!has_bound(b)) {
-                return {};
-            }
-            auto vals = component_bounds(b, options);
-            for (unsigned i = 0; i < vals.size(); i++) {
-                statements::request_validations::check_not_null(vals[i], "Invalid null value in condition for column %s", _column_defs.at(i)->name_as_text());
-            }
-            auto prefix = clustering_key_prefix::from_optional_exploded(*_schema, vals);
-            return bounds_range_type::bound(prefix, is_inclusive(b));
-        };
-        auto range = wrapping_range<clustering_key_prefix>(read_bound(statements::bound::START), read_bound(statements::bound::END));
-        auto bounds = bound_view::from_range(range);
-        if (bound_view::compare(*_schema)(bounds.second, bounds.first)) {
-            return { };
+        if (!is_mixed_order()) {  // the restricted columns are all ascending or all descending
+            return bounds_ranges_unified_order(options);
+        } else {  // both ascending and descending columns are restricted
+            return bounds_ranges_mixed_order(options);
        }
-        return { bounds_range_type(std::move(range)) };
    }
 #if 0
        @Override
@@ -514,6 +543,221 @@ private:
        auto value = static_pointer_cast<tuples::value>(_slice.bound(b)->bind(options));
        return value->get_elements();
    }
+
+    std::vector<bytes_opt> read_bound_components(const query_options& options, statements::bound b) const {
+        // An absent bound contributes no components at all.
+        if (!has_bound(b)) { return {}; }
+        auto components = component_bounds(b, options);
+        // Each value is handed to check_not_null together with its column's name for the error text.
+        for (unsigned idx = 0; idx < components.size(); ++idx) {
+            statements::request_validations::check_not_null(components[idx], "Invalid null value in condition for column %s", _column_defs.at(idx)->name_as_text());
+        }
+        return components;
+    }
+
+    /**
+     * Computes the ranges when every clustering column shares the same ordering,
+     * in which case the tuple bounds map onto at most one clustering-prefix range.
+     * @param options the query options
+     * @return the vector of ranges for the restriction
+     */
+    std::vector<bounds_range_type> bounds_ranges_unified_order(const query_options& options) const {
+        const auto lower = statements::bound::START;
+        const auto upper = statements::bound::END;
+        // Build both limits from the validated bound components.
+        auto lower_prefix = clustering_key_prefix::from_optional_exploded(*_schema, read_bound_components(options, lower));
+        auto lower_limit = bounds_range_type::bound(std::move(lower_prefix), is_inclusive(lower));
+        auto upper_prefix = clustering_key_prefix::from_optional_exploded(*_schema, read_bound_components(options, upper));
+        auto upper_limit = bounds_range_type::bound(std::move(upper_prefix), is_inclusive(upper));
+        // When the order is not ascending the two limits trade places.
+        auto range = is_asc_order()
+                ? bounds_range_type::make(lower_limit, upper_limit)
+                : bounds_range_type::make(upper_limit, lower_limit);
+        auto edges = bound_view::from_range(range);
+        // A range whose end compares before its start is reported as no ranges at all.
+        if (bound_view::compare(*_schema)(edges.second, edges.first)) {
+            return { };
+        }
+        return { std::move(range) };
+    }
+
+    /**
+     * Retrieve the bounds when clustering columns are mixed order
+     * (contains ASC and DESC together).
+     * Having mixed order implies that a prefix search can't take place,
+     * instead, the bounds have to be broken down to separate prefix searchable
+     * ranges such that their combination is equivalent to the original range.
+     * @param options the query options
+     * @return the vector of ranges for the restriction
+     */
+    std::vector<bounds_range_type> bounds_ranges_mixed_order(const query_options& options) const {
+        std::vector<bounds_range_type> ret_ranges;
+        auto mixed_order_restrictions = build_mixed_order_restriction_set(options);
+        ret_ranges.reserve(mixed_order_restrictions.size()); // initial guess only: a restriction may yield several ranges
+        for (const auto& r : mixed_order_restrictions) { // by reference: no shared pointer copy per restriction
+            for (auto&& range : r->bounds_ranges(options)) {
+                ret_ranges.emplace_back(std::move(range));
+            }
+        }
+        return ret_ranges;
+    }
+
+    /**
+     * The function returns the first real inequality component.
+     * The first real inequality is the index of the first component in the
+     * tuple that will turn into a slice single column restriction.
+     * For example: (a, b, c) > (0, 1, 2) and (a, b, c) < (0, 1, 5) will be
+     * broken into one single column restriction set of the form:
+     * a = 0 and b = 1 and c > 2 and c < 5 , c is the first element that has
+     * inequality so for this case the function will return 2.
+     * @param start_components - the components of the start tuple range.
+     * @param end_components - the components of the end tuple range.
+     * @return the index of the first component that yields an inequality (the length of the
+     * shorter tuple when it is a strict prefix of the other), or an empty value if the tuples are equal.
+     */
+    std::optional<std::size_t> find_first_neq_component(const std::vector<bytes_opt>& start_components,
+                                                        const std::vector<bytes_opt>& end_components) const {
+        const size_t common_components_count = std::min(start_components.size(), end_components.size());
+        for (size_t i = 0; i < common_components_count ; i++) {
+            if (start_components[i] != end_components[i]) { // compares the optionals themselves: no throw on a null component
+                return i;
+            }
+        }
+
+        const size_t max_components_count = std::max(start_components.size(), end_components.size());
+        if (common_components_count < max_components_count) {
+            return common_components_count;
+        } else {
+            return std::nullopt;
+        }
+    }
+
+    /**
+     * Builds a restriction on one column: an equality when no bound is given,
+     * otherwise a slice on the requested side.
+     * @param bound - empty for an equality; START or END for the corresponding slice restriction.
+     * @param inclusive - whether the slice is inclusive (ignored for equality).
+     * @param column_pos - the position, within _column_defs, of the column to restrict
+     * @param value - the value to restrict the column with.
+     * @return a shared pointer to the just created restriction.
+     */
+    ::shared_ptr<restriction> make_single_column_restriction(std::optional<cql3::statements::bound> bound, bool inclusive,
+                                                             std::size_t column_pos,const bytes_opt& value) const {
+        ::shared_ptr<cql3::term> constant = ::make_shared<cql3::constants::value>(cql3::raw_value::make_value(value));
+        const column_definition& target = *_column_defs[column_pos];
+        if (bound) {
+            return ::make_shared<cql3::restrictions::single_column_restriction::slice>(target, *bound, inclusive, constant);
+        }
+        return ::make_shared<cql3::restrictions::single_column_restriction::EQ>(target, constant);
+    }
+
+    /**
+     * A helper function to create a single column restrictions set from a tuple relation on
+     * clustering keys.
+     * i.e.: (a,b,c) >= (0,1,2) will become:
+     *      1. a > 0
+     *      2. a = 0 and b > 1
+     *      3. a = 0 and b = 1 and c >= 2
+     * @param bound - determines if the operator is '>' (START) or '<' (END)
+     * @param bound_inclusive - determines whether to append equality to the operator, i.e. whether > becomes >=
+     * @param bound_values - the tuple values for the restriction
+     * @param first_neq_component - the first component that will have inequality.
+     *        For the example above, if this parameter is 1, only restrictions 2 and 3 will be created.
+     *        This parameter helps to facilitate the nuances of breaking more complex relations, for example when
+     *        there is a second condition limiting the other side of the bound,
+     *        i.e.: (a,b,c) >= (0,1,2) and (a,b,c) < (5,6,7); this will require each bound to use the parameter.
+     * @return the single column restriction set built according to the above parameters.
+     */
+    std::vector<restriction_shared_ptr> make_single_bound_restrictions(statements::bound bound, bool bound_inclusive,
+                                                                       std::vector<bytes_opt>& bound_values,
+                                                                       std::size_t first_neq_component) const{
+        std::vector<restriction_shared_ptr> ret;
+        std::size_t num_of_restrictions = bound_values.size() - first_neq_component; // NOTE(review): unsigned subtraction - relies on first_neq_component <= bound_values.size()
+        ret.reserve(num_of_restrictions);
+        for (std::size_t i = 0;i < num_of_restrictions ; i++) {
+            ret.emplace_back(::make_shared<cql3::restrictions::single_column_primary_key_restrictions<clustering_key>>(_schema, false));
+            std::size_t neq_component_idx = first_neq_component + i; // column carrying the inequality in the i-th restriction
+            for (std::size_t j = 0;j < neq_component_idx; j++) { // equalities on every column before the inequality
+                ret[i]->merge_with(make_single_column_restriction(std::nullopt, false, j, bound_values[j]));
+            }
+            bool inclusive = (i == (num_of_restrictions-1)) && bound_inclusive; // only the last restriction may be inclusive
+            ret[i]->merge_with(make_single_column_restriction(bound, inclusive, neq_component_idx, bound_values[neq_component_idx]));
+        }
+        return ret;
+    }
+
+    /**
+     * Builds and returns a set of restrictions such that the union of their ranges (the restrictions OR-ed together)
+     * is logically identical to this restriction, with the additional property that it can execute
+     * correctly when the clustering columns have "mixed order", i.e. contain both ASC and DESC orderings.
+     * For more information: https://github.com/scylladb/scylla/issues/2050
+     * @param options - the query options
+     * @return a set of restrictions whose union of ranges is logically identical to this restriction.
+     */
+    std::vector<::shared_ptr<primary_key_restrictions<clustering_key_prefix>>>
+    build_mixed_order_restriction_set(const query_options& options) const {
+        std::vector<restriction_shared_ptr> ret;
+        auto start_components = read_bound_components(options, statements::bound::START);
+        auto end_components = read_bound_components(options, statements::bound::END);
+        bool start_inclusive = is_inclusive(statements::bound::START);
+        bool end_inclusive = is_inclusive(statements::bound::END);
+        std::optional<std::size_t> first_neq_component = std::nullopt;
+
+        // find the index of the first component that is not equal between the tuples.
+        if (start_components.empty() || end_components.empty()) {
+            first_neq_component = 0; // a side with no components differs from the other at index 0
+        } else {
+            auto tuple_mismatch = std::mismatch(start_components.begin(), start_components.end(),
+                    end_components.begin(), end_components.end());
+            if ((tuple_mismatch.first != start_components.end()) ||
+                (tuple_mismatch.second != end_components.end())) {
+                first_neq_component = std::distance(start_components.begin(), tuple_mismatch.first);
+            }
+        }
+
+        // with no differing component, this is either a simple equality or a never fulfilled restriction
+        if (!first_neq_component && start_inclusive && end_inclusive) {
+            // This is a simple equality case
+            shared_ptr<cql3::term> term = ::make_shared<cql3::tuples::value>(start_components);
+            ret.emplace_back(::make_shared<cql3::restrictions::multi_column_restriction::EQ>(_schema, _column_defs, term));
+            return ret;
+        } else if (!first_neq_component) {
+            // This is a contradiction case
+            return {};
+        } else if ((*first_neq_component == end_components.size() && !end_inclusive ) ||
+                   (*first_neq_component == start_components.size() && !start_inclusive )) {
+            // This is a case where one bound is a prefix of the other. If this prefix bound
+            // is not inclusive the result will be an empty set.
+            return {};
+        }
+
+        bool start_components_exists = (start_components.size() - first_neq_component.value()) > 0;
+        bool end_components_exists = (end_components.size() - first_neq_component.value()) > 0;
+        bool both_components_exists = start_components_exists && end_components_exists;
+        if (start_components_exists) {
+            auto restrictions =
+                    make_single_bound_restrictions(statements::bound::START, start_inclusive, start_components, first_neq_component.value());
+            for (auto&& r : restrictions) {
+                ret.emplace_back(r);
+            }
+        }
+
+        if (end_components_exists) {
+            auto restrictions =
+                    make_single_bound_restrictions(statements::bound::END, end_inclusive,
+                            end_components, first_neq_component.value() + both_components_exists); // skips one END component when both sides exist
+            for (auto&& r : restrictions) {
+                ret.emplace_back(r);
+            }
+        }
+
+        if (both_components_exists) { // the END condition skipped above is merged into the first START restriction instead
+            bool inclusive = end_inclusive && ((end_components.size() - first_neq_component.value()) == 1);
+            ret[0]->merge_with(make_single_column_restriction(statements::bound::END, inclusive, first_neq_component.value(),
+                    end_components[first_neq_component.value()]));
+        }
+        return ret;
+    }
 };

 }
--- a/cql3/restrictions/primary_key_restrictions.hh
+++ b/cql3/restrictions/primary_key_restrictions.hh
@@ -88,6 +88,7 @@ public:

    using restrictions::uses_function;
    using restrictions::has_supporting_index;
+    using restrictions::values;

    bool empty() const override {
        return get_column_defs().empty();
@@ -99,6 +100,28 @@ public:
    bool has_unrestricted_components(const schema& schema) const;

    virtual bool needs_filtering(const schema& schema) const;
+
+    // How long a prefix of the restrictions could have resulted in
+    // needs_filtering() == false. These restrictions do not need to be
+    // applied during filtering.
+    // For example, if we have the filter "c1 < 3 and c2 > 3", c1 does
+    // not need filtering (just a read stopping at c1=3) but c2 does,
+    // so num_prefix_columns_that_need_not_be_filtered() will be 1.
+    virtual unsigned int num_prefix_columns_that_need_not_be_filtered() const {
+        return 0;
+    }
+
+    virtual bool is_all_eq() const {
+        return false; // base default; single_column_primary_key_restrictions overrides it
+    }
+    virtual size_t prefix_size() const {
+        return 0; // base default; single_column_primary_key_restrictions overrides it
+    }
+
+    size_t prefix_size(const schema_ptr schema) const {
+        return 0; // generic fallback; the clustering_key specialization does the actual counting
+    }
+
 };

 template<>
@@ -122,5 +145,23 @@ inline bool primary_key_restrictions<clustering_key>::needs_filtering(const sche
    return false;
 }

+template<>
+inline size_t primary_key_restrictions<clustering_key>::prefix_size(const schema_ptr schema) const {
+    // Length of the run of restricted columns (in get_column_defs() order) whose positions
+    // follow one another starting at the id of the first clustering column.
+    const auto& ck_columns = schema->clustering_key_columns();
+    if (ck_columns.empty()) { return 0; }
+    size_t matched = 0;
+    column_id wanted = ck_columns.begin()->id;
+    for (auto&& cdef : get_column_defs()) {
+        if (schema->position(*cdef) != wanted) {
+            break;
+        }
+        ++wanted;
+        ++matched;
+    }
+    return matched;
+}
+
 }
 }
--- a/cql3/restrictions/restrictions.hh
+++ b/cql3/restrictions/restrictions.hh
@@ -68,6 +68,10 @@ public:

    virtual std::vector<bytes_opt> values(const query_options& options) const = 0;

+    virtual bytes_opt value_for(const column_definition& cdef, const query_options& options) const {
+        throw exceptions::invalid_request_exception("Single value can be obtained from single-column restrictions only");
+    }
+
    /**
     * Returns <code>true</code> if one of the restrictions use the specified function.
     *
--- a/cql3/restrictions/single_column_primary_key_restrictions.hh
+++ b/cql3/restrictions/single_column_primary_key_restrictions.hh
@@ -49,6 +49,7 @@
 #include <boost/algorithm/cxx11/all_of.hpp>
 #include <boost/range/adaptor/transformed.hpp>
 #include <boost/range/adaptor/filtered.hpp>
+#include <boost/range/adaptor/map.hpp>

 namespace cql3 {

@@ -62,6 +63,8 @@ class single_column_primary_key_restrictions : public primary_key_restrictions<V
    using range_type = query::range<ValueType>;
    using range_bound = typename range_type::bound;
    using bounds_range_type = typename primary_key_restrictions<ValueType>::bounds_range_type;
+    template<typename OtherValueType>
+    friend class single_column_primary_key_restrictions;
 private:
    schema_ptr _schema;
    bool _allow_filtering;
@@ -79,6 +82,27 @@ public:
        , _in(false)
    { }

+    // Builds this restrictions type from another primary key restrictions type, possibly against a different schema
+    template<typename OtherValueType>
+    explicit single_column_primary_key_restrictions(schema_ptr schema, const single_column_primary_key_restrictions<OtherValueType>& other)
+        : _schema(schema)
+        , _allow_filtering(other._allow_filtering)
+        , _restrictions(::make_shared<single_column_restrictions>(schema))
+        , _slice(other._slice)
+        , _contains(other._contains)
+        , _in(other._in)
+    {
+        // Re-target every restriction of `other` at the column of this schema that
+        // carries the same name; a column missing from this schema is an error.
+        for (const auto& [source_cdef, source_restriction] : other._restrictions->restrictions()) {
+            const column_definition* target_cdef = _schema->get_column_definition(source_cdef->name());
+            if (target_cdef == nullptr) {
+                throw exceptions::invalid_request_exception(sprint("Base column %s not found in view index schema", source_cdef->name_as_text()));
+            }
+            _restrictions->add_restriction(source_restriction->apply_to(*target_cdef));
+        }
+    }
+
    virtual bool is_on_token() const override {
        return false;
    }
@@ -99,6 +123,10 @@ public:
        return _in;
    }

+    virtual bool is_all_eq() const override {
+        return _restrictions->is_all_eq();
+    }
+
    virtual bool has_bound(statements::bound b) const override {
        return boost::algorithm::all_of(_restrictions->restrictions(), [b] (auto&& r) { return r.second->has_bound(b); });
    }
@@ -137,6 +165,25 @@ public:
        _restrictions->add_restriction(restriction);
    }

+    virtual size_t prefix_size() const override {
+        return primary_key_restrictions<ValueType>::prefix_size(_schema);
+    }
+
+    ::shared_ptr<single_column_primary_key_restrictions<clustering_key>> get_longest_prefix_restrictions() {
+        static_assert(std::is_same_v<ValueType, clustering_key>, "Only clustering key can produce longest prefix restrictions");
+        const size_t prefix_len = prefix_size();
+        const auto& all_restrictions = _restrictions->restrictions();
+        if (prefix_len == all_restrictions.size()) { // every restriction belongs to the prefix: return this object
+            return dynamic_pointer_cast<single_column_primary_key_restrictions<clustering_key>>(this->shared_from_this());
+        }
+        auto prefix_only = ::make_shared<single_column_primary_key_restrictions<clustering_key>>(_schema, _allow_filtering);
+        auto it = all_restrictions.begin();
+        for (size_t taken = 0; taken < prefix_len; ++taken, ++it) { // copy just the first prefix_len restrictions
+            prefix_only->merge_with(it->second);
+        }
+        return prefix_only;
+    }
+
    virtual void merge_with(::shared_ptr<restriction> restriction) override {
        if (restriction->is_multi_column()) {
            throw exceptions::invalid_request_exception(
@@ -309,6 +356,11 @@ public:
        }
        return res;
    }
+
+    virtual bytes_opt value_for(const column_definition& cdef, const query_options& options) const override {
+        return _restrictions->value_for(cdef, options);
+    }
+
    std::vector<bytes_opt> bounds(statements::bound b, const query_options& options) const override {
        // TODO: if this proved to be required.
        fail(unimplemented::cause::LEGACY_COMPOSITE_KEYS); // not 100% correct...
@@ -355,10 +407,11 @@ public:
    }

    virtual bool needs_filtering(const schema& schema) const override;
+    virtual unsigned int num_prefix_columns_that_need_not_be_filtered() const override;
 };

 template<>
-dht::partition_range_vector
+inline dht::partition_range_vector
 single_column_primary_key_restrictions<partition_key>::bounds_ranges(const query_options& options) const {
    dht::partition_range_vector ranges;
    ranges.reserve(size());
@@ -376,7 +429,7 @@ single_column_primary_key_restrictions<partition_key>::bounds_ranges(const query
 }

 template<>
-std::vector<query::clustering_range>
+inline std::vector<query::clustering_range>
 single_column_primary_key_restrictions<clustering_key_prefix>::bounds_ranges(const query_options& options) const {
    auto wrapping_bounds = compute_bounds(options);
    auto bounds = boost::copy_range<query::clustering_row_ranges>(wrapping_bounds
@@ -413,12 +466,12 @@ single_column_primary_key_restrictions<clustering_key_prefix>::bounds_ranges(con
 }

 template<>
-bool single_column_primary_key_restrictions<partition_key>::needs_filtering(const schema& schema) const {
+inline bool single_column_primary_key_restrictions<partition_key>::needs_filtering(const schema& schema) const {
    return primary_key_restrictions<partition_key>::needs_filtering(schema);
 }

 template<>
-bool single_column_primary_key_restrictions<clustering_key>::needs_filtering(const schema& schema) const {
+inline bool single_column_primary_key_restrictions<clustering_key>::needs_filtering(const schema& schema) const {
    // Restrictions currently need filtering in three cases:
    // 1. any of them is a CONTAINS restriction
    // 2. restrictions do not form a contiguous prefix (i.e. there are gaps in it)
@@ -435,6 +488,39 @@ bool single_column_primary_key_restrictions<clustering_key>::needs_filtering(con
    return false;
 }

+// Number of leading restrictions (in column order) that need no filtering
+// because they are implemented as a slice (potentially, a contiguous disk
+// read). For example, if we have the filter "c1 < 3 and c2 > 3", c1 does not
+// need filtering but c2 does so num_prefix_columns_that_need_not_be_filtered
+// will be 1.
+// The implementation of num_prefix_columns_that_need_not_be_filtered() is
+// closely tied to that of needs_filtering() above - basically, if only the
+// first num_prefix_columns_that_need_not_be_filtered() restrictions existed,
+// then needs_filtering() would have returned false.
+template<>
+inline unsigned single_column_primary_key_restrictions<clustering_key>::num_prefix_columns_that_need_not_be_filtered() const {
+    column_id next_expected_id = 0;
+    unsigned int prefix_len = 0;
+    for (const auto& r : _restrictions->restrictions() | boost::adaptors::map_values) {
+        if (r->is_contains() || r->get_column_def().id != next_expected_id) {
+            break;
+        }
+        if (!r->is_slice()) { // a slice does not advance the expected id, so a restriction on another column ends the count
+            next_expected_id = r->get_column_def().id + 1;
+        }
+        ++prefix_len;
+    }
+    return prefix_len;
+}
+
+template<>
+inline unsigned single_column_primary_key_restrictions<partition_key>::num_prefix_columns_that_need_not_be_filtered() const {
+    // skip_filtering() is currently called only for clustering key
+    // restrictions, so it doesn't matter what we return here.
+    return 0;
+}
+
+
 }
 }

--- a/cql3/restrictions/single_column_restriction.hh
+++ b/cql3/restrictions/single_column_restriction.hh
@@ -95,6 +95,7 @@ public:
    virtual bool is_supported_by(const secondary_index::index& index) const = 0;
    using abstract_restriction::is_satisfied_by;
    virtual bool is_satisfied_by(bytes_view data, const query_options& options) const = 0;
+    virtual ::shared_ptr<single_column_restriction> apply_to(const column_definition& cdef) = 0;
 #if 0
    /**
     * Check if this type of restriction is supported by the specified index.
@@ -169,6 +170,9 @@ public:
                                 const query_options& options,
                                 gc_clock::time_point now) const override;
    virtual bool is_satisfied_by(bytes_view data, const query_options& options) const override;
+    virtual ::shared_ptr<single_column_restriction> apply_to(const column_definition& cdef) override {
+        return ::make_shared<EQ>(cdef, _value);
+    }

 #if 0
        @Override
@@ -205,7 +209,18 @@ public:
                                 const query_options& options,
                                 gc_clock::time_point now) const override;
    virtual bool is_satisfied_by(bytes_view data, const query_options& options) const override;
+    virtual ::shared_ptr<single_column_restriction> apply_to(const column_definition& cdef) override {
+        throw std::logic_error("IN superclass should never be cloned directly");
+    }

+    virtual std::vector<bytes_opt> values_raw(const query_options& options) const = 0;
+
+    virtual std::vector<bytes_opt> values(const query_options& options) const override {
+        std::vector<bytes_opt> distinct = values_raw(options);
+        std::sort(distinct.begin(), distinct.end()); // sorting first makes equal values adjacent for std::unique
+        distinct.erase(std::unique(distinct.begin(), distinct.end()), distinct.end());
+        return distinct;
+    }
 #if 0
    @Override
    protected final boolean isSupportedBy(SecondaryIndex index)
@@ -228,7 +243,7 @@ public:
        return abstract_restriction::term_uses_function(_values, ks_name, function_name);
    }

-    virtual std::vector<bytes_opt> values(const query_options& options) const override {
+    virtual std::vector<bytes_opt> values_raw(const query_options& options) const override {
        std::vector<bytes_opt> ret;
        for (auto&& v : _values) {
            ret.emplace_back(to_bytes_opt(v->bind_and_get(options)));
@@ -239,6 +254,10 @@ public:
    virtual sstring to_string() const override {
        return sprint("IN(%s)", std::to_string(_values));
    }
+
+    virtual ::shared_ptr<single_column_restriction> apply_to(const column_definition& cdef) override {
+        return ::make_shared<IN_with_values>(cdef, _values);
+    }
 };

 class single_column_restriction::IN_with_marker : public IN {
@@ -253,7 +272,7 @@ public:
        return false;
    }

-    virtual std::vector<bytes_opt> values(const query_options& options) const override {
+    virtual std::vector<bytes_opt> values_raw(const query_options& options) const override {
        auto&& lval = dynamic_pointer_cast<multi_item_terminal>(_marker->bind(options));
        if (!lval) {
            throw exceptions::invalid_request_exception("Invalid null value for IN restriction");
@@ -264,6 +283,10 @@ public:
    virtual sstring to_string() const override {
        return "IN ?";
    }
+
+    virtual ::shared_ptr<single_column_restriction> apply_to(const column_definition& cdef) override {
+        return ::make_shared<IN_with_marker>(cdef, _marker);
+    }
 };

 class single_column_restriction::slice : public single_column_restriction {
@@ -275,6 +298,11 @@ public:
        , _slice(term_slice::new_instance(bound, inclusive, std::move(term)))
    { }

+    slice(const column_definition& column_def, term_slice slice)
+        : single_column_restriction(column_def)
+        , _slice(std::move(slice)) // by-value sink parameter: move it into place instead of copying again
+    { }
+
    virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override {
        return (_slice.has_bound(statements::bound::START) && abstract_restriction::term_uses_function(_slice.bound(statements::bound::START), ks_name, function_name))
                || (_slice.has_bound(statements::bound::END) && abstract_restriction::term_uses_function(_slice.bound(statements::bound::END), ks_name, function_name));
@@ -361,6 +389,9 @@ public:
                                 const query_options& options,
                                 gc_clock::time_point now) const override;
    virtual bool is_satisfied_by(bytes_view data, const query_options& options) const override;
+    virtual ::shared_ptr<single_column_restriction> apply_to(const column_definition& cdef) override {
+        return ::make_shared<slice>(cdef, _slice);
+    }
 };

 // This holds CONTAINS, CONTAINS_KEY, and map[key] = value restrictions because we might want to have any combination of them.
@@ -483,6 +514,9 @@ public:
                                 const query_options& options,
                                 gc_clock::time_point now) const override;
    virtual bool is_satisfied_by(bytes_view data, const query_options& options) const override;
+    virtual ::shared_ptr<single_column_restriction> apply_to(const column_definition& cdef) override {
+        throw std::logic_error("Cloning 'contains' restriction is not implemented.");
+    }

 #if 0
        private List<ByteBuffer> keys(const query_options& options) {
--- a/cql3/restrictions/single_column_restrictions.hh
+++ b/cql3/restrictions/single_column_restrictions.hh
@@ -111,6 +111,11 @@ public:
        return r;
    }

+    virtual bytes_opt value_for(const column_definition& cdef, const query_options& options) const override {
+        if (auto found = _restrictions.find(std::addressof(cdef)); found != _restrictions.end()) { return found->second->value(options); }
+        return bytes_opt{}; // no restriction registered for this column
+    }
+
    /**
     * Returns the restriction associated to the specified column.
     *
--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -72,6 +72,9 @@ public:
        // throw? should not reach?
        return {};
    }
+    bytes_opt value_for(const column_definition& cdef, const query_options& options) const override {
+        return {};
+    }
    std::vector<T> values_as_keys(const query_options& options) const override {
        // throw? should not reach?
        return {};
@@ -212,12 +215,13 @@ statement_restrictions::statement_restrictions(database& db,
    auto& cf = db.find_column_family(schema);
    auto& sim = cf.get_index_manager();
    bool has_queriable_clustering_column_index = _clustering_columns_restrictions->has_supporting_index(sim);
+    bool has_queriable_pk_index = _partition_key_restrictions->has_supporting_index(sim);
    bool has_queriable_index = has_queriable_clustering_column_index
-            || _partition_key_restrictions->has_supporting_index(sim)
+            || has_queriable_pk_index
            || _nonprimary_key_restrictions->has_supporting_index(sim);

    // At this point, the select statement if fully constructed, but we still have a few things to validate
-    process_partition_key_restrictions(has_queriable_index, for_view, allow_filtering);
+    process_partition_key_restrictions(has_queriable_pk_index, for_view, allow_filtering);

    // Some but not all of the partition key columns have been specified;
    // hence we need turn these restrictions into index expressions.
@@ -237,7 +241,7 @@ statement_restrictions::statement_restrictions(database& db,
        }
    }

-    process_clustering_columns_restrictions(has_queriable_index, select_a_collection, for_view, allow_filtering);
+    process_clustering_columns_restrictions(has_queriable_clustering_column_index, select_a_collection, for_view, allow_filtering);

    // Covers indexes on the first clustering column (among others).
    if (_is_key_range && has_queriable_clustering_column_index) {
@@ -333,6 +337,52 @@ const std::vector<::shared_ptr<restrictions>>& statement_restrictions::index_res
    return _index_restrictions;
 }

+std::optional<secondary_index::index> statement_restrictions::find_idx(secondary_index::secondary_index_manager& sim) const {
+    for (const auto& candidate : index_restrictions()) {
+        for (const auto& cdef : candidate->get_column_defs()) {
+            for (auto idx : sim.list_indexes()) {
+                if (idx.depends_on(*cdef)) { // first match wins: restrictions, then their columns, then indexes
+                    return std::optional<secondary_index::index>(std::move(idx));
+                }
+            }
+        }
+    }
+    return std::nullopt; // no listed index depends on any column of the index restrictions
+}
+
+std::vector<const column_definition*> statement_restrictions::get_column_defs_for_filtering(database& db) const {
+    std::vector<const column_definition*> column_defs_for_filtering;
+    if (!need_filtering()) {
+        return column_defs_for_filtering;
+    }
+    auto& sim = db.find_column_family(_schema).get_index_manager();
+    std::optional<secondary_index::index> opt_idx = find_idx(sim);
+    // Keeps a column unless the index found above depends on it.
+    auto add_unless_indexed = [&] (const column_definition* cdef) {
+        if (!(opt_idx && opt_idx->depends_on(*cdef))) {
+            column_defs_for_filtering.emplace_back(cdef);
+        }
+    };
+    if (_partition_key_restrictions->needs_filtering(*_schema)) {
+        for (auto&& cdef : _partition_key_restrictions->get_column_defs()) {
+            add_unless_indexed(cdef);
+        }
+    }
+    if (_clustering_columns_restrictions->needs_filtering(*_schema)) {
+        column_id first_filtering_id = _schema->clustering_key_columns().begin()->id +
+                _clustering_columns_restrictions->num_prefix_columns_that_need_not_be_filtered();
+        for (auto&& cdef : _clustering_columns_restrictions->get_column_defs()) {
+            if (cdef->id >= first_filtering_id) {
+                add_unless_indexed(cdef);
+            }
+        }
+    }
+    for (auto&& cdef : _nonprimary_key_restrictions->get_column_defs()) {
+        add_unless_indexed(cdef);
+    }
+    return column_defs_for_filtering;
+}
+
 void statement_restrictions::process_partition_key_restrictions(bool has_queriable_index, bool for_view, bool allow_filtering) {
    // If there is a queriable index, no special condition are required on the other restrictions.
    // But we still need to know 2 things:
@@ -413,18 +463,28 @@ std::vector<query::clustering_range> statement_restrictions::get_clustering_boun
    if (_clustering_columns_restrictions->empty()) {
        return {query::clustering_range::make_open_ended_both_sides()};
    }
-    // TODO(sarna): For filtering to work, clustering range is not bounded at all. For filtering to work faster,
-    // the biggest clustering prefix restriction should be used here.
    if (_clustering_columns_restrictions->needs_filtering(*_schema)) {
+        if (auto single_ck_restrictions = dynamic_pointer_cast<single_column_primary_key_restrictions<clustering_key>>(_clustering_columns_restrictions)) {
+            return single_ck_restrictions->get_longest_prefix_restrictions()->bounds_ranges(options);
+        }
        return {query::clustering_range::make_open_ended_both_sides()};
    }
    return _clustering_columns_restrictions->bounds_ranges(options);
 }

 bool statement_restrictions::need_filtering() const {
-    uint32_t number_of_restricted_columns = 0;
+    uint32_t number_of_restricted_columns_for_indexing = 0;
    for (auto&& restrictions : _index_restrictions) {
-        number_of_restricted_columns += restrictions->size();
+        number_of_restricted_columns_for_indexing += restrictions->size();
+    }
+
+    int number_of_filtering_restrictions = _nonprimary_key_restrictions->size();
+    // If the whole partition key is restricted, it does not imply filtering
+    if (_partition_key_restrictions->has_unrestricted_components(*_schema) || !_partition_key_restrictions->is_all_eq()) {
+        number_of_filtering_restrictions += _partition_key_restrictions->size();
+        if (_clustering_columns_restrictions->has_unrestricted_components(*_schema)) {
+            number_of_filtering_restrictions += _clustering_columns_restrictions->size() - _clustering_columns_restrictions->prefix_size();
+        }
    }

    if (_partition_key_restrictions->is_multi_column() || _clustering_columns_restrictions->is_multi_column()) {
@@ -433,10 +493,11 @@ bool statement_restrictions::need_filtering() const {
        return false;
    }

-    return number_of_restricted_columns > 1
-            || (number_of_restricted_columns == 0 && _partition_key_restrictions->empty() && !_clustering_columns_restrictions->empty())
-            || (number_of_restricted_columns != 0 && _nonprimary_key_restrictions->has_multiple_contains())
-            || (number_of_restricted_columns != 0 && !_uses_secondary_indexing);
+    return number_of_restricted_columns_for_indexing > 1
+            || (number_of_restricted_columns_for_indexing == 0 && _partition_key_restrictions->empty() && !_clustering_columns_restrictions->empty())
+            || (number_of_restricted_columns_for_indexing != 0 && _nonprimary_key_restrictions->has_multiple_contains())
+            || (number_of_restricted_columns_for_indexing != 0 && !_uses_secondary_indexing)
+            || (_uses_secondary_indexing && number_of_filtering_restrictions > 1);
 }

 void statement_restrictions::validate_secondary_index_selections(bool selects_only_static_columns) {
@@ -582,7 +643,8 @@ static query::range<bytes_view> to_range(const term_slice& slice, const query_op
        if (!value) {
            return { };
        }
-        return { range_type::bound(*value, slice.is_inclusive(bound)) };
+        auto value_view = options.linearize(*value);
+        return { range_type::bound(value_view, slice.is_inclusive(bound)) };
    };
    return range_type(
        extract_bound(statements::bound::START),
@@ -611,7 +673,7 @@ bool single_column_restriction::slice::is_satisfied_by(bytes_view data, const qu
    if (_column_def.type->is_counter()) {
        fail(unimplemented::cause::COUNTERS);
    }
-    return to_range(_slice, options).contains(data, _column_def.type->as_tri_comparator());
+    return to_range(_slice, options).contains(data, _column_def.type->underlying_type()->as_tri_comparator());
 }

 bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
@@ -647,10 +709,12 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
            if (!val) {
                continue;
            }
-            auto found = std::find_if(elements.begin(), end, [&] (auto&& element) {
+            auto found = with_linearized(*val, [&] (bytes_view bv) {
+              return std::find_if(elements.begin(), end, [&] (auto&& element) {
                return element.second.value().with_linearized([&] (bytes_view value_bv) {
-                    return element_type->compare(value_bv, *val) == 0;
+                    return element_type->compare(value_bv, bv) == 0;
                });
+              });
            });
            if (found == end) {
                return false;
@@ -661,8 +725,10 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
            if (!k) {
                continue;
            }
-            auto found = std::find_if(elements.begin(), end, [&] (auto&& element) {
-                return map_key_type->compare(element.first, *k) == 0;
+            auto found = with_linearized(*k, [&] (bytes_view bv) {
+              return std::find_if(elements.begin(), end, [&] (auto&& element) {
+                return map_key_type->compare(element.first, bv) == 0;
+              });
            });
            if (found == end) {
                return false;
@@ -674,14 +740,18 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
            if (!map_key || !map_value) {
                continue;
            }
-            auto found = std::find_if(elements.begin(), end, [&] (auto&& element) {
-                return map_key_type->compare(element.first, *map_key) == 0;
+            auto found = with_linearized(*map_key, [&] (bytes_view map_key_bv) {
+              return std::find_if(elements.begin(), end, [&] (auto&& element) {
+                return map_key_type->compare(element.first, map_key_bv) == 0;
+              });
            });
            if (found == end) {
                return false;
            }
-            auto cmp = found->second.value().with_linearized([&] (bytes_view value_bv) {
-                return element_type->compare(value_bv, *map_value);
+            auto cmp = with_linearized(*map_value, [&] (bytes_view map_value_bv) {
+              return found->second.value().with_linearized([&] (bytes_view value_bv) {
+                return element_type->compare(value_bv, map_value_bv);
+              });
            });
            if (cmp != 0) {
                return false;
@@ -698,13 +768,14 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
            return _column_def.type->deserialize(cell_value_bv);
        });
        for (auto&& value : _values) {
-            auto val = value->bind_and_get(options);
-            if (!val) {
+            auto fragmented_val = value->bind_and_get(options);
+            if (!fragmented_val) {
                continue;
            }
+          return with_linearized(*fragmented_val, [&] (bytes_view val) {
            auto exists_in = [&](auto&& range) {
                auto found = std::find_if(range.begin(), range.end(), [&] (auto&& element) {
-                    return element_type->compare(element.serialize(), *val) == 0;
+                    return element_type->compare(element.serialize(), val) == 0;
                });
                return found != range.end();
            };
@@ -722,6 +793,8 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
                    return false;
                }
            }
+            return true;
+          });
        }
        if (col_type->is_map()) {
            auto& data_map = value_cast<map_type_impl::native_type>(deserialized);
@@ -730,8 +803,10 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
                if (!k) {
                    continue;
                }
-                auto found = std::find_if(data_map.begin(), data_map.end(), [&] (auto&& element) {
-                    return map_key_type->compare(element.first.serialize(), *k) == 0;
+                auto found = with_linearized(*k, [&] (bytes_view k_bv) {
+                  return std::find_if(data_map.begin(), data_map.end(), [&] (auto&& element) {
+                    return map_key_type->compare(element.first.serialize(), k_bv) == 0;
+                  });
                });
                if (found == data_map.end()) {
                    return false;
@@ -743,10 +818,15 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
                if (!map_key || !map_value) {
                    continue;
                }
-                auto found = std::find_if(data_map.begin(), data_map.end(), [&] (auto&& element) {
-                    return map_key_type->compare(element.first.serialize(), *map_key) == 0;
+                auto found = with_linearized(*map_key, [&] (bytes_view map_key_bv) {
+                  return std::find_if(data_map.begin(), data_map.end(), [&] (auto&& element) {
+                    return map_key_type->compare(element.first.serialize(), map_key_bv) == 0;
+                  });
                });
-                if (found == data_map.end() || element_type->compare(found->second.serialize(), *map_value) != 0) {
+                if (found == data_map.end()
+                    || with_linearized(*map_value, [&] (bytes_view map_value_bv) {
+                         return element_type->compare(found->second.serialize(), map_value_bv);
+                       }) != 0) {
                    return false;
                }
            }
--- a/cql3/restrictions/statement_restrictions.hh
+++ b/cql3/restrictions/statement_restrictions.hh
@@ -163,6 +163,20 @@ public:
        return _clustering_columns_restrictions;
    }

+    /**
+     * Builds a possibly empty collection of column definitions that will be used for filtering
+     * @param db - the database context
+     * @return A list with the column definitions needed for filtering.
+     */
+    std::vector<const column_definition*> get_column_defs_for_filtering(database& db) const;
+
+    /**
+     * Determines the index to be used with the restriction.
+     * @param sim - the secondary index manager in which to look for a usable index
+     * @return If an index can be used, an optional containing this index, otherwise an empty optional.
+     */
+    std::optional<secondary_index::index> find_idx(secondary_index::secondary_index_manager& sim) const;
+
    /**
     * Checks if the partition key has some unrestricted components.
     * @return <code>true</code> if the partition key has some unrestricted components, <code>false</code> otherwise.
--- a/cql3/result_set.cc
+++ b/cql3/result_set.cc
@@ -45,27 +45,25 @@ namespace cql3 {

 metadata::metadata(std::vector<::shared_ptr<column_specification>> names_)
        : _flags(flag_enum_set())
-        , names(std::move(names_)) {
-    _column_count = names.size();
-}
+        , _column_info(make_lw_shared<column_info>(std::move(names_)))
+{ }

 metadata::metadata(flag_enum_set flags, std::vector<::shared_ptr<column_specification>> names_, uint32_t column_count,
        ::shared_ptr<const service::pager::paging_state> paging_state)
    : _flags(flags)
-    , names(std::move(names_))
-    , _column_count(column_count)
+    , _column_info(make_lw_shared<column_info>(std::move(names_), column_count))
    , _paging_state(std::move(paging_state))
 { }

 // The maximum number of values that the ResultSet can hold. This can be bigger than columnCount due to CASSANDRA-4911
 uint32_t metadata::value_count() const {
-    return _flags.contains<flag::NO_METADATA>() ? _column_count : names.size();
+    return _flags.contains<flag::NO_METADATA>() ? _column_info->_column_count : _column_info->_names.size();
 }

 void metadata::add_non_serialized_column(::shared_ptr<column_specification> name) {
    // See comment above. Because columnCount doesn't account the newly added name, it
    // won't be serialized.
-    names.emplace_back(std::move(name));
+    _column_info->_names.emplace_back(std::move(name));
 }

 bool metadata::all_in_same_cf() const {
@@ -73,18 +71,21 @@ bool metadata::all_in_same_cf() const {
        return false;
    }

-    return column_specification::all_in_same_table(names);
+    return column_specification::all_in_same_table(_column_info->_names);
 }

-void metadata::set_has_more_pages(::shared_ptr<const service::pager::paging_state> paging_state) {
-    if (!paging_state) {
-        return;
-    }
-
+void metadata::set_paging_state(::shared_ptr<const service::pager::paging_state> paging_state) {
    _flags.set<flag::HAS_MORE_PAGES>();
    _paging_state = std::move(paging_state);
 }

+void metadata::maybe_set_paging_state(::shared_ptr<const service::pager::paging_state> paging_state) {
+    assert(paging_state); // a null paging_state is treated as a caller error here
+    if (paging_state->get_remaining() > 0) { // the state is stored (via set_paging_state) only when get_remaining() is positive
+        set_paging_state(std::move(paging_state));
+    }
+}
+
 void metadata::set_skip_metadata() {
    _flags.set<flag::NO_METADATA>();
 }
@@ -93,18 +94,10 @@ metadata::flag_enum_set metadata::flags() const {
    return _flags;
 }

-uint32_t metadata::column_count() const {
-    return _column_count;
-}
-
 ::shared_ptr<const service::pager::paging_state> metadata::paging_state() const {
    return _paging_state;
 }

-const std::vector<::shared_ptr<column_specification>>& metadata::get_names() const {
-    return names;
-}
-
 prepared_metadata::prepared_metadata(const std::vector<::shared_ptr<column_specification>>& names,
                                     const std::vector<uint16_t>& partition_key_bind_indices)
    : _names{names}
--- a/cql3/result_set.hh
+++ b/cql3/result_set.hh
@@ -70,18 +70,29 @@ public:

    using flag_enum_set = enum_set<flag_enum>;

-private:
-    flag_enum_set _flags;
-
-public:
+    struct column_info {
    // Please note that columnCount can actually be smaller than names, even if names is not null. This is
    // used to include columns in the resultSet that we need to do post-query re-orderings
    // (SelectStatement.orderResults) but that shouldn't be sent to the user as they haven't been requested
    // (CASSANDRA-4911). So the serialization code will exclude any columns in name whose index is >= columnCount.
-    std::vector<::shared_ptr<column_specification>> names;
+        std::vector<::shared_ptr<column_specification>> _names;
+        uint32_t _column_count;
+
+        column_info(std::vector<::shared_ptr<column_specification>> names, uint32_t column_count)
+            : _names(std::move(names))
+            , _column_count(column_count)
+        { }
+
+        explicit column_info(std::vector<::shared_ptr<column_specification>> names)
+            : _names(std::move(names))
+            , _column_count(_names.size())
+        { }
+    };
+private:
+    flag_enum_set _flags;

 private:
-    uint32_t _column_count;
+    lw_shared_ptr<column_info> _column_info;
    ::shared_ptr<const service::pager::paging_state> _paging_state;

 public:
@@ -99,17 +110,20 @@ private:
    bool all_in_same_cf() const;

 public:
-    void set_has_more_pages(::shared_ptr<const service::pager::paging_state> paging_state);
+    void set_paging_state(::shared_ptr<const service::pager::paging_state> paging_state);
+    void maybe_set_paging_state(::shared_ptr<const service::pager::paging_state> paging_state);

    void set_skip_metadata();

    flag_enum_set flags() const;

-    uint32_t column_count() const;
+    uint32_t column_count() const { return _column_info->_column_count; }

    ::shared_ptr<const service::pager::paging_state> paging_state() const;

-    const std::vector<::shared_ptr<column_specification>>& get_names() const;
+    const std::vector<::shared_ptr<column_specification>>& get_names() const {
+        return _column_info->_names;
+    }
 };

 ::shared_ptr<const cql3::metadata> make_empty_metadata();
@@ -223,14 +237,14 @@ public:
 class result {
    std::unique_ptr<cql3::result_set> _result_set;
    result_generator _result_generator;
-    shared_ptr<cql3::metadata> _metadata;
+    shared_ptr<const cql3::metadata> _metadata;
 public:
    explicit result(std::unique_ptr<cql3::result_set> rs)
        : _result_set(std::move(rs))
        , _metadata(_result_set->_metadata)
    { }

-    explicit result(result_generator generator, shared_ptr<metadata> m)
+    explicit result(result_generator generator, shared_ptr<const metadata> m)
        : _result_generator(std::move(generator))
        , _metadata(std::move(m))
    { }
@@ -240,7 +254,7 @@ public:
        if (_result_set) {
            return *_result_set;
        } else {
-            auto builder = result_set::builder(_metadata);
+            auto builder = result_set::builder(make_shared<cql3::metadata>(*_metadata));
            _result_generator.visit(builder);
            return std::move(builder).get_result_set();
        }
--- a/cql3/selection/selection.cc
+++ b/cql3/selection/selection.cc
@@ -40,6 +40,7 @@
 */

 #include <boost/range/adaptor/transformed.hpp>
+#include <boost/range/adaptor/filtered.hpp>

 #include "cql3/selection/selection.hh"
 #include "cql3/selection/selector_factories.hh"
@@ -155,9 +156,9 @@ public:
        return _factories->uses_function(ks_name, function_name);
    }

-    virtual uint32_t add_column_for_ordering(const column_definition& c) override {
-        uint32_t index = selection::add_column_for_ordering(c);
-        _factories->add_selector_for_ordering(c, index);
+    virtual uint32_t add_column_for_post_processing(const column_definition& c) override {
+        uint32_t index = selection::add_column_for_post_processing(c);
+        _factories->add_selector_for_post_processing(c, index);
        return index;
    }

@@ -208,9 +209,17 @@ protected:

 ::shared_ptr<selection> selection::wildcard(schema_ptr schema) {
    auto columns = schema->all_columns_in_select_order();
-    auto cds = boost::copy_range<std::vector<const column_definition*>>(columns | boost::adaptors::transformed([](const column_definition& c) {
-        return &c;
-    }));
+    // filter out hidden columns, which should not be seen by the
+    // user when doing "SELECT *". We also disallow selecting them
+    // individually (see column_identifier::new_selector_factory()).
+    auto cds = boost::copy_range<std::vector<const column_definition*>>(
+        columns |
+        boost::adaptors::filtered([](const column_definition& c) {
+            return !c.is_view_virtual();
+        }) |
+        boost::adaptors::transformed([](const column_definition& c) {
+            return &c;
+        }));
    return simple_selection::make(schema, std::move(cds), true);
 }

@@ -218,7 +227,7 @@ protected:
    return simple_selection::make(schema, std::move(columns), false);
 }

-uint32_t selection::add_column_for_ordering(const column_definition& c) {
+uint32_t selection::add_column_for_post_processing(const column_definition& c) {
    _columns.push_back(&c);
    _metadata->add_non_serialized_column(c.column_specification);
    return _columns.size() - 1;
@@ -330,14 +339,14 @@ std::unique_ptr<result_set> result_set_builder::build() {
    return std::move(_result_set);
 }

-bool result_set_builder::restrictions_filter::operator()(const selection& selection,
+bool result_set_builder::restrictions_filter::do_filter(const selection& selection,
                                                         const std::vector<bytes>& partition_key,
                                                         const std::vector<bytes>& clustering_key,
                                                         const query::result_row_view& static_row,
                                                         const query::result_row_view& row) const {
    static logging::logger rlogger("restrictions_filter");

-    if (_current_pratition_key_does_not_match || _current_static_row_does_not_match) {
+    if (_current_partition_key_does_not_match || _current_static_row_does_not_match || _remaining == 0) {
        return false;
    }

@@ -350,11 +359,16 @@ bool result_set_builder::restrictions_filter::operator()(const selection& select
        switch (cdef->kind) {
        case column_kind::static_column:
            // fallthrough
-        case column_kind::regular_column:
+        case column_kind::regular_column: {
+            auto& cell_iterator = (cdef->kind == column_kind::static_column) ? static_row_iterator : row_iterator;
            if (cdef->type->is_multi_cell()) {
-                rlogger.debug("Multi-cell filtering is not implemented yet", cdef->name_as_text());
+                cell_iterator.next_collection_cell();
+                auto restr_it = non_pk_restrictions_map.find(cdef);
+                if (restr_it == non_pk_restrictions_map.end()) {
+                    continue;
+                }
+                throw exceptions::invalid_request_exception("Collection filtering is not supported yet");
            } else {
-                auto cell_iterator = (cdef->kind == column_kind::static_column) ? static_row_iterator : row_iterator;
                auto cell = cell_iterator.next_atomic_cell();

                auto restr_it = non_pk_restrictions_map.find(cdef);
@@ -365,17 +379,18 @@ bool result_set_builder::restrictions_filter::operator()(const selection& select

                bool regular_restriction_matches;
                if (cell) {
-                    regular_restriction_matches = cell->value().with_linearized([&restriction](bytes_view data) {
-                        return restriction.is_satisfied_by(data, cql3::query_options({ }));
+                    regular_restriction_matches = cell->value().with_linearized([&restriction, this](bytes_view data) {
+                        return restriction.is_satisfied_by(data, _options);
                    });
                } else {
-                    regular_restriction_matches = restriction.is_satisfied_by(bytes(), cql3::query_options({ }));
+                    regular_restriction_matches = restriction.is_satisfied_by(bytes(), _options);
                }
                if (!regular_restriction_matches) {
                    _current_static_row_does_not_match = (cdef->kind == column_kind::static_column);
                    return false;
                }

+            }
            }
            break;
        case column_kind::partition_key: {
@@ -385,9 +400,9 @@ bool result_set_builder::restrictions_filter::operator()(const selection& select
            }
            restrictions::single_column_restriction& restriction = *restr_it->second;
            const bytes& value_to_check = partition_key[cdef->id];
-            bool pk_restriction_matches = restriction.is_satisfied_by(value_to_check, cql3::query_options({ }));
+            bool pk_restriction_matches = restriction.is_satisfied_by(value_to_check, _options);
            if (!pk_restriction_matches) {
-                _current_pratition_key_does_not_match = true;
+                _current_partition_key_does_not_match = true;
                return false;
            }
            }
@@ -399,7 +414,7 @@ bool result_set_builder::restrictions_filter::operator()(const selection& select
            }
            restrictions::single_column_restriction& restriction = *restr_it->second;
            const bytes& value_to_check = clustering_key[cdef->id];
-            bool pk_restriction_matches = restriction.is_satisfied_by(value_to_check, cql3::query_options({ }));
+            bool pk_restriction_matches = restriction.is_satisfied_by(value_to_check, _options);
            if (!pk_restriction_matches) {
                return false;
            }
@@ -412,6 +427,20 @@ bool result_set_builder::restrictions_filter::operator()(const selection& select
    return true;
 }

+bool result_set_builder::restrictions_filter::operator()(const selection& selection,
+                                                         const std::vector<bytes>& partition_key,
+                                                         const std::vector<bytes>& clustering_key,
+                                                         const query::result_row_view& static_row,
+                                                         const query::result_row_view& row) const {
+    const bool accepted = do_filter(selection, partition_key, clustering_key, static_row, row); // do_filter() performs the actual restriction checks
+    if (!accepted) {
+        ++_rows_dropped; // rejected rows are tallied; the count is read back through get_rows_dropped()
+    } else if (_remaining > 0) {
+        --_remaining; // each accepted row consumes one unit of the `remaining` value given to the constructor
+    }
+    return accepted;
+}
+
 api::timestamp_type result_set_builder::timestamp_of(size_t idx) {
    return _timestamps[idx];
 }
--- a/cql3/selection/selection.hh
+++ b/cql3/selection/selection.hh
@@ -169,10 +169,14 @@ public:
        return _metadata;
    }

+    ::shared_ptr<metadata> get_result_metadata() {
+        return _metadata;
+    }
+
    static ::shared_ptr<selection> wildcard(schema_ptr schema);
    static ::shared_ptr<selection> for_columns(schema_ptr schema, std::vector<const column_definition*> columns);

-    virtual uint32_t add_column_for_ordering(const column_definition& c);
+    virtual uint32_t add_column_for_post_processing(const column_definition& c);

    virtual bool uses_function(const sstring &ks_name, const sstring& function_name) const {
        return false;
@@ -255,19 +259,31 @@ public:
        }
        void reset() {
        }
+        uint32_t get_rows_dropped() const {
+            return 0;
+        }
    };
    class restrictions_filter {
        ::shared_ptr<restrictions::statement_restrictions> _restrictions;
-        mutable bool _current_pratition_key_does_not_match = false;
+        const query_options& _options;
+        mutable bool _current_partition_key_does_not_match = false;
        mutable bool _current_static_row_does_not_match = false;
+        mutable uint32_t _rows_dropped = 0;
+        mutable uint32_t _remaining = 0;
    public:
        restrictions_filter() = default;
-        explicit restrictions_filter(::shared_ptr<restrictions::statement_restrictions> restrictions) : _restrictions(restrictions) {}
+        explicit restrictions_filter(::shared_ptr<restrictions::statement_restrictions> restrictions, const query_options& options, uint32_t remaining) : _restrictions(restrictions), _options(options), _remaining(remaining) {}
        bool operator()(const selection& selection, const std::vector<bytes>& pk, const std::vector<bytes>& ck, const query::result_row_view& static_row, const query::result_row_view& row) const;
        void reset() {
-            _current_pratition_key_does_not_match = false;
+            _current_partition_key_does_not_match = false;
            _current_static_row_does_not_match = false;
+            _rows_dropped = 0;
        }
+        uint32_t get_rows_dropped() const {
+            return _rows_dropped;
+        }
+    private:
+        bool do_filter(const selection& selection, const std::vector<bytes>& pk, const std::vector<bytes>& ck, const query::result_row_view& static_row, const query::result_row_view& row) const;
    };

    result_set_builder(const selection& s, gc_clock::time_point now, cql_serialization_format sf);
@@ -367,7 +383,7 @@ public:
            }
        }

-        void accept_partition_end(const query::result_row_view& static_row) {
+        uint32_t accept_partition_end(const query::result_row_view& static_row) {
            if (_row_count == 0) {
                _builder.new_row();
                auto static_row_iterator = static_row.iterator();
@@ -381,6 +397,7 @@ public:
                    }
                }
            }
+            return _filter.get_rows_dropped();
        }
    };

--- a/cql3/selection/selector.hh
+++ b/cql3/selection/selector.hh
@@ -105,9 +105,11 @@ public:
    virtual void reset() = 0;

    virtual assignment_testable::test_result test_assignment(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) override {
-        if (receiver->type == get_type()) {
+        auto t1 = receiver->type->underlying_type();
+        auto t2 = get_type()->underlying_type();
+        if (t1 == t2) {
            return assignment_testable::test_result::EXACT_MATCH;
-        } else if (receiver->type->is_value_compatible_with(*get_type())) {
+        } else if (t1->is_value_compatible_with(*t2)) {
            return assignment_testable::test_result::WEAKLY_ASSIGNABLE;
        } else {
            return assignment_testable::test_result::NOT_ASSIGNABLE;
--- a/cql3/selection/selector_factories.cc
+++ b/cql3/selection/selector_factories.cc
@@ -53,6 +53,7 @@ selector_factories::selector_factories(std::vector<::shared_ptr<selectable>> sel
    : _contains_write_time_factory(false)
    , _contains_ttl_factory(false)
    , _number_of_aggregate_factories(0)
+    , _number_of_factories_for_post_processing(0)
 {
    _factories.reserve(selectables.size());

@@ -76,8 +77,9 @@ bool selector_factories::uses_function(const sstring& ks_name, const sstring& fu
    return false;
 }

-void selector_factories::add_selector_for_ordering(const column_definition& def, uint32_t index) {
+void selector_factories::add_selector_for_post_processing(const column_definition& def, uint32_t index) {
    _factories.emplace_back(simple_selector::new_factory(def.name_as_text(), index, def.type));
+    ++_number_of_factories_for_post_processing;
 }

 std::vector<::shared_ptr<selector>> selector_factories::new_instances() const {
--- a/cql3/selection/selector_factories.hh
+++ b/cql3/selection/selector_factories.hh
@@ -74,6 +74,11 @@ private:
     */
    uint32_t _number_of_aggregate_factories;

+    /**
+     * The number of factories that are only for post processing.
+     */
+    uint32_t _number_of_factories_for_post_processing;
+
 public:
    /**
     * Creates a new <code>SelectorFactories</code> instance and collect the column definitions.
@@ -97,11 +102,12 @@ public:
    bool uses_function(const sstring& ks_name, const sstring& function_name) const;

    /**
-     * Adds a new <code>Selector.Factory</code> for a column that is needed only for ORDER BY purposes.
+     * Adds a new <code>Selector.Factory</code> for a column that is needed only for ORDER BY or post
+     * processing purposes.
     * @param def the column that is needed for ordering
     * @param index the index of the column definition in the Selection's list of columns
     */
-    void add_selector_for_ordering(const column_definition& def, uint32_t index);
+    void add_selector_for_post_processing(const column_definition& def, uint32_t index);

    /**
     * Checks if this <code>SelectorFactories</code> contains only factories for aggregates.
@@ -111,7 +117,7 @@ public:
     */
    bool contains_only_aggregate_functions() const {
        auto size = _factories.size();
-        return size != 0 && _number_of_aggregate_factories == size;
+        return size != 0 && _number_of_aggregate_factories  == (size - _number_of_factories_for_post_processing);
    }

    /**
--- a/cql3/sets.cc
+++ b/cql3/sets.cc
@@ -120,17 +120,19 @@ sets::literal::to_string() const {
 }

 sets::value
-sets::value::from_serialized(bytes_view v, set_type type, cql_serialization_format sf) {
+sets::value::from_serialized(const fragmented_temporary_buffer::view& val, set_type type, cql_serialization_format sf) {
    try {
        // Collections have this small hack that validate cannot be called on a serialized object,
        // but compose does the validation (so we're fine).
        // FIXME: deserializeForNativeProtocol?!
+      return with_linearized(val, [&] (bytes_view v) {
        auto s = value_cast<set_type_impl::native_type>(type->deserialize(v, sf));
        std::set<bytes, serialized_compare> elements(type->get_elements_type()->as_less_comparator());
        for (auto&& element : s) {
            elements.insert(elements.end(), type->get_elements_type()->decompose(element));
        }
        return value(std::move(elements));
+      });
    } catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception(e.what());
    }
@@ -198,10 +200,10 @@ sets::delayed_value::bind(const query_options& options) {
            return constants::UNSET_VALUE;
        }
        // We don't support value > 64K because the serialization format encode the length as an unsigned short.
-        if (b->size() > std::numeric_limits<uint16_t>::max()) {
+        if (b->size_bytes() > std::numeric_limits<uint16_t>::max()) {
            throw exceptions::invalid_request_exception(sprint("Set value is too long. Set values are limited to %d bytes but %d bytes value provided",
                    std::numeric_limits<uint16_t>::max(),
-                    b->size()));
+                    b->size_bytes()));
        }

        buffers.insert(buffers.end(), std::move(to_bytes(*b)));
@@ -269,7 +271,7 @@ sets::adder::do_add(mutation& m, const clustering_key_prefix& row_key, const upd
        }

        for (auto&& e : set_value->_elements) {
-            mut.cells.emplace_back(e, params.make_cell(*set_type->value_comparator(), {}, atomic_cell::collection_member::yes));
+            mut.cells.emplace_back(e, params.make_cell(*set_type->value_comparator(), bytes_view(), atomic_cell::collection_member::yes));
        }
        auto smut = set_type->serialize_mutation_form(mut);

@@ -279,7 +281,7 @@ sets::adder::do_add(mutation& m, const clustering_key_prefix& row_key, const upd
        auto v = set_type->serialize_partially_deserialized_form(
                {set_value->_elements.begin(), set_value->_elements.end()},
                cql_serialization_format::internal());
-        m.set_cell(row_key, column, params.make_cell(*column.type, std::move(v)));
+        m.set_cell(row_key, column, params.make_cell(*column.type, fragmented_temporary_buffer::view(v)));
    } else {
        m.set_cell(row_key, column, params.make_dead_cell());
    }
--- a/cql3/sets.hh
+++ b/cql3/sets.hh
@@ -78,7 +78,7 @@ public:
        value(std::set<bytes, serialized_compare> elements)
                : _elements(std::move(elements)) {
        }
-        static value from_serialized(bytes_view v, set_type type, cql_serialization_format sf);
+        static value from_serialized(const fragmented_temporary_buffer::view& v, set_type type, cql_serialization_format sf);
        virtual cql3::raw_value get(const query_options& options) override;
        virtual bytes get_with_protocol_version(cql_serialization_format sf) override;
        bool equals(set_type st, const value& v);
--- a/cql3/single_column_relation.cc
+++ b/cql3/single_column_relation.cc
@@ -101,13 +101,6 @@ single_column_relation::to_receivers(schema_ptr schema, const column_definition&
    }

    if (is_IN()) {
-        // For partition keys we only support IN for the last name so far
-        if (column_def.is_partition_key() && !schema->is_last_partition_key(column_def)) {
-            throw exceptions::invalid_request_exception(sprint(
-                "Partition KEY part %s cannot be restricted by IN relation (only the last part of the partition key can)",
-                column_def.name_as_text()));
-        }
-
        // We only allow IN on the row key and the clustering key so far, never on non-PK columns, and this even if
        // there's an index
        // Note: for backward compatibility reason, we conside a IN of 1 value the same as a EQ, so we let that
--- a/cql3/statements/alter_table_statement.cc
+++ b/cql3/statements/alter_table_statement.cc
@@ -246,18 +246,22 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a

        cfm.with_column(column_name->name(), type, _is_static ? column_kind::static_column : column_kind::regular_column);

-        // Adding a column to a table which has an include all view requires the column to be added to the view
-        // as well. If the view has a regular base column in its PK, then the column ID needs to be updated in
-        // view_info; for that, rebuild the schema.
+        // Adding a column to a base table always requires updating the view
+        // schemas: If the view includes all columns it should include the new
+        // column, but if it doesn't, it may need to include the new
+        // unselected column as a virtual column. The case when we
+        // shouldn't add a virtual column is when the view has in its PK one
+        // of the base's regular columns - but even in this case we need to
+        // rebuild the view schema, to update the column ID.
        if (!_is_static) {
            for (auto&& view : cf.views()) {
-                if (view->view_info()->include_all_columns() || view->view_info()->base_non_pk_column_in_view_pk()) {
-                    schema_builder builder(view);
-                    if (view->view_info()->include_all_columns()) {
-                        builder.with_column(column_name->name(), type);
-                    }
-                    view_updates.push_back(view_ptr(builder.build()));
+                schema_builder builder(view);
+                if (view->view_info()->include_all_columns()) {
+                    builder.with_column(column_name->name(), type);
+                } else if (!view->view_info()->base_non_pk_column_in_view_pk()) {
+                    db::view::create_virtual_column(builder, column_name->name(), type);
                }
+                view_updates.push_back(view_ptr(builder.build()));
            }
        }

@@ -272,7 +276,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a

        auto type = validate_alter(schema, *def, *validator);
        // In any case, we update the column definition
-        cfm.with_altered_column_type(column_name->name(), type);
+        cfm.alter_column_type(column_name->name(), type);

        // We also have to validate the view types here. If we have a view which includes a column as part of
        // the clustering key, we need to make sure that it is indeed compatible.
@@ -281,7 +285,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
            if (view_def) {
                schema_builder builder(view);
                auto view_type = validate_alter(view, *view_def, *validator);
-                builder.with_altered_column_type(column_name->name(), std::move(view_type));
+                builder.alter_column_type(column_name->name(), std::move(view_type));
                view_updates.push_back(view_ptr(builder.build()));
            }
        }
@@ -302,7 +306,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
        } else {
            for (auto&& column_def : boost::range::join(schema->static_columns(), schema->regular_columns())) { // find
                if (column_def.name() == column_name->name()) {
-                    cfm.without_column(column_name->name());
+                    cfm.remove_column(column_name->name());
                    break;
                }
            }
@@ -345,9 +349,10 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
            auto to = entry.second->prepare_column_identifier(schema);

            validate_column_rename(db, *schema, *from, *to);
-            cfm.with_column_rename(from->name(), to->name());
+            cfm.rename_column(from->name(), to->name());

-            // If the view includes a renamed column, it must be renamed in the view table and the definition.
+            // If the view includes a renamed column, it must be renamed in
+            // the view table and the definition.
            for (auto&& view : cf.views()) {
                if (view->get_column_definition(from->name())) {
                    schema_builder builder(view);
@@ -355,7 +360,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
                    auto view_from = entry.first->prepare_column_identifier(view);
                    auto view_to = entry.second->prepare_column_identifier(view);
                    validate_column_rename(db, *view, *view_from, *view_to);
-                    builder.with_column_rename(view_from->name(), view_to->name());
+                    builder.rename_column(view_from->name(), view_to->name());

                    auto new_where = util::rename_column_in_where_clause(
                            view->view_info()->where_clause(),
--- a/cql3/statements/alter_type_statement.cc
+++ b/cql3/statements/alter_type_statement.cc
@@ -110,7 +110,7 @@ void alter_type_statement::do_announce_migration(database& db, ::keyspace& ks, b
            if (t_opt) {
                modified = true;
                // We need to update this column
-                cfm.with_altered_column_type(column.name(), *t_opt);
+                cfm.alter_column_type(column.name(), *t_opt);
            }
        }
        if (modified) {
--- a/cql3/statements/batch_statement.cc
+++ b/cql3/statements/batch_statement.cc
@@ -191,20 +191,20 @@ const std::vector<batch_statement::single_statement>& batch_statement::get_state
    return _statements;
 }

-future<std::vector<mutation>> batch_statement::get_mutations(service::storage_proxy& storage, const query_options& options, bool local, api::timestamp_type now, tracing::trace_state_ptr trace_state) {
+future<std::vector<mutation>> batch_statement::get_mutations(service::storage_proxy& storage, const query_options& options, db::timeout_clock::time_point timeout, bool local, api::timestamp_type now, tracing::trace_state_ptr trace_state) {
    // Do not process in parallel because operations like list append/prepend depend on execution order.
    using mutation_set_type = std::unordered_set<mutation, mutation_hash_by_key, mutation_equals_by_key>;
-    return do_with(mutation_set_type(), [this, &storage, &options, now, local, trace_state] (auto& result) {
+    return do_with(mutation_set_type(), [this, &storage, &options, timeout, now, local, trace_state] (auto& result) {
        result.reserve(_statements.size());
        _stats.statements_in_batches += _statements.size();
        return do_for_each(boost::make_counting_iterator<size_t>(0),
                           boost::make_counting_iterator<size_t>(_statements.size()),
-                           [this, &storage, &options, now, local, &result, trace_state] (size_t i) {
+                           [this, &storage, &options, now, local, &result, timeout, trace_state] (size_t i) {
            auto&& statement = _statements[i].statement;
            statement->inc_cql_stats();
            auto&& statement_options = options.for_statement(i);
            auto timestamp = _attrs->get_timestamp(now, statement_options);
-            return statement->get_mutations(storage, statement_options, local, timestamp, trace_state).then([&result] (auto&& more) {
+            return statement->get_mutations(storage, statement_options, timeout, local, timestamp, trace_state).then([&result] (auto&& more) {
                for (auto&& m : more) {
                    // We want unordered_set::try_emplace(), but we don't have it
                    auto pos = result.find(m);
@@ -293,8 +293,9 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
        return execute_with_conditions(storage, options, query_state);
    }

-    return get_mutations(storage, options, local, now, query_state.get_trace_state()).then([this, &storage, &options, tr_state = query_state.get_trace_state()] (std::vector<mutation> ms) mutable {
-        return execute_without_conditions(storage, std::move(ms), options.get_consistency(), std::move(tr_state));
+    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
+    return get_mutations(storage, options, timeout, local, now, query_state.get_trace_state()).then([this, &storage, &options, timeout, tr_state = query_state.get_trace_state()] (std::vector<mutation> ms) mutable {
+        return execute_without_conditions(storage, std::move(ms), options.get_consistency(), timeout, std::move(tr_state));
    }).then([] {
        return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(
                make_shared<cql_transport::messages::result_message::void_message>());
@@ -305,6 +306,7 @@ future<> batch_statement::execute_without_conditions(
        service::storage_proxy& storage,
        std::vector<mutation> mutations,
        db::consistency_level cl,
+        db::timeout_clock::time_point timeout,
        tracing::trace_state_ptr tr_state)
 {
    // FIXME: do we need to do this?
@@ -332,7 +334,7 @@ future<> batch_statement::execute_without_conditions(
            mutate_atomic = false;
        }
    }
-    return storage.mutate_with_triggers(std::move(mutations), cl, mutate_atomic, std::move(tr_state));
+    return storage.mutate_with_triggers(std::move(mutations), cl, timeout, mutate_atomic, std::move(tr_state));
 }

 future<shared_ptr<cql_transport::messages::result_message>> batch_statement::execute_with_conditions(
--- a/cql3/statements/batch_statement.hh
+++ b/cql3/statements/batch_statement.hh
@@ -125,7 +125,7 @@ public:

    const std::vector<single_statement>& get_statements();
 private:
-    future<std::vector<mutation>> get_mutations(service::storage_proxy& storage, const query_options& options, bool local, api::timestamp_type now, tracing::trace_state_ptr trace_state);
+    future<std::vector<mutation>> get_mutations(service::storage_proxy& storage, const query_options& options, db::timeout_clock::time_point timeout, bool local, api::timestamp_type now, tracing::trace_state_ptr trace_state);

 public:
    /**
@@ -147,6 +147,7 @@ private:
            service::storage_proxy& storage,
            std::vector<mutation> mutations,
            db::consistency_level cl,
+            db::timeout_clock::time_point timeout,
            tracing::trace_state_ptr tr_state);

    future<shared_ptr<cql_transport::messages::result_message>> execute_with_conditions(
--- a/cql3/statements/create_index_statement.cc
+++ b/cql3/statements/create_index_statement.cc
@@ -88,6 +88,11 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
        throw exceptions::invalid_request_exception("Secondary indexes are not supported on materialized views");
    }

+    if (schema->is_dense()) {
+        throw exceptions::invalid_request_exception(
+                "Secondary indexes are not supported on COMPACT STORAGE tables that have clustering columns");
+    }
+
    std::vector<::shared_ptr<index_target>> targets;
    for (auto& raw_target : _raw_targets) {
        targets.emplace_back(raw_target->prepare(schema));
@@ -109,6 +114,11 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
                    sprint("No column definition found for column %s", *target->column));
        }

+        //NOTICE(sarna): Should be lifted after resolving issue #2963
+        if (cd->is_static()) {
+            throw exceptions::invalid_request_exception("Indexing static columns is not implemented yet.");
+        }
+
        if (cd->type->references_duration()) {
            using request_validations::check_false;
            const auto& ty = *cd->type;
@@ -122,8 +132,7 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
        }

        // Origin TODO: we could lift that limitation
-        if ((schema->is_dense() || !schema->thrift().has_compound_comparator()) &&
-            cd->kind != column_kind::regular_column) {
+        if ((schema->is_dense() || !schema->thrift().has_compound_comparator()) && cd->is_primary_key()) {
            throw exceptions::invalid_request_exception(
                    "Secondary indexes are not supported on PRIMARY KEY columns in COMPACT STORAGE tables");
        }
@@ -137,10 +146,15 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c

        bool is_map = dynamic_cast<const collection_type_impl *>(cd->type.get()) != nullptr
                      && dynamic_cast<const collection_type_impl *>(cd->type.get())->is_map();
-        bool is_frozen_collection = cd->type->is_collection() && !cd->type->is_multi_cell();
+        bool is_collection = cd->type->is_collection();
+        bool is_frozen_collection = is_collection && !cd->type->is_multi_cell();

        if (is_frozen_collection) {
            validate_for_frozen_collection(target);
+        } else if (is_collection) {
+            // NOTICE(sarna): should be lifted after #2962 (indexes on non-frozen collections) is implemented
+            throw exceptions::invalid_request_exception(
+                    sprint("Cannot create secondary index on non-frozen collection column %s", cd->name_as_text()));
        } else {
            validate_not_full_index(target);
            validate_is_values_index_if_target_column_not_collection(cd, target);
--- a/cql3/statements/create_view_statement.cc
+++ b/cql3/statements/create_view_statement.cc
@@ -84,7 +84,6 @@ create_view_statement::create_view_statement(
    , _clustering_keys{clustering_keys}
    , _if_not_exists{if_not_exists}
 {
-    service::get_local_storage_proxy().get_db().local().get_config().check_experimental("Creating materialized views");
    if (!service::get_local_storage_service().cluster_supports_materialized_views()) {
        throw exceptions::invalid_request_exception("Can't create materialized views until the whole cluster has been upgraded");
    }
@@ -275,6 +274,7 @@ future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::a

    std::vector<const column_definition*> missing_pk_columns;
    std::vector<const column_definition*> target_non_pk_columns;
+    std::vector<const column_definition*> unselected_columns;

    // We need to include all of the primary key columns from the base table in order to make sure that we do not
    // overwrite values in the view. We cannot support "collapsing" the base table into a smaller number of rows in
@@ -292,6 +292,9 @@ future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::a
        if (included_def && !def_in_target_pk) {
            target_non_pk_columns.push_back(&def);
        }
+        if (!included_def && !def_in_target_pk && !def.is_static()) {
+            unselected_columns.push_back(&def);
+        }
        if (def.is_primary_key() && !def_in_target_pk) {
            missing_pk_columns.push_back(&def);
        }
@@ -311,6 +314,27 @@ future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::a
        throw exceptions::invalid_request_exception(sprint("No columns are defined for Materialized View other than primary key"));
    }

+    // The unique feature of a filter by a non-key column is that the
+    // value of such column can be updated - and also be expired with TTL
+    // and cause the view row to appear and disappear. We don't currently
+    // support this case - see issue #3430, and neither does
+    // Cassandra - see CASSANDRA-13798 and CASSANDRA-13832.
+    // Actually, as CASSANDRA-13798 explains, the problem is "the liveness of
+    // view row is now depending on multiple base columns (multiple filtered
+    // non-pk base column + base column used in view pk)". When the filtered
+    // column *is* the base column added to the view pk, we don't have this
+    // problem. And this case actually works correctly.
+    auto non_pk_restrictions = restrictions->get_non_pk_restriction();
+    if (non_pk_restrictions.size() == 1 && has_non_pk_column &&
+            std::find(target_primary_keys.begin(), target_primary_keys.end(), non_pk_restrictions.cbegin()->first) != target_primary_keys.end()) {
+        // This case (filter by new PK column of the view) works, as explained above
+    } else if (!non_pk_restrictions.empty()) {
+        auto column_names = ::join(", ", non_pk_restrictions | boost::adaptors::map_keys | boost::adaptors::transformed(std::mem_fn(&column_definition::name_as_text)));
+        throw exceptions::invalid_request_exception(sprint(
+                "Non-primary key columns cannot be restricted in the SELECT statement used for materialized view %s creation (got restrictions on: %s)",
+                column_family(), column_names));
+    }
+
    schema_builder builder{keyspace(), column_family()};
    auto add_columns = [this, &builder] (std::vector<const column_definition*>& defs, column_kind kind) mutable {
        for (auto* def : defs) {
@@ -321,6 +345,19 @@ future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::a
    add_columns(target_partition_keys, column_kind::partition_key);
    add_columns(target_clustering_keys, column_kind::clustering_key);
    add_columns(target_non_pk_columns, column_kind::regular_column);
+    // Add all unselected columns (base-table columns which are not selected
+    // in the view) as "virtual columns" - columns which have timestamp and
+    // ttl information, but an empty value. These are needed to keep view
+    // rows alive when the base row is alive, even if the view row has no
+    // data, just a key (see issue #3362). The virtual columns are not needed
+    // when the view pk adds a regular base column (i.e., has_non_pk_column)
+    // because in that case, the liveness of that base column is what
+    // determines the liveness of the view row.
+    if (!has_non_pk_column) {
+        for (auto* def : unselected_columns) {
+            db::view::create_virtual_column(builder, def->name(), def->type);
+        }
+    }
    _properties.properties()->apply_to_builder(builder, proxy.get_db().local().get_config().extensions());

    if (builder.default_time_to_live().count() > 0) {
--- a/cql3/statements/index_prop_defs.cc
+++ b/cql3/statements/index_prop_defs.cc
@@ -49,7 +49,7 @@ void cql3::statements::index_prop_defs::validate() {
    property_definitions::validate(keywords);

    if (is_custom && !custom_class) {
-        throw exceptions::invalid_request_exception("CUSTOM index requires specifiying the index class");
+        throw exceptions::invalid_request_exception("CUSTOM index requires specifying the index class");
    }

    if (!is_custom && custom_class) {
@@ -64,6 +64,16 @@ void cql3::statements::index_prop_defs::validate() {
                sprint("Cannot specify %s as a CUSTOM option",
                        db::index::secondary_index::custom_index_option_name));
    }
+
+    // Currently, Scylla does not support *any* class of custom index
+    // implementation. If in the future we do (e.g., SASI, or something
+    // new), we'll need to check for valid values here.
+    if (is_custom && custom_class) {
+        throw exceptions::invalid_request_exception(
+                format("Unsupported CUSTOM INDEX class {}. Note that currently, Scylla does not support SASI or any other CUSTOM INDEX class.",
+                        *custom_class));
+
+    }
 }

 index_options_map
--- a/cql3/statements/modification_statement.cc
+++ b/cql3/statements/modification_statement.cc
@@ -160,11 +160,11 @@ future<> modification_statement::check_access(const service::client_state& state
 }

 future<std::vector<mutation>>
-modification_statement::get_mutations(service::storage_proxy& proxy, const query_options& options, bool local, int64_t now, tracing::trace_state_ptr trace_state) {
+modification_statement::get_mutations(service::storage_proxy& proxy, const query_options& options, db::timeout_clock::time_point timeout, bool local, int64_t now, tracing::trace_state_ptr trace_state) {
    auto json_cache = maybe_prepare_json_cache(options);
    auto keys = make_lw_shared(build_partition_keys(options, json_cache));
    auto ranges = make_lw_shared(create_clustering_ranges(options, json_cache));
-    return make_update_parameters(proxy, keys, ranges, options, local, now, std::move(trace_state)).then(
+    return make_update_parameters(proxy, keys, ranges, options, timeout, local, now, std::move(trace_state)).then(
            [this, keys, ranges, now, json_cache = std::move(json_cache)] (auto params_ptr) {
                std::vector<mutation> mutations;
                mutations.reserve(keys->size());
@@ -186,10 +186,11 @@ modification_statement::make_update_parameters(
        lw_shared_ptr<dht::partition_range_vector> keys,
        lw_shared_ptr<query::clustering_row_ranges> ranges,
        const query_options& options,
+        db::timeout_clock::time_point timeout,
        bool local,
        int64_t now,
        tracing::trace_state_ptr trace_state) {
-    return read_required_rows(proxy, *keys, std::move(ranges), local, options, std::move(trace_state)).then(
+    return read_required_rows(proxy, *keys, std::move(ranges), local, options, timeout, std::move(trace_state)).then(
            [this, &options, now] (auto rows) {
                return make_ready_future<std::unique_ptr<update_parameters>>(
                        std::make_unique<update_parameters>(s, options,
@@ -275,6 +276,7 @@ modification_statement::read_required_rows(
        lw_shared_ptr<query::clustering_row_ranges> ranges,
        bool local,
        const query_options& options,
+        db::timeout_clock::time_point timeout,
        tracing::trace_state_ptr trace_state) {
    if (!requires_read()) {
        return make_ready_future<update_parameters::prefetched_rows_type>(
@@ -308,7 +310,6 @@ modification_statement::read_required_rows(
                query::partition_slice::option::collections_as_maps>());
    query::read_command cmd(s->id(), s->version(), ps, std::numeric_limits<uint32_t>::max());
    // FIXME: ignoring "local"
-    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
    return proxy.query(s, make_lw_shared(std::move(cmd)), std::move(keys),
            cl, {timeout, std::move(trace_state)}).then([this, ps] (auto qr) {
        return query::result_view::do_with(*qr.query_result, [&] (query::result_view v) {
@@ -408,12 +409,13 @@ modification_statement::execute_without_condition(service::storage_proxy& proxy,
        db::validate_for_write(s->ks_name(), cl);
    }

-    return get_mutations(proxy, options, false, options.get_timestamp(qs), qs.get_trace_state()).then([this, cl, &proxy, &qs] (auto mutations) {
+    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
+    return get_mutations(proxy, options, timeout, false, options.get_timestamp(qs), qs.get_trace_state()).then([this, cl, timeout, &proxy, &qs] (auto mutations) {
        if (mutations.empty()) {
            return now();
        }

-        return proxy.mutate_with_triggers(std::move(mutations), cl, false, qs.get_trace_state(), this->is_raw_counter_shard_write());
+        return proxy.mutate_with_triggers(std::move(mutations), cl, timeout, false, qs.get_trace_state(), this->is_raw_counter_shard_write());
    });
 }

--- a/cql3/statements/modification_statement.hh
+++ b/cql3/statements/modification_statement.hh
@@ -206,6 +206,7 @@ protected:
                lw_shared_ptr<query::clustering_row_ranges> ranges,
                bool local,
                const query_options& options,
+                db::timeout_clock::time_point now,
                tracing::trace_state_ptr trace_state);
 private:
    future<::shared_ptr<cql_transport::messages::result_message>>
@@ -349,7 +350,7 @@ public:
     * @return vector of the mutations
     * @throws invalid_request_exception on invalid requests
     */
-    future<std::vector<mutation>> get_mutations(service::storage_proxy& proxy, const query_options& options, bool local, int64_t now, tracing::trace_state_ptr trace_state);
+    future<std::vector<mutation>> get_mutations(service::storage_proxy& proxy, const query_options& options, db::timeout_clock::time_point timeout, bool local, int64_t now, tracing::trace_state_ptr trace_state);

 public:
    future<std::unique_ptr<update_parameters>> make_update_parameters(
@@ -357,6 +358,7 @@ public:
                lw_shared_ptr<dht::partition_range_vector> keys,
                lw_shared_ptr<query::clustering_row_ranges> ranges,
                const query_options& options,
+                db::timeout_clock::time_point timeout,
                bool local,
                int64_t now,
                tracing::trace_state_ptr trace_state);
--- a/cql3/statements/raw/insert_statement.hh
+++ b/cql3/statements/raw/insert_statement.hh
@@ -87,6 +87,7 @@ private:
    ::shared_ptr<attributes::raw> _attrs;
    ::shared_ptr<term::raw> _json_value;
    bool _if_not_exists;
+    bool _default_unset;
 public:
    /**
     * A parsed <code>INSERT JSON</code> statement.
@@ -95,7 +96,7 @@ public:
     * @param json_value JSON string representing names and values
     * @param attrs additional attributes for statement (CL, timestamp, timeToLive)
     */
-    insert_json_statement(::shared_ptr<cf_name> name, ::shared_ptr<attributes::raw> attrs, ::shared_ptr<term::raw> json_value, bool if_not_exists);
+    insert_json_statement(::shared_ptr<cf_name> name, ::shared_ptr<attributes::raw> attrs, ::shared_ptr<term::raw> json_value, bool if_not_exists, bool default_unset);

    virtual ::shared_ptr<cql3::statements::modification_statement> prepare_internal(database& db, schema_ptr schema,
                ::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs, cql_stats& stats) override;
--- a/cql3/statements/raw/select_statement.hh
+++ b/cql3/statements/raw/select_statement.hh
@@ -141,6 +141,10 @@ private:
    /** If ALLOW FILTERING was not specified, this verifies that it is not needed */
    void check_needs_filtering(::shared_ptr<restrictions::statement_restrictions> restrictions);

+    void ensure_filtering_columns_retrieval(database& db,
+                                            ::shared_ptr<selection::selection> selection,
+                                            ::shared_ptr<restrictions::statement_restrictions> restrictions);
+
    bool contains_alias(::shared_ptr<column_identifier> name);

    ::shared_ptr<column_specification> limit_receiver();
--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -45,6 +45,7 @@
 #include "transport/messages/result_message.hh"
 #include "cql3/selection/selection.hh"
 #include "cql3/util.hh"
+#include "cql3/restrictions/single_column_primary_key_restrictions.hh"
 #include "core/shared_ptr.hh"
 #include "query-result-reader.hh"
 #include "query_result_merger.hh"
@@ -96,12 +97,8 @@ public:
                encoded_row.write("\\\"", 2);
            }
            encoded_row.write("\": ", 3);
-            if (parameters[i]) {
-                sstring row_sstring = _selector_types[i]->to_json_string(parameters[i].value());
-                encoded_row.write(row_sstring.c_str(), row_sstring.size());
-            } else {
-                encoded_row.write("null", 4);
-            }
+            sstring row_sstring = _selector_types[i]->to_json_string(parameters[i]);
+            encoded_row.write(row_sstring.c_str(), row_sstring.size());
        }
        encoded_row.write("}", 1);
        return encoded_row.linearize().to_string();
@@ -316,13 +313,14 @@ select_statement::make_partition_slice(const query_options& options)
    if (_is_reversed) {
        _opts.set(query::partition_slice::option::reversed);
        std::reverse(bounds.begin(), bounds.end());
+        ++_stats.reverse_queries;
    }
    return query::partition_slice(std::move(bounds),
        std::move(static_columns), std::move(regular_columns), _opts, nullptr, options.get_cql_serialization_format());
 }

 int32_t select_statement::get_limit(const query_options& options) const {
-    if (!_limit) {
+    if (!_limit || _selection->is_aggregate()) {
        return std::numeric_limits<int32_t>::max();
    }

@@ -333,9 +331,10 @@ int32_t select_statement::get_limit(const query_options& options) const {
    if (val.is_unset_value()) {
        return std::numeric_limits<int32_t>::max();
    }
+  return with_linearized(*val, [&] (bytes_view bv) {
    try {
-        int32_type->validate(*val);
-        auto l = value_cast<int32_t>(int32_type->deserialize(*val));
+        int32_type->validate(bv);
+        auto l = value_cast<int32_t>(int32_type->deserialize(bv));
        if (l <= 0) {
            throw exceptions::invalid_request_exception("LIMIT must be strictly positive");
        }
@@ -343,6 +342,7 @@ int32_t select_statement::get_limit(const query_options& options) const {
    } catch (const marshal_exception& e) {
        throw exceptions::invalid_request_exception("Invalid limit value");
    }
+  });
 }

 bool select_statement::needs_post_query_ordering() const {
@@ -383,48 +383,55 @@ select_statement::do_execute(service::storage_proxy& proxy,
    int32_t limit = get_limit(options);
    auto now = gc_clock::now();

+    const bool restrictions_need_filtering = _restrictions->need_filtering();
    ++_stats.reads;
-    _stats.filtered_reads += _restrictions->need_filtering();
+    _stats.filtered_reads += restrictions_need_filtering;

    auto command = ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(),
        make_partition_slice(options), limit, now, tracing::make_trace_info(state.get_trace_state()), query::max_partitions, utils::UUID(), options.get_timestamp(state));

    int32_t page_size = options.get_page_size();

+    _stats.unpaged_select_queries += page_size <= 0;
+
    // An aggregation query will never be paged for the user, but we always page it internally to avoid OOM.
    // If we user provided a page_size we'll use that to page internally (because why not), otherwise we use our default
    // Note that if there are some nodes in the cluster with a version less than 2.0, we can't use paging (CASSANDRA-6707).
-    auto aggregate = _selection->is_aggregate();
-    if (aggregate && page_size <= 0) {
+    const bool aggregate = _selection->is_aggregate();
+    const bool nonpaged_filtering = restrictions_need_filtering && page_size <= 0;
+    if (aggregate || nonpaged_filtering) {
        page_size = DEFAULT_COUNT_PAGE_SIZE;
    }

    auto key_ranges = _restrictions->get_partition_key_ranges(options);

-    if (!aggregate && (page_size <= 0
-            || !service::pager::query_pagers::may_need_paging(page_size,
+    if (!aggregate && !restrictions_need_filtering && (page_size <= 0
+            || !service::pager::query_pagers::may_need_paging(*_schema, page_size,
                    *command, key_ranges))) {
        return execute(proxy, command, std::move(key_ranges), state, options, now);
    }

    command->slice.options.set<query::partition_slice::option::allow_short_read>();
-    auto timeout = options.get_timeout_config().*get_timeout_config_selector();
+    auto timeout_duration = options.get_timeout_config().*get_timeout_config_selector();
    auto p = service::pager::query_pagers::pager(_schema, _selection,
-            state, options, timeout, command, std::move(key_ranges), _stats, _restrictions->need_filtering() ? _restrictions : nullptr);
+            state, options, command, std::move(key_ranges), _stats, restrictions_need_filtering ? _restrictions : nullptr);

-    if (aggregate) {
+    if (aggregate || nonpaged_filtering) {
        return do_with(
                cql3::selection::result_set_builder(*_selection, now,
                        options.get_cql_serialization_format()),
-                [this, p, page_size, now](auto& builder) {
+                [this, p, page_size, now, timeout_duration, restrictions_need_filtering](auto& builder) {
                    return do_until([p] {return p->is_exhausted();},
-                            [p, &builder, page_size, now] {
-                                return p->fetch_page(builder, page_size, now);
+                            [p, &builder, page_size, now, timeout_duration] {
+                                auto timeout = db::timeout_clock::now() + timeout_duration;
+                                return p->fetch_page(builder, page_size, now, timeout);
                            }
-                    ).then([this, &builder] {
+                    ).then([this, &builder, restrictions_need_filtering] {
                                auto rs = builder.build();
+                                if (restrictions_need_filtering) {
+                                    _stats.filtered_rows_matched_total += rs->size();
+                                }
                                update_stats_rows_read(rs->size());
-                                _stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
                                auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
                                return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
                            });
@@ -437,12 +444,18 @@ select_statement::do_execute(service::storage_proxy& proxy,
                        " you must either remove the ORDER BY or the IN and sort client side, or disable paging for this query");
    }

-    if (_selection->is_trivial() && !_restrictions->need_filtering()) {
-        return p->fetch_page_generator(page_size, now, _stats).then([this, p, limit] (result_generator generator) {
-            auto meta = make_shared<metadata>(*_selection->get_result_metadata());
-            if (!p->is_exhausted()) {
-                meta->set_has_more_pages(p->state());
-            }
+    auto timeout = db::timeout_clock::now() + timeout_duration;
+    if (_selection->is_trivial() && !restrictions_need_filtering) {
+        return p->fetch_page_generator(page_size, now, timeout, _stats).then([this, p, limit] (result_generator generator) {
+            auto meta = [&] () -> shared_ptr<const cql3::metadata> {
+                if (!p->is_exhausted()) {
+                    auto meta = make_shared<metadata>(*_selection->get_result_metadata());
+                    meta->set_paging_state(p->state());
+                    return meta;
+                } else {
+                    return _selection->get_result_metadata();
+                }
+            }();

            return shared_ptr<cql_transport::messages::result_message>(
                make_shared<cql_transport::messages::result_message::rows>(result(std::move(generator), std::move(meta)))
@@ -450,20 +463,201 @@ select_statement::do_execute(service::storage_proxy& proxy,
        });
    }

-    return p->fetch_page(page_size, now).then(
-            [this, p, &options, limit, now](std::unique_ptr<cql3::result_set> rs) {
+    return p->fetch_page(page_size, now, timeout).then(
+            [this, p, &options, now, restrictions_need_filtering](std::unique_ptr<cql3::result_set> rs) {

                if (!p->is_exhausted()) {
-                    rs->get_metadata().set_has_more_pages(p->state());
+                    rs->get_metadata().set_paging_state(p->state());
                }

+                if (restrictions_need_filtering) {
+                    _stats.filtered_rows_matched_total += rs->size();
+                }
                update_stats_rows_read(rs->size());
-                _stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
                auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
                return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
            });
 }

+template<typename KeyType> // Rebuilds a base-table key of kind KeyType from the partition and clustering keys of an index-view row.
+GCC6_CONCEPT(
+    requires (std::is_same_v<KeyType, partition_key> || std::is_same_v<KeyType, clustering_key_prefix>)
+)
+static KeyType
+generate_base_key_from_index_pk(const partition_key& index_pk, const clustering_key& index_ck, const schema& base_schema, const schema& view_schema) {
+    const auto& base_columns = std::is_same_v<KeyType, partition_key> ? base_schema.partition_key_columns() : base_schema.clustering_key_columns(); // base key columns matching KeyType
+    std::vector<bytes_view> exploded_base_key;
+    exploded_base_key.reserve(base_columns.size());
+    // Walk the base key columns in order, fetching each value through its counterpart column in the view schema.
+    for (const column_definition& base_col : base_columns) {
+        const column_definition* view_col = view_schema.view_info()->view_column(base_col); // NOTE(review): dereferenced unchecked below; assumes the view contains every base key column - confirm
+        if (view_col->is_partition_key()) {
+            exploded_base_key.push_back(index_pk.get_component(view_schema, view_col->id)); // value is held in the index row's partition key
+        } else {
+            exploded_base_key.push_back(index_ck.get_component(view_schema, view_col->id)); // value is held in the index row's clustering key
+        }
+    }
+    return KeyType::from_range(exploded_base_key);
+}
+
+lw_shared_ptr<query::read_command> // Builds the read command used to query the base table (_schema) on behalf of an indexed SELECT.
+indexed_table_select_statement::prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging) {
+    lw_shared_ptr<query::read_command> cmd = ::make_lw_shared<query::read_command>(
+            _schema->id(),
+            _schema->version(),
+            make_partition_slice(options),
+            get_limit(options), // row limit of the command
+            now,
+            tracing::make_trace_info(state.get_trace_state()),
+            query::max_partitions, // no partition limit beyond the global maximum
+            utils::UUID(),
+            options.get_timestamp(state));
+    if (use_paging) { // paged reads may be cut short and ask for keys to be sent back - presumably so the next paging position can be derived from the last row
+        cmd->slice.options.set<query::partition_slice::option::allow_short_read>();
+        cmd->slice.options.set<query::partition_slice::option::send_partition_key>();
+        if (_schema->clustering_key_size() > 0) { // clustering keys are requested only when the base table has clustering columns
+            cmd->slice.options.set<query::partition_slice::option::send_clustering_key>();
+        }
+    }
+    return cmd;
+}
+
+future<shared_ptr<cql_transport::messages::result_message>> // Reads the base table for the partition ranges obtained from the index, batch by batch.
+indexed_table_select_statement::execute_base_query(
+        service::storage_proxy& proxy,
+        dht::partition_range_vector&& partition_ranges,
+        service::query_state& state,
+        const query_options& options,
+        gc_clock::time_point now,
+        ::shared_ptr<const service::pager::paging_state> paging_state) {
+    auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state));
+    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector(); // one deadline shared by every batch
+    dht::partition_range_vector per_vnode_ranges;
+    per_vnode_ranges.reserve(partition_ranges.size());
+    for (auto& pr : partition_ranges) { // flatten the restricted sub-ranges of every input range into a single list
+        auto restricted_ranges = proxy.get_restricted_ranges(*_schema, pr);
+        std::move(restricted_ranges.begin(), restricted_ranges.end(), std::back_inserter(per_vnode_ranges));
+    }
+    // State that must outlive the asynchronous loop: the merged result, the ranges, and the loop position.
+    struct base_query_state {
+        query::result_merger merger;
+        dht::partition_range_vector per_vnode_ranges;
+        dht::partition_range_vector::iterator current_partition_range;
+        base_query_state(uint32_t row_limit, dht::partition_range_vector&& ranges)
+                : merger(row_limit * ranges.size(), query::max_partitions) // NOTE(review): product is computed in size_t; confirm result_merger's limit type cannot truncate it
+                , per_vnode_ranges(std::move(ranges))
+                , current_partition_range(per_vnode_ranges.begin())
+                {}
+        base_query_state(base_query_state&&) = default;
+        base_query_state(const base_query_state&) = delete;
+    };
+
+    base_query_state query_state{cmd->row_limit, std::move(per_vnode_ranges)};
+    return do_with(std::move(query_state), [this, &proxy, &state, &options, cmd, timeout] (auto&& query_state) {
+        auto &merger = query_state.merger;
+        auto &ranges = query_state.per_vnode_ranges;
+        auto &range_it = query_state.current_partition_range;
+        return repeat([this, &ranges, &range_it, &merger, &proxy, &state, &options, cmd, timeout]() {
+            // Batches grow exponentially (1, 2, 4, ... ranges) until a short read or the end of the list.
+            // The batch size is clamped before advancing, so the iterator never moves past ranges.end().
+            auto range_it_end = range_it + std::min(std::distance(ranges.begin(), range_it) + 1, std::distance(range_it, ranges.end()));
+            dht::partition_range_vector prange(range_it, range_it_end);
+            auto command = ::make_lw_shared<query::read_command>(*cmd);
+            auto old_paging_state = options.get_paging_state();
+            if (old_paging_state && range_it == ranges.begin()) { // only the first batch is narrowed using the previous page's position
+                auto base_pk = generate_base_key_from_index_pk<partition_key>(old_paging_state->get_partition_key(),
+                        *old_paging_state->get_clustering_key(), *_schema, *_view_schema); // NOTE(review): clustering key is dereferenced unchecked - confirm it is always set here
+                auto base_ck = generate_base_key_from_index_pk<clustering_key>(old_paging_state->get_partition_key(),
+                        *old_paging_state->get_clustering_key(), *_schema, *_view_schema);
+                command->slice.set_range(*_schema, base_pk,
+                        std::vector<query::clustering_range>{query::clustering_range::make_starting_with(range_bound<clustering_key>(base_ck, false))});
+            }
+            return proxy.query(_schema, command, std::move(prange), options.get_consistency(), {timeout, state.get_trace_state()})
+            .then([&range_it, range_it_end = std::move(range_it_end), &ranges, &merger] (service::storage_proxy::coordinator_query_result qr) {
+                bool is_short_read = qr.query_result->is_short_read(); // read before the result is moved into the merger
+                merger(std::move(qr.query_result));
+                range_it = range_it_end;
+                return stop_iteration(is_short_read || range_it == ranges.end());
+            });
+        }).then([&merger]() {
+            return merger.get();
+        });
+    }).then([this, &proxy, &state, &options, now, cmd, paging_state = std::move(paging_state)] (foreign_ptr<lw_shared_ptr<query::result>> result) mutable {
+        return this->process_base_query_results(std::move(result), cmd, proxy, state, options, now, std::move(paging_state));
+    });
+}
+
+// Function for fetching the selected columns from a list of clustering rows.
+// It is currently used only in our Secondary Index implementation - ordinary
+// CQL SELECT statements do not have the syntax to request a list of rows.
+// FIXME: The current implementation is very inefficient - it requests each
+// row separately (and, incrementally, in parallel). Even multiple rows from a single
+// partition are requested separately. This last case can be easily improved,
+// but to implement the general case (multiple rows from multiple partitions)
+// efficiently, we will need more support from other layers.
+// Keys are ordered in token order (see #3423)
+future<shared_ptr<cql_transport::messages::result_message>>
+indexed_table_select_statement::execute_base_query(
+        service::storage_proxy& proxy,
+        std::vector<primary_key>&& primary_keys,
+        service::query_state& state,
+        const query_options& options,
+        gc_clock::time_point now,
+        ::shared_ptr<const service::pager::paging_state> paging_state) {
+    auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state));
+    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector(); // one deadline shared by every batch
+    // State that must outlive the asynchronous loop: the merged result, the keys, and the loop position.
+    struct base_query_state {
+        query::result_merger merger;
+        std::vector<primary_key> primary_keys;
+        std::vector<primary_key>::iterator current_primary_key;
+        base_query_state(uint32_t row_limit, std::vector<primary_key>&& keys)
+                : merger(row_limit, query::max_partitions)
+                , primary_keys(std::move(keys))
+                , current_primary_key(primary_keys.begin())
+                {}
+        base_query_state(base_query_state&&) = default;
+        base_query_state(const base_query_state&) = delete;
+    };
+
+    base_query_state query_state{cmd->row_limit, std::move(primary_keys)};
+    return do_with(std::move(query_state), [this, &proxy, &state, &options, cmd, timeout] (auto&& query_state) {
+        auto &merger = query_state.merger;
+        auto &keys = query_state.primary_keys;
+        auto &key_it = query_state.current_primary_key;
+        return repeat([this, &keys, &key_it, &merger, &proxy, &state, &options, cmd, timeout]() {
+            // Batches grow exponentially (1, 2, 4, ... keys) until a short read or the end of the list.
+            // The batch size is clamped before advancing, so the iterator never moves past keys.end().
+            auto key_it_end = key_it + std::min(std::distance(keys.begin(), key_it) + 1, std::distance(key_it, keys.end()));
+            // Each key below gets its own copy of cmd, so no command is built at batch level.
+
+            query::result_merger oneshot_merger(cmd->row_limit, query::max_partitions);
+            return map_reduce(key_it, key_it_end, [this, &proxy, &state, &options, cmd, timeout] (auto& key) {
+                auto command = ::make_lw_shared<query::read_command>(*cmd);
+                // for each partition, read just one clustering row (TODO: can
+                // get all needed rows of one partition at once.)
+                command->slice._row_ranges.clear();
+                if (key.clustering) {
+                    command->slice._row_ranges.push_back(query::clustering_range::make_singular(key.clustering));
+                }
+                return proxy.query(_schema, command, {dht::partition_range::make_singular(key.partition)}, options.get_consistency(), {timeout, state.get_trace_state()})
+                .then([] (service::storage_proxy::coordinator_query_result qr) {
+                    return std::move(qr.query_result);
+                });
+            }, std::move(oneshot_merger)).then([&key_it, key_it_end = std::move(key_it_end), &keys, &merger] (foreign_ptr<lw_shared_ptr<query::result>> result) {
+                bool is_short_read = result->is_short_read(); // read before the result is moved into the merger
+                merger(std::move(result));
+                key_it = key_it_end;
+                return stop_iteration(is_short_read || key_it == keys.end());
+            });
+        }).then([&merger] () {
+            return merger.get();
+        });
+    }).then([this, &proxy, &state, &options, now, cmd, paging_state = std::move(paging_state)] (foreign_ptr<lw_shared_ptr<query::result>> result) mutable {
+        return this->process_base_query_results(std::move(result), cmd, proxy, state, options, now, std::move(paging_state));
+    });
+}
+
 future<shared_ptr<cql_transport::messages::result_message>>
 select_statement::execute(service::storage_proxy& proxy,
                          lw_shared_ptr<query::read_command> cmd,
@@ -503,52 +697,21 @@ select_statement::execute(service::storage_proxy& proxy,
    }
 }

-// Function for fetching the selected columns from a list of clustering rows.
-// It is currently used only in our Secondary Index implementation - ordinary
-// CQL SELECT statements do not have the syntax to request a list of rows.
-// FIXME: The current implementation is very inefficient - it requests each
-// row separately (and all in parallel). Even multiple rows from a single
-// partition are requested separately. This last case can be easily improved,
-// but to implement the general case (multiple rows from multiple partitions)
-// efficiently, we will need more support from other layers.
-// Keys are ordered in token order (see #3423)
-future<shared_ptr<cql_transport::messages::result_message>>
-select_statement::execute(service::storage_proxy& proxy,
-                          lw_shared_ptr<query::read_command> cmd,
-                          std::vector<primary_key>&& primary_keys,
-                          service::query_state& state,
-                          const query_options& options,
-                          gc_clock::time_point now)
+shared_ptr<cql_transport::messages::result_message>
+indexed_table_select_statement::process_base_query_results(
+        foreign_ptr<lw_shared_ptr<query::result>> results,
+        lw_shared_ptr<query::read_command> cmd,
+        service::storage_proxy& proxy,
+        service::query_state& state,
+        const query_options& options,
+        gc_clock::time_point now,
+        ::shared_ptr<const service::pager::paging_state> paging_state)
 {
-    // FIXME: pass the timeout from caller. The query has already started
-    // earlier (with read_posting_list()), not now.
-    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
-    return do_with(std::move(primary_keys), [this, &proxy, &state, &options, cmd, timeout] (auto& keys) {
-        assert(cmd->partition_limit == query::max_partitions);
-        query::result_merger merger(cmd->row_limit, query::max_partitions);
-        // there is no point to produce rows beyond the first row_limit:
-        auto end = keys.size() <= cmd->row_limit ? keys.end() : keys.begin() + cmd->row_limit;
-        return map_reduce(keys.begin(), end, [this, &proxy, &state, &options, cmd, timeout] (auto& key) {
-            auto command = ::make_lw_shared<query::read_command>(*cmd);
-            // for each partition, read just one clustering row (TODO: can
-            // get all needed rows of one partition at once.)
-            command->slice._row_ranges.clear();
-            if (key.clustering) {
-                command->slice._row_ranges.push_back(query::clustering_range::make_singular(key.clustering));
-            }
-            return proxy.query(_schema,
-                    command,
-                    {dht::partition_range::make_singular(key.partition)},
-                    options.get_consistency(),
-                    {timeout, state.get_trace_state()}).then([] (service::storage_proxy::coordinator_query_result qr) {
-                return std::move(qr.query_result);
-            });
-        }, std::move(merger));
-    }).then([this, &options, now, cmd] (auto result) {
-        // note that cmd here still has the garbage clustering range in slice,
-        // but process_results() ignores this part of the slice setting.
-        return this->process_results(std::move(result), cmd, options, now);
-    });
+    if (paging_state) {
+        paging_state = generate_view_paging_state_from_base_query_results(paging_state, results, proxy, state, options);
+        _selection->get_result_metadata()->maybe_set_paging_state(std::move(paging_state));
+    }
+    return process_results(std::move(results), std::move(cmd), options, now);
 }

 shared_ptr<cql_transport::messages::result_message>
@@ -557,7 +720,8 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
                                  const query_options& options,
                                  gc_clock::time_point now)
 {
-    bool fast_path = !needs_post_query_ordering() && _selection->is_trivial() && !_restrictions->need_filtering();
+    const bool restrictions_need_filtering = _restrictions->need_filtering();
+    const bool fast_path = !needs_post_query_ordering() && _selection->is_trivial() && !restrictions_need_filtering;
    if (fast_path) {
        return make_shared<cql_transport::messages::result_message::rows>(result(
            result_generator(_schema, std::move(results), std::move(cmd), _selection, _stats),
@@ -567,12 +731,12 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu

    cql3::selection::result_set_builder builder(*_selection, now,
            options.get_cql_serialization_format());
-    if (_restrictions->need_filtering()) {
+    if (restrictions_need_filtering) {
        results->ensure_counts();
        _stats.filtered_rows_read_total += *results->row_count();
        query::result_view::consume(*results, cmd->slice,
                cql3::selection::result_set_builder::visitor(builder, *_schema,
-                        *_selection, cql3::selection::result_set_builder::restrictions_filter(_restrictions)));
+                        *_selection, cql3::selection::result_set_builder::restrictions_filter(_restrictions, options, cmd->row_limit)));
    } else {
        query::result_view::consume(*results, cmd->slice,
                cql3::selection::result_set_builder::visitor(builder, *_schema,
@@ -588,7 +752,7 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
        rs->trim(cmd->row_limit);
    }
    update_stats_rows_read(rs->size());
-    _stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
+    _stats.filtered_rows_matched_total += restrictions_need_filtering ? rs->size() : 0;
    return ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
 }

@@ -617,10 +781,16 @@ indexed_table_select_statement::prepare(database& db,
                                        ordering_comparator_type ordering_comparator,
                                        ::shared_ptr<term> limit, cql_stats &stats)
 {
-    auto index_opt = find_idx(db, schema, restrictions);
+    auto& sim = db.find_column_family(schema).get_index_manager();
+    auto index_opt = restrictions->find_idx(sim);
    if (!index_opt) {
        throw std::runtime_error("No index found.");
    }
+
+    const auto& im = index_opt->metadata();
+    sstring index_table_name = im.name() + "_index";
+    schema_ptr view_schema = db.find_schema(schema->ks_name(), index_table_name);
+
    return ::make_shared<cql3::statements::indexed_table_select_statement>(
            schema,
            bound_terms,
@@ -631,28 +801,11 @@ indexed_table_select_statement::prepare(database& db,
            std::move(ordering_comparator),
            limit,
            stats,
-            *index_opt);
+            *index_opt,
+            view_schema);

 }

-
-stdx::optional<secondary_index::index> indexed_table_select_statement::find_idx(database& db,
-                                                                                schema_ptr schema,
-                                                                                ::shared_ptr<restrictions::statement_restrictions> restrictions)
-{
-    auto& sim = db.find_column_family(schema).get_index_manager();
-    for (::shared_ptr<cql3::restrictions::restrictions> restriction : restrictions->index_restrictions()) {
-        for (const auto& cdef : restriction->get_column_defs()) {
-            for (auto index : sim.list_indexes()) {
-                if (index.depends_on(*cdef)) {
-                    return stdx::make_optional<secondary_index::index>(std::move(index));
-                }
-            }
-        }
-    }
-    return stdx::nullopt;
-}
-
 indexed_table_select_statement::indexed_table_select_statement(schema_ptr schema, uint32_t bound_terms,
                                                           ::shared_ptr<parameters> parameters,
                                                           ::shared_ptr<selection::selection> selection,
@@ -660,16 +813,74 @@ indexed_table_select_statement::indexed_table_select_statement(schema_ptr schema
                                                           bool is_reversed,
                                                           ordering_comparator_type ordering_comparator,
                                                           ::shared_ptr<term> limit, cql_stats &stats,
-                                                           const secondary_index::index& index)
+                                                           const secondary_index::index& index,
+                                                           schema_ptr view_schema)
    : select_statement{schema, bound_terms, parameters, selection, restrictions, is_reversed, ordering_comparator, limit, stats}
    , _index{index}
+    , _view_schema(view_schema)
 {}

+template<typename KeyType> // Appends base_key's components to exploded_index_ck, leaving out the indexed column's own component when it belongs to this key.
+GCC6_CONCEPT(
+    requires (std::is_same_v<KeyType, partition_key> || std::is_same_v<KeyType, clustering_key_prefix>)
+)
+static void append_base_key_to_index_ck(std::vector<bytes_view>& exploded_index_ck, const KeyType& base_key, const column_definition& index_cdef) {
+    auto key_view = base_key.view();
+    auto begin = key_view.begin();
+    if ((std::is_same_v<KeyType, partition_key> && index_cdef.is_partition_key())
+            || (std::is_same_v<KeyType, clustering_key_prefix> && index_cdef.is_clustering_key())) { // the indexed column is part of this kind of key
+        auto key_position = std::next(begin, index_cdef.id); // index_cdef.id is used as the component's position within the key
+        std::move(begin, key_position, std::back_inserter(exploded_index_ck)); // components that precede the indexed column
+        begin = std::next(key_position); // step over the indexed column - presumably because its value already forms the index partition key
+    }
+    std::move(begin, key_view.end(), std::back_inserter(exploded_index_ck)); // the remaining components (all of them when nothing was skipped)
+}
+
+::shared_ptr<const service::pager::paging_state> indexed_table_select_statement::generate_view_paging_state_from_base_query_results(::shared_ptr<const service::pager::paging_state> paging_state,
+        const foreign_ptr<lw_shared_ptr<query::result>>& results, service::storage_proxy& proxy, service::query_state& state, const query_options& options) const { // Derives the index-view paging position from the last row of a base-table result.
+    const column_definition* cdef = _schema->get_column_definition(to_bytes(_index.target_column())); // the base-table column this index targets
+    if (!cdef) {
+        throw exceptions::invalid_request_exception("Indexed column not found in schema");
+    }
+
+    //NOTICE(sarna): Executing indexed_table branch implies there was at least 1 index restriction present
+    bytes_opt index_pk_value = _restrictions->index_restrictions().front()->value_for(*cdef, options);
+    auto index_pk = partition_key::from_single_value(*_view_schema, *index_pk_value); // NOTE(review): index_pk_value is dereferenced unchecked - confirm value_for() cannot yield an empty optional here
+    auto result_view = query::result_view(*results);
+    if (!results->row_count() || *results->row_count() == 0) { // no row count or no rows: the incoming paging state is returned unchanged
+        return std::move(paging_state);
+    }
+    auto [last_base_pk, last_base_ck] = result_view.get_last_partition_and_clustering_key();
+    // The index clustering key is assembled as: token bytes of the last base partition key, then the base key components.
+    std::vector<bytes_view> exploded_index_ck;
+    exploded_index_ck.reserve(_view_schema->clustering_key_size());
+
+    dht::i_partitioner& partitioner = dht::global_partitioner();
+    bytes token_bytes = partitioner.token_to_bytes(partitioner.get_token(*_schema, last_base_pk));
+    exploded_index_ck.push_back(bytes_view(token_bytes)); // a view into token_bytes, which stays alive until index_ck is built below
+    append_base_key_to_index_ck<partition_key>(exploded_index_ck, last_base_pk, *cdef);
+    if (last_base_ck) { // the last row may come without a clustering key
+        append_base_key_to_index_ck<clustering_key>(exploded_index_ck, *last_base_ck, *cdef);
+    }
+    // Build the index clustering key; the incoming paging state is reused when it already points at that position.
+    auto index_ck = clustering_key::from_range(std::move(exploded_index_ck));
+    if (partition_key::tri_compare(*_view_schema)(paging_state->get_partition_key(), index_pk) == 0
+            && (!paging_state->get_clustering_key() || clustering_key::prefix_equal_tri_compare(*_view_schema)(*paging_state->get_clustering_key(), index_ck) == 0)) {
+        return std::move(paging_state);
+    }
+    // Otherwise return a copy of the paging state with its keys replaced by the computed index position.
+    auto paging_state_copy = ::make_shared<service::pager::paging_state>(service::pager::paging_state(*paging_state));
+    paging_state_copy->set_partition_key(std::move(index_pk));
+    paging_state_copy->set_clustering_key(std::move(index_ck));
+    return std::move(paging_state_copy);
+}
+
 future<shared_ptr<cql_transport::messages::result_message>>
 indexed_table_select_statement::do_execute(service::storage_proxy& proxy,
                             service::query_state& state,
                             const query_options& options)
 {
+    tracing::add_table_name(state.get_trace_state(), _view_schema->ks_name(), _view_schema->cf_name());
    tracing::add_table_name(state.get_trace_state(), keyspace(), column_family());

    auto cl = options.get_consistency();
@@ -684,6 +895,8 @@ indexed_table_select_statement::do_execute(service::storage_proxy& proxy,

    assert(_restrictions->uses_secondary_indexing());

+    _stats.unpaged_select_queries += options.get_page_size() <= 0;
+
    // Secondary index search has two steps: 1. use the index table to find a
    // list of primary keys matching the query. 2. read the rows matching
    // these primary keys from the base table and return the selected columns.
@@ -719,120 +932,142 @@ indexed_table_select_statement::do_execute(service::storage_proxy& proxy,
    if (whole_partitions || partition_slices) {
        // In this case, can use our normal query machinery, which retrieves
        // entire partitions or the same slice for many partitions.
-        return find_index_partition_ranges(proxy, state, options).then([limit, now, &state, &options, &proxy, this] (dht::partition_range_vector partition_ranges) {
-            auto command = ::make_lw_shared<query::read_command>(
-                _schema->id(),
-                _schema->version(),
-                make_partition_slice(options),
-                limit,
-                now,
-                tracing::make_trace_info(state.get_trace_state()),
-                query::max_partitions,
-                utils::UUID(),
-                options.get_timestamp(state));
-            return this->execute(proxy, command, std::move(partition_ranges), state, options, now);
+        return find_index_partition_ranges(proxy, state, options).then([limit, now, &state, &options, &proxy, this] (dht::partition_range_vector partition_ranges, ::shared_ptr<const service::pager::paging_state> paging_state) {
+            return this->execute_base_query(proxy, std::move(partition_ranges), state, options, now, std::move(paging_state));
        });
    } else {
        // In this case, we need to retrieve a list of rows (not entire
        // partitions) and then retrieve those specific rows.
-        return find_index_clustering_rows(proxy, state, options).then([limit, now, &state, &options, &proxy, this] (std::vector<primary_key> primary_keys) {
-            auto command = ::make_lw_shared<query::read_command>(
-                _schema->id(),
-                _schema->version(),
-                // Note: the "clustering bounds" set in make_partition_slice()
-                // here is garbage, and will be overridden by execute() anyway
-                make_partition_slice(options),
-                limit,
-                now,
-                tracing::make_trace_info(state.get_trace_state()),
-                query::max_partitions,
-                utils::UUID(),
-                options.get_timestamp(state));
-            return this->execute(proxy, command, std::move(primary_keys), state, options, now);
+        return find_index_clustering_rows(proxy, state, options).then([limit, now, &state, &options, &proxy, this] (std::vector<primary_key> primary_keys, ::shared_ptr<const service::pager::paging_state> paging_state) {
+            return this->execute_base_query(proxy, std::move(primary_keys), state, options, now, std::move(paging_state));
        });
    }
 }

-// Utility function for getting the schema of the materialized view used for
-// the secondary index implementation.
-static schema_ptr
-get_index_schema(service::storage_proxy& proxy,
-                const secondary_index::index& index,
-                const schema_ptr& schema,
-                tracing::trace_state_ptr& trace_state)
-{
-    const auto& im = index.metadata();
-    sstring index_table_name = im.name() + "_index";
-    tracing::add_table_name(trace_state, schema->ks_name(), index_table_name);
-    return proxy.get_db().local().find_schema(schema->ks_name(), index_table_name);
-}
-
 // Utility function for reading from the index view (get_index_view())
 // the posting-list for a particular value of the indexed column.
 // Remember a secondary index can only be created on a single column.
-static future<service::storage_proxy::coordinator_query_result>
+template<typename KeyType>
+GCC6_CONCEPT(
+    requires (std::is_same_v<KeyType, partition_key> || std::is_same_v<KeyType, clustering_key>)
+)
+static future<::shared_ptr<cql_transport::messages::result_message::rows>>
 read_posting_list(service::storage_proxy& proxy,
                  schema_ptr view_schema,
-                  const std::vector<::shared_ptr<restrictions::restrictions>>& index_restrictions,
+                  schema_ptr base_schema,
+                  const secondary_index::index& index,
+                  ::shared_ptr<restrictions::statement_restrictions> base_restrictions,
                  const query_options& options,
                  int32_t limit,
                  service::query_state& state,
                  gc_clock::time_point now,
-                  db::timeout_clock::time_point timeout)
+                  db::timeout_clock::time_point timeout,
+                  cql3::cql_stats& stats)
 {
    dht::partition_range_vector partition_ranges;
    // FIXME: there should be only one index restriction for this index!
    // Perhaps even one index restriction entirely (do we support
    // intersection queries?).
-    for (const auto& restriction : index_restrictions) {
-        auto pk = partition_key::from_optional_exploded(*view_schema, restriction->values(options));
-        auto dk = dht::global_partitioner().decorate_key(*view_schema, pk);
-        auto range = dht::partition_range::make_singular(dk);
-        partition_ranges.emplace_back(range);
+    for (const auto& restrictions : base_restrictions->index_restrictions()) {
+        const column_definition* cdef = base_schema->get_column_definition(to_bytes(index.target_column()));
+        if (!cdef) {
+            throw exceptions::invalid_request_exception("Indexed column not found in schema");
+        }
+
+        bytes_opt value = restrictions->value_for(*cdef, options);
+        if (value) {
+            auto pk = partition_key::from_single_value(*view_schema, *value);
+            auto dk = dht::global_partitioner().decorate_key(*view_schema, pk);
+            auto range = dht::partition_range::make_singular(dk);
+            partition_ranges.emplace_back(range);
+        }
    }
+
    partition_slice_builder partition_slice_builder{*view_schema};
+
+    if (!base_restrictions->has_partition_key_unrestricted_components()) {
+        auto single_pk_restrictions = dynamic_pointer_cast<restrictions::single_column_primary_key_restrictions<partition_key>>(base_restrictions->get_partition_key_restrictions());
+        // Only EQ restrictions on base partition key can be used in an index view query
+        if (single_pk_restrictions && single_pk_restrictions->is_all_eq()) {
+            auto clustering_restrictions = ::make_shared<restrictions::single_column_primary_key_restrictions<clustering_key_prefix>>(view_schema, *single_pk_restrictions);
+            // Computed token column needs to be added to index view restrictions
+            const column_definition& token_cdef = *view_schema->clustering_key_columns().begin();
+            auto base_pk = partition_key::from_optional_exploded(*base_schema, base_restrictions->get_partition_key_restrictions()->values(options));
+            bytes token_value = dht::global_partitioner().token_to_bytes(dht::global_partitioner().get_token(*base_schema, base_pk));
+            auto token_restriction = ::make_shared<restrictions::single_column_restriction::EQ>(token_cdef, ::make_shared<cql3::constants::value>(cql3::raw_value::make_value(token_value)));
+            clustering_restrictions->merge_with(token_restriction);
+
+            if (base_restrictions->get_clustering_columns_restrictions()->prefix_size() > 0) {
+                auto single_ck_restrictions = dynamic_pointer_cast<restrictions::single_column_primary_key_restrictions<clustering_key>>(base_restrictions->get_clustering_columns_restrictions());
+                if (single_ck_restrictions) {
+                    auto prefix_restrictions = single_ck_restrictions->get_longest_prefix_restrictions();
+                    auto clustering_restrictions_from_base = ::make_shared<restrictions::single_column_primary_key_restrictions<clustering_key_prefix>>(view_schema, *prefix_restrictions);
+                    for (auto restriction_it : clustering_restrictions_from_base->restrictions()) {
+                        clustering_restrictions->merge_with(restriction_it.second);
+                    }
+                }
+            }
+
+            partition_slice_builder.with_ranges(clustering_restrictions->bounds_ranges(options));
+        }
+    }
+
+    auto partition_slice = partition_slice_builder.build();
    auto cmd = ::make_lw_shared<query::read_command>(
            view_schema->id(),
            view_schema->version(),
-            partition_slice_builder.build(),
+            partition_slice,
            limit,
            now,
            tracing::make_trace_info(state.get_trace_state()),
            query::max_partitions,
            utils::UUID(),
            options.get_timestamp(state));
-    return proxy.query(view_schema,
-            cmd,
-            std::move(partition_ranges),
-            options.get_consistency(),
-            {timeout, state.get_trace_state()});
+
+    std::vector<const column_definition*> columns;
+    for (const column_definition& cdef : base_schema->partition_key_columns()) {
+        columns.emplace_back(view_schema->get_column_definition(cdef.name()));
+    }
+    if constexpr (std::is_same_v<KeyType, clustering_key>) {
+        for (const column_definition& cdef : base_schema->clustering_key_columns()) {
+            columns.emplace_back(view_schema->get_column_definition(cdef.name()));
+        }
+    }
+    auto selection = selection::selection::for_columns(view_schema, columns);
+
+    int32_t page_size = options.get_page_size();
+    if (page_size <= 0 || !service::pager::query_pagers::may_need_paging(*view_schema, page_size, *cmd, partition_ranges)) {
+        stats.unpaged_select_queries += 1;
+        return proxy.query(view_schema, cmd, std::move(partition_ranges), options.get_consistency(), {timeout, state.get_trace_state()})
+        .then([base_schema, view_schema, now, &options, selection = std::move(selection), partition_slice = std::move(partition_slice)] (service::storage_proxy::coordinator_query_result qr) {
+            cql3::selection::result_set_builder builder(*selection, now, options.get_cql_serialization_format());
+            query::result_view::consume(*qr.query_result,
+                                        std::move(partition_slice),
+                                        cql3::selection::result_set_builder::visitor(builder, *view_schema, *selection));
+            return ::make_shared<cql_transport::messages::result_message::rows>(std::move(result(builder.build())));
+        });
+    }
+
+    auto p = service::pager::query_pagers::pager(view_schema, selection,
+            state, options, cmd, std::move(partition_ranges), stats, nullptr);
+    return p->fetch_page(options.get_page_size(), now, timeout).then([p, &options, limit, now] (std::unique_ptr<cql3::result_set> rs) {
+        rs->get_metadata().set_paging_state(p->state());
+        return ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
+    });
 }

 // Note: the partition keys returned by this function are sorted
 // in token order. See issue #3423.
-future<dht::partition_range_vector>
+future<dht::partition_range_vector, ::shared_ptr<const service::pager::paging_state>>
 indexed_table_select_statement::find_index_partition_ranges(service::storage_proxy& proxy,
                                             service::query_state& state,
                                             const query_options& options)
 {
-    schema_ptr view = get_index_schema(proxy, _index, _schema, state.get_trace_state());
    auto now = gc_clock::now();
    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
-    return read_posting_list(proxy, view, _restrictions->index_restrictions(), options, get_limit(options), state, now, timeout).then(
-            [this, now, &options, view] (service::storage_proxy::coordinator_query_result qr) {
-        std::vector<const column_definition*> columns;
-        for (const column_definition& cdef : _schema->partition_key_columns()) {
-            columns.emplace_back(view->get_column_definition(cdef.name()));
-        }
-        auto selection = selection::selection::for_columns(view, columns);
-        cql3::selection::result_set_builder builder(*selection, now, options.get_cql_serialization_format());
-        // FIXME: read_posting_list already asks to read primary keys only.
-        // why do we need to specify this again?
-        auto slice = partition_slice_builder(*view).build();
-        query::result_view::consume(*qr.query_result,
-                                    slice,
-                                    cql3::selection::result_set_builder::visitor(builder, *view, *selection));
-        auto rs = cql3::untyped_result_set(::make_shared<cql_transport::messages::result_message::rows>(std::move(result(builder.build()))));
+    return read_posting_list<partition_key>(proxy, _view_schema, _schema, _index, _restrictions, options, get_limit(options), state, now, timeout, _stats).then(
+            [this, now, &options] (::shared_ptr<cql_transport::messages::result_message::rows> rows) {
+        auto rs = cql3::untyped_result_set(rows);
        dht::partition_range_vector partition_ranges;
        partition_ranges.reserve(rs.size());
        // We are reading the list of primary keys as rows of a single
@@ -858,36 +1093,22 @@ indexed_table_select_statement::find_index_partition_ranges(service::storage_pro
            auto range = dht::partition_range::make_singular(dk);
            partition_ranges.emplace_back(range);
        }
-        return partition_ranges;
+        auto paging_state = rows->rs().get_metadata().paging_state();
+        return make_ready_future<dht::partition_range_vector, ::shared_ptr<const service::pager::paging_state>>(std::move(partition_ranges), std::move(paging_state));
    });
 }

 // Note: the partition keys returned by this function are sorted
 // in token order. See issue #3423.
-future<std::vector<indexed_table_select_statement::primary_key>>
+future<std::vector<indexed_table_select_statement::primary_key>, ::shared_ptr<const service::pager::paging_state>>
 indexed_table_select_statement::find_index_clustering_rows(service::storage_proxy& proxy, service::query_state& state, const query_options& options)
 {
-    schema_ptr view = get_index_schema(proxy, _index, _schema, state.get_trace_state());
    auto now = gc_clock::now();
    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
-    return read_posting_list(proxy, view, _restrictions->index_restrictions(), options, get_limit(options), state, now, timeout).then(
-            [this, now, &options, view] (service::storage_proxy::coordinator_query_result qr) {
-        std::vector<const column_definition*> columns;
-        for (const column_definition& cdef : _schema->partition_key_columns()) {
-            columns.emplace_back(view->get_column_definition(cdef.name()));
-        }
-        for (const column_definition& cdef : _schema->clustering_key_columns()) {
-            columns.emplace_back(view->get_column_definition(cdef.name()));
-        }
-        auto selection = selection::selection::for_columns(view, columns);
-        cql3::selection::result_set_builder builder(*selection, now, options.get_cql_serialization_format());
-        // FIXME: read_posting_list already asks to read primary keys only.
-        // why do we need to specify this again?
-        auto slice = partition_slice_builder(*view).build();
-        query::result_view::consume(*qr.query_result,
-                                    slice,
-                                    cql3::selection::result_set_builder::visitor(builder, *view, *selection));
-        auto rs = cql3::untyped_result_set(::make_shared<cql_transport::messages::result_message::rows>(result(builder.build())));
+    return read_posting_list<clustering_key>(proxy, _view_schema, _schema, _index, _restrictions, options, get_limit(options), state, now, timeout, _stats).then(
+            [this, now, &options] (::shared_ptr<cql_transport::messages::result_message::rows> rows) {
+
+        auto rs = cql3::untyped_result_set(rows);
        std::vector<primary_key> primary_keys;
        primary_keys.reserve(rs.size());
        for (size_t i = 0; i < rs.size(); i++) {
@@ -903,7 +1124,8 @@ indexed_table_select_statement::find_index_clustering_rows(service::storage_prox
            auto ck = clustering_key::from_range(ck_columns);
            primary_keys.emplace_back(primary_key{std::move(dk), std::move(ck)});
        }
-        return primary_keys;
+        auto paging_state = rows->rs().get_metadata().paging_state();
+        return make_ready_future<std::vector<indexed_table_select_statement::primary_key>, ::shared_ptr<const service::pager::paging_state>>(std::move(primary_keys), std::move(paging_state));
    });
 }

@@ -986,6 +1208,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(database& db, cql_
    }

    check_needs_filtering(restrictions);
+    ensure_filtering_columns_retrieval(db, selection, restrictions);

    ::shared_ptr<cql3::statements::select_statement> stmt;
    if (restrictions->uses_secondary_indexing()) {
@@ -1124,7 +1347,7 @@ select_statement::get_ordering_comparator(schema_ptr schema,
        }
        auto index = selection->index_of(*def);
        if (index < 0) {
-            index = selection->add_column_for_ordering(*def);
+            index = selection->add_column_for_post_processing(*def);
        }

        sorters.emplace_back(index, def->type);
@@ -1211,6 +1434,23 @@ void select_statement::check_needs_filtering(::shared_ptr<restrictions::statemen
    }
 }

+/**
+ * Adds columns that are needed for the purpose of filtering to the selection.
+ * The columns that are added to the selection are columns that
+ * are needed for filtering on the coordinator but are not part of the selection.
+ * The columns are added with metadata indicating they are not to be returned
+ * to the user.
+ */
+void select_statement::ensure_filtering_columns_retrieval(database& db,
+                                        ::shared_ptr<selection::selection> selection,
+                                        ::shared_ptr<restrictions::statement_restrictions> restrictions) {
+    for (auto&& cdef : restrictions->get_column_defs_for_filtering(db)) {
+        if (!selection->has_column(*cdef)) {
+            selection->add_column_for_post_processing(*cdef);
+        }
+    }
+}
+
 bool select_statement::contains_alias(::shared_ptr<column_identifier> name) {
    return std::any_of(_select_clause.begin(), _select_clause.end(), [name] (auto raw) {
        return raw->alias && *name == *raw->alias;
--- a/cql3/statements/select_statement.hh
+++ b/cql3/statements/select_statement.hh
@@ -126,14 +126,6 @@ public:
        clustering_key_prefix clustering;
    };

-    future<::shared_ptr<cql_transport::messages::result_message>> execute(
-            service::storage_proxy& proxy,
-            lw_shared_ptr<query::read_command> cmd,
-            std::vector<primary_key>&& primary_keys,
-            service::query_state& state,
-            const query_options& options,
-            gc_clock::time_point now);
-
    shared_ptr<cql_transport::messages::result_message> process_results(foreign_ptr<lw_shared_ptr<query::result>> results,
        lw_shared_ptr<query::read_command> cmd, const query_options& options, gc_clock::time_point now);

@@ -168,6 +160,7 @@ public:

 class indexed_table_select_statement : public select_statement {
    secondary_index::index _index;
+    schema_ptr _view_schema;
 public:
    static ::shared_ptr<cql3::statements::select_statement> prepare(database& db,
                                                                    schema_ptr schema,
@@ -189,24 +182,55 @@ public:
                                   ordering_comparator_type ordering_comparator,
                                   ::shared_ptr<term> limit,
                                   cql_stats &stats,
-                                   const secondary_index::index& index);
+                                   const secondary_index::index& index,
+                                   schema_ptr view_schema);

 private:
-    static stdx::optional<secondary_index::index> find_idx(database& db,
-                                                           schema_ptr schema,
-                                                           ::shared_ptr<restrictions::statement_restrictions> restrictions);
-
    virtual future<::shared_ptr<cql_transport::messages::result_message>> do_execute(service::storage_proxy& proxy,
                                                                                     service::query_state& state, const query_options& options) override;

-    future<dht::partition_range_vector> find_index_partition_ranges(service::storage_proxy& proxy,
+    ::shared_ptr<const service::pager::paging_state> generate_view_paging_state_from_base_query_results(::shared_ptr<const service::pager::paging_state> paging_state,
+            const foreign_ptr<lw_shared_ptr<query::result>>& results, service::storage_proxy& proxy, service::query_state& state, const query_options& options) const;
+
+    future<dht::partition_range_vector, ::shared_ptr<const service::pager::paging_state>> find_index_partition_ranges(service::storage_proxy& proxy,
                                                                    service::query_state& state,
                                                                    const query_options& options);

-    future<std::vector<primary_key>> find_index_clustering_rows(service::storage_proxy& proxy,
+    future<std::vector<primary_key>, ::shared_ptr<const service::pager::paging_state>> find_index_clustering_rows(service::storage_proxy& proxy,
                                                                service::query_state& state,
                                                                const query_options& options);

+    shared_ptr<cql_transport::messages::result_message>
+    process_base_query_results(
+            foreign_ptr<lw_shared_ptr<query::result>> results,
+            lw_shared_ptr<query::read_command> cmd,
+            service::storage_proxy& proxy,
+            service::query_state& state,
+            const query_options& options,
+            gc_clock::time_point now,
+            ::shared_ptr<const service::pager::paging_state> paging_state);
+
+    lw_shared_ptr<query::read_command>
+    prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging);
+
+    future<shared_ptr<cql_transport::messages::result_message>>
+    execute_base_query(
+            service::storage_proxy& proxy,
+            dht::partition_range_vector&& partition_ranges,
+            service::query_state& state,
+            const query_options& options,
+            gc_clock::time_point now,
+            ::shared_ptr<const service::pager::paging_state> paging_state);
+
+    future<shared_ptr<cql_transport::messages::result_message>>
+    execute_base_query(
+            service::storage_proxy& proxy,
+            std::vector<primary_key>&& primary_keys,
+            service::query_state& state,
+            const query_options& options,
+            gc_clock::time_point now,
+            ::shared_ptr<const service::pager::paging_state> paging_state);
+
    virtual void update_stats_rows_read(int64_t rows_read) override {
        _stats.rows_read += rows_read;
        _stats.secondary_index_rows_read += rows_read;
--- a/cql3/statements/update_statement.cc
+++ b/cql3/statements/update_statement.cc
@@ -84,8 +84,11 @@ parse(const sstring& json_string, const std::vector<column_definition>& expected
    for (const auto& def : expected_receivers) {
        sstring cql_name = def.name_as_text();
        auto value_it = prepared_map.find(cql_name);
-        if (value_it == prepared_map.end() || value_it->second.isNull()) {
+        if (value_it == prepared_map.end()) {
+            continue;
+        } else if (value_it->second.isNull()) {
            json_map.emplace(std::move(cql_name), bytes_opt{});
+            prepared_map.erase(value_it);
        } else {
            json_map.emplace(std::move(cql_name), def.type->from_json_object(value_it->second, sf));
            prepared_map.erase(value_it);
@@ -172,29 +175,45 @@ void update_statement::add_update_for_key(mutation& m, const query::clustering_r
 }

 modification_statement::json_cache_opt insert_prepared_json_statement::maybe_prepare_json_cache(const query_options& options) {
-    sstring json_string = utf8_type->to_string(_term->bind_and_get(options).data().value().to_string());
+    sstring json_string = with_linearized(_term->bind_and_get(options).data().value(), [&] (bytes_view value) {
+        return utf8_type->to_string(value.to_string());
+    });
    return json_helpers::parse(std::move(json_string), s->all_columns(), options.get_cql_serialization_format());
 }

 void
 insert_prepared_json_statement::execute_set_value(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params, const column_definition& column, const bytes_opt& value) {
    if (!value) {
+        if (column.type->is_collection()) {
+            auto& k = static_pointer_cast<const collection_type_impl>(column.type)->_kind;
+            if (&k == &collection_type_impl::kind::list) {
+                lists::setter::execute(m, prefix, params, column, make_shared<lists::value>(lists::value(std::vector<bytes_opt>())));
+            } else if (&k == &collection_type_impl::kind::set) {
+                sets::setter::execute(m, prefix, params, column, make_shared<sets::value>(sets::value(std::set<bytes, serialized_compare>(serialized_compare(empty_type)))));
+            } else if (&k == &collection_type_impl::kind::map) {
+                maps::setter::execute(m, prefix, params, column, make_shared<maps::value>(maps::value(std::map<bytes, bytes, serialized_compare>(serialized_compare(empty_type)))));
+            } else {
+                throw exceptions::invalid_request_exception("Incorrect value kind in JSON INSERT statement");
+            }
+            return;
+        }
        m.set_cell(prefix, column, std::move(operation::make_dead_cell(params)));
+        return;
    } else if (!column.type->is_collection()) {
-        constants::setter::execute(m, prefix, params, column, raw_value_view::make_value(bytes_view(*value)));
+        constants::setter::execute(m, prefix, params, column, raw_value_view::make_value(fragmented_temporary_buffer::view(*value)));
        return;
    }

    auto& k = static_pointer_cast<const collection_type_impl>(column.type)->_kind;
    cql_serialization_format sf = params._options.get_cql_serialization_format();
    if (&k == &collection_type_impl::kind::list) {
-        auto list_terminal = make_shared<lists::value>(lists::value::from_serialized(*value, dynamic_pointer_cast<const list_type_impl>(column.type), sf));
+        auto list_terminal = make_shared<lists::value>(lists::value::from_serialized(fragmented_temporary_buffer::view(*value), dynamic_pointer_cast<const list_type_impl>(column.type), sf));
        lists::setter::execute(m, prefix, params, column, std::move(list_terminal));
    } else if (&k == &collection_type_impl::kind::set) {
-        auto set_terminal = make_shared<sets::value>(sets::value::from_serialized(*value, dynamic_pointer_cast<const set_type_impl>(column.type), sf));
+        auto set_terminal = make_shared<sets::value>(sets::value::from_serialized(fragmented_temporary_buffer::view(*value), dynamic_pointer_cast<const set_type_impl>(column.type), sf));
        sets::setter::execute(m, prefix, params, column, std::move(set_terminal));
    } else if (&k == &collection_type_impl::kind::map) {
-        auto map_terminal = make_shared<maps::value>(maps::value::from_serialized(*value, dynamic_pointer_cast<const map_type_impl>(column.type), sf));
+        auto map_terminal = make_shared<maps::value>(maps::value::from_serialized(fragmented_temporary_buffer::view(*value), dynamic_pointer_cast<const map_type_impl>(column.type), sf));
        maps::setter::execute(m, prefix, params, column, std::move(map_terminal));
    } else {
        throw exceptions::invalid_request_exception("Incorrect value kind in JSON INSERT statement");
@@ -204,15 +223,17 @@ insert_prepared_json_statement::execute_set_value(mutation& m, const clustering_
 dht::partition_range_vector
 insert_prepared_json_statement::build_partition_keys(const query_options& options, const json_cache_opt& json_cache) {
    dht::partition_range_vector ranges;
+    std::vector<bytes_opt> exploded;
    for (const auto& def : s->partition_key_columns()) {
        auto json_value = json_cache->at(def.name_as_text());
-        auto k = query::range<partition_key>::make_singular(partition_key::from_single_value(*s, json_value.value()));
-        ranges.emplace_back(std::move(k).transform(
-                    [this] (partition_key&& k) -> query::ring_position {
-                        auto token = dht::global_partitioner().get_token(*s, k);
-                        return { std::move(token), std::move(k) };
-                    }));
+        if (!json_value) {
+            throw exceptions::invalid_request_exception(sprint("Missing mandatory PRIMARY KEY part %s", def.name_as_text()));
+        }
+        exploded.emplace_back(*json_value);
    }
+    auto pkey = partition_key::from_optional_exploded(*s, std::move(exploded));
+    auto k = query::range<query::ring_position>::make_singular(dht::global_partitioner().decorate_key(*s, std::move(pkey)));
+    ranges.emplace_back(std::move(k));
    return ranges;
 }

@@ -221,7 +242,10 @@ query::clustering_row_ranges insert_prepared_json_statement::create_clustering_r
    std::vector<bytes_opt> exploded;
    for (const auto& def : s->clustering_key_columns()) {
        auto json_value = json_cache->at(def.name_as_text());
-        exploded.emplace_back(json_value.value());
+        if (!json_value) {
+            throw exceptions::invalid_request_exception(sprint("Missing mandatory PRIMARY KEY part %s", def.name_as_text()));
+        }
+        exploded.emplace_back(*json_value);
    }
    auto k = query::range<clustering_key_prefix>::make_singular(clustering_key_prefix::from_optional_exploded(*s, std::move(exploded)));
    ranges.emplace_back(query::clustering_range(std::move(k)));
@@ -234,8 +258,12 @@ void insert_prepared_json_statement::execute_operations_for_key(mutation& m, con
            throw exceptions::invalid_request_exception(sprint("Cannot set the value of counter column %s in JSON", def.name_as_text()));
        }

-        auto value = json_cache->at(def.name_as_text());
-        execute_set_value(m, prefix, params, def, value);
+        auto it = json_cache->find(def.name_as_text());
+        if (it != json_cache->end()) {
+            execute_set_value(m, prefix, params, def, it->second);
+        } else if (!_default_unset) {
+            execute_set_value(m, prefix, params, def, bytes_opt{});
+        }
    }
 }

@@ -301,12 +329,14 @@ insert_statement::prepare_internal(database& db, schema_ptr schema,
 insert_json_statement::insert_json_statement(  ::shared_ptr<cf_name> name,
                                               ::shared_ptr<attributes::raw> attrs,
                                               ::shared_ptr<term::raw> json_value,
-                                               bool if_not_exists)
+                                               bool if_not_exists,
+                                               bool default_unset)
    : raw::modification_statement{name, attrs, conditions_vector{}, if_not_exists, false}
    , _name(name)
    , _attrs(attrs)
    , _json_value(json_value)
-    , _if_not_exists(if_not_exists) { }
+    , _if_not_exists(if_not_exists)
+    , _default_unset(default_unset) { }

 ::shared_ptr<cql3::statements::modification_statement>
 insert_json_statement::prepare_internal(database& db, schema_ptr schema,
@@ -316,7 +346,7 @@ insert_json_statement::prepare_internal(database& db, schema_ptr schema,
    auto json_column_placeholder = ::make_shared<column_identifier>("", true);
    auto prepared_json_value = _json_value->prepare(db, "", ::make_shared<column_specification>("", "", json_column_placeholder, utf8_type));
    prepared_json_value->collect_marker_specification(bound_names);
-    return ::make_shared<cql3::statements::insert_prepared_json_statement>(bound_names->size(), schema, std::move(attrs), &stats.inserts, std::move(prepared_json_value));
+    return ::make_shared<cql3::statements::insert_prepared_json_statement>(bound_names->size(), schema, std::move(attrs), &stats.inserts, std::move(prepared_json_value), _default_unset);
 }

 update_statement::update_statement(            ::shared_ptr<cf_name> name,
--- a/cql3/statements/update_statement.hh
+++ b/cql3/statements/update_statement.hh
@@ -82,9 +82,10 @@ private:
 */
 class insert_prepared_json_statement : public update_statement {
    ::shared_ptr<term> _term;
+    bool _default_unset;
 public:
-    insert_prepared_json_statement(uint32_t bound_terms, schema_ptr s, std::unique_ptr<attributes> attrs, uint64_t* cql_stats_counter_ptr, ::shared_ptr<term> t)
-        : update_statement(statement_type::INSERT, bound_terms, s, std::move(attrs), cql_stats_counter_ptr), _term(t) {
+    insert_prepared_json_statement(uint32_t bound_terms, schema_ptr s, std::unique_ptr<attributes> attrs, uint64_t* cql_stats_counter_ptr, ::shared_ptr<term> t, bool default_unset)
+        : update_statement(statement_type::INSERT, bound_terms, s, std::move(attrs), cql_stats_counter_ptr), _term(t), _default_unset(default_unset) {
        _restrictions = ::make_shared<restrictions::statement_restrictions>(s, false);
    }
 private:
--- a/cql3/stats.hh
+++ b/cql3/stats.hh
@@ -36,6 +36,8 @@ struct cql_stats {
    uint64_t batches_pure_unlogged = 0;
    uint64_t batches_unlogged_from_logged = 0;
    uint64_t rows_read = 0;
+    uint64_t reverse_queries = 0;
+    uint64_t unpaged_select_queries = 0;

    int64_t secondary_index_creates = 0;
    int64_t secondary_index_drops = 0;
--- a/cql3/tuples.hh
+++ b/cql3/tuples.hh
@@ -159,8 +159,10 @@ public:
                _elements.push_back(e ? bytes_opt(bytes(e->begin(), e->size())) : bytes_opt());
            }
        }
-        static value from_serialized(bytes_view buffer, tuple_type type) {
-            return value(type->split(buffer));
+        static value from_serialized(const fragmented_temporary_buffer::view& buffer, tuple_type type) {
+          return with_linearized(buffer, [&] (bytes_view view) {
+            return value(type->split(view));
+          });
        }
        virtual cql3::raw_value get(const query_options& options) override {
            return cql3::raw_value::make_value(tuple_type_impl::build_value(_elements));
@@ -251,20 +253,29 @@ public:
            }
        }

-        static in_value from_serialized(bytes_view value, list_type type, const query_options& options) {
+        static in_value from_serialized(const fragmented_temporary_buffer::view& value_view, list_type type, const query_options& options) {
            try {
                // Collections have this small hack that validate cannot be called on a serialized object,
                // but the deserialization does the validation (so we're fine).
+              return with_linearized(value_view, [&] (bytes_view value) {
                auto l = value_cast<list_type_impl::native_type>(type->deserialize(value, options.get_cql_serialization_format()));
                auto ttype = dynamic_pointer_cast<const tuple_type_impl>(type->get_elements_type());
                assert(ttype);

-                std::vector<std::vector<bytes_view_opt>> elements;
+                std::vector<std::vector<bytes_opt>> elements;
                elements.reserve(l.size());
                for (auto&& element : l) {
-                    elements.emplace_back(ttype->split(ttype->decompose(element)));
+                    auto tuple_buff = ttype->decompose(element);
+                    auto tuple = ttype->split(tuple_buff);
+                    std::vector<bytes_opt> elems;
+                    elems.reserve(tuple.size());
+                    for (auto&& e : tuple) {
+                        elems.emplace_back(to_bytes_opt(e));
+                    }
+                    elements.emplace_back(std::move(elems));
                }
                return in_value(elements);
+              });
            } catch (marshal_exception& e) {
                throw exceptions::invalid_request_exception(e.what());
            }
@@ -405,7 +416,7 @@ public:
        in_marker(int32_t bind_index, ::shared_ptr<column_specification> receiver)
            : abstract_marker(bind_index, std::move(receiver))
        {
-            assert(dynamic_pointer_cast<const list_type_impl>(receiver->type));
+            assert(dynamic_pointer_cast<const list_type_impl>(_receiver->type));
        }

        virtual shared_ptr<terminal> bind(const query_options& options) override {
--- a/cql3/update_parameters.cc
+++ b/cql3/update_parameters.cc
@@ -53,6 +53,9 @@ update_parameters::get_prefetched_list(
        return {};
    }

+    if (column.is_static()) {
+        ckey = clustering_key_view::make_empty();
+    }
    auto i = _prefetched->rows.find(std::make_pair(std::move(pkey), std::move(ckey)));
    if (i == _prefetched->rows.end()) {
        return {};
--- a/cql3/update_parameters.hh
+++ b/cql3/update_parameters.hh
@@ -142,7 +142,7 @@ public:
        return atomic_cell::make_dead(_timestamp, _local_deletion_time);
    }

-    atomic_cell make_cell(const abstract_type& type, bytes_view value, atomic_cell::collection_member cm = atomic_cell::collection_member::no) const {
+    atomic_cell make_cell(const abstract_type& type, const fragmented_temporary_buffer::view& value, atomic_cell::collection_member cm = atomic_cell::collection_member::no) const {
        auto ttl = _ttl;

        if (ttl.count() <= 0) {
@@ -156,6 +156,10 @@ public:
        }
    };

+    atomic_cell make_cell(const abstract_type& type, bytes_view value, atomic_cell::collection_member cm = atomic_cell::collection_member::no) const { // bytes_view convenience overload
+        return make_cell(type, fragmented_temporary_buffer::view(value), cm); // wrap the bytes_view and forward to the fragmented-view overload
+    }
+
    atomic_cell make_counter_update_cell(int64_t delta) const {
        return atomic_cell::make_live_counter_update(_timestamp, delta);
    }
--- a/cql3/values.hh
+++ b/cql3/values.hh
@@ -28,6 +28,10 @@

 #include <experimental/optional>

+#include <seastar/util/variant_utils.hh>
+
+#include "utils/fragmented_temporary_buffer.hh"
+
 namespace cql3 {

 struct null_value {
@@ -40,7 +44,7 @@ struct unset_value {
 ///
 /// \see raw_value
 struct raw_value_view {
-    boost::variant<bytes_view, null_value, unset_value> _data;
+    boost::variant<fragmented_temporary_buffer::view, null_value, unset_value> _data;

    raw_value_view(null_value&& data)
        : _data{std::move(data)}
@@ -48,10 +52,7 @@ struct raw_value_view {
    raw_value_view(unset_value&& data)
        : _data{std::move(data)}
    {}
-    raw_value_view(bytes_view&& data)
-        : _data{std::move(data)}
-    {}
-    raw_value_view(const bytes_view& data)
+    raw_value_view(fragmented_temporary_buffer::view data)
        : _data{data}
    {}
 public:
@@ -61,10 +62,7 @@ public:
    static raw_value_view make_unset_value() {
        return raw_value_view{std::move(unset_value{})};
    }
-    static raw_value_view make_value(bytes_view &&view) {
-        return raw_value_view{std::move(view)};
-    }
-    static raw_value_view make_value(const bytes_view& view) {
+    static raw_value_view make_value(fragmented_temporary_buffer::view view) {
        return raw_value_view{view};
    }
    bool is_null() const {
@@ -76,20 +74,47 @@ public:
    bool is_value() const {
        return _data.which() == 0;
    }
-    bytes_view_opt data() const {
+    std::optional<fragmented_temporary_buffer::view> data() const {
        if (_data.which() == 0) {
-            return boost::get<bytes_view>(_data);
+            return boost::get<fragmented_temporary_buffer::view>(_data);
        }
        return {};
    }
    explicit operator bool() const {
        return _data.which() == 0;
    }
-    const bytes_view* operator->() const {
-        return &boost::get<bytes_view>(_data);
+    const fragmented_temporary_buffer::view* operator->() const {
+        return &boost::get<fragmented_temporary_buffer::view>(_data);
    }
-    const bytes_view& operator*() const {
-        return boost::get<bytes_view>(_data);
+    const fragmented_temporary_buffer::view& operator*() const {
+        return boost::get<fragmented_temporary_buffer::view>(_data);
+    }
+
+    bool operator==(const raw_value_view& other) const {
+        if (_data.which() != other._data.which()) { // different alternative held: value vs. null vs. unset
+            return false;
+        }
+        if (is_value() && **this != *other) { // same alternative; only the value alternative has a payload to compare
+            return false;
+        }
+        return true;
+    }
+    bool operator!=(const raw_value_view& other) const { // exact negation of operator==
+        return !(*this == other);
+    }
+
+    friend std::ostream& operator<<(std::ostream& os, const raw_value_view& value) { // prints "{ value: ... }", "{ null }" or "{ unset }" depending on the held alternative
+        seastar::visit(value._data, [&] (fragmented_temporary_buffer::view v) {
+            os << "{ value: ";
+            using boost::range::for_each;
+            for_each(v, [&os] (bytes_view bv) { os << bv; }); // stream every fragment of the view, in iteration order, with no separator
+            os << " }";
+        }, [&] (null_value) {
+            os << "{ null }";
+        }, [&] (unset_value) {
+            os << "{ unset }";
+        });
+        return os;
    }
 };

@@ -127,7 +152,7 @@ public:
        if (view.is_unset_value()) {
            return make_unset_value();
        }
-        return make_value(to_bytes(*view));
+        return make_value(linearized(*view));
    }
    static raw_value make_value(bytes&& bytes) {
        return raw_value{std::move(bytes)};
@@ -167,7 +192,7 @@ public:
    }
    raw_value_view to_view() const {
        switch (_data.which()) {
-        case 0:  return raw_value_view::make_value(bytes_view{boost::get<bytes>(_data)});
+        case 0:  return raw_value_view::make_value(fragmented_temporary_buffer::view(bytes_view{boost::get<bytes>(_data)}));
        case 1:  return raw_value_view::make_null();
        default: return raw_value_view::make_unset_value();
        }
@@ -176,10 +201,19 @@ public:

 }

+inline bytes to_bytes(const cql3::raw_value_view& view) // returns the viewed value as an owning `bytes`
+{
+    return linearized(*view); // operator* does boost::get on the value alternative, so `view` must hold a value (not null/unset)
+}
+
 inline bytes_opt to_bytes_opt(const cql3::raw_value_view& view) {
-    return to_bytes_opt(view.data());
+    auto buffer_view = view.data();
+    if (buffer_view) {
+        return bytes_opt(linearized(*buffer_view));
+    }
+    return bytes_opt();
 }

 inline bytes_opt to_bytes_opt(const cql3::raw_value& value) {
-    return value.data();
+    return to_bytes_opt(value.to_view());
 }
--- a/database.cc
+++ b/database.cc
--- a/database.hh
+++ b/database.hh
@@ -77,6 +77,7 @@
 #include <seastar/core/metrics_registration.hh>
 #include "tracing/trace_state.hh"
 #include "db/view/view.hh"
+#include "db/view/view_update_backlog.hh"
 #include "db/view/row_locking.hh"
 #include "lister.hh"
 #include "utils/phased_barrier.hh"
@@ -279,6 +280,9 @@ struct cf_stats {
    int64_t clustering_filter_fast_path_count = 0;
    // how many sstables survived the clustering key checks
    int64_t surviving_sstables_after_clustering_filter = 0;
+
+    // How many view updates were dropped due to overload.
+    int64_t dropped_view_updates = 0;
 };

 class cache_temperature {
@@ -298,9 +302,12 @@ public:
 class table;
 using column_family = table;

+class database_sstable_write_monitor;
+
 class table : public enable_lw_shared_from_this<table> {
 public:
    struct config {
+        std::vector<sstring> all_datadirs;
        sstring datadir;
        bool enable_disk_writes = true;
        bool enable_disk_reads = true;
@@ -322,6 +329,7 @@ public:
        bool enable_metrics_reporting = false;
        db::large_partition_handler* large_partition_handler;
        db::timeout_semaphore* view_update_concurrency_semaphore;
+        size_t view_update_concurrency_semaphore_limit;
    };
    struct no_commitlog {};
    struct stats {
@@ -394,7 +402,7 @@ private:
    // plan memtables and the resulting sstables are not made visible until
    // the streaming is complete.
    struct monitored_sstable {
-        std::unique_ptr<sstables::write_monitor> monitor;
+        std::unique_ptr<database_sstable_write_monitor> monitor;
        sstables::shared_sstable sstable;
    };

@@ -431,8 +439,15 @@ private:
    // but for correct compaction we need to start the compaction only after
    // reading all sstables.
    std::unordered_map<uint64_t, sstables::shared_sstable> _sstables_need_rewrite;
+    // sstables that should not be compacted (e.g. because they need to be used
+    // to generate view updates later)
+    std::unordered_map<uint64_t, sstables::shared_sstable> _sstables_staging;
    // Control background fibers waiting for sstables to be deleted
    seastar::gate _sstable_deletion_gate;
+    // This semaphore ensures that an operation like snapshot won't have its selected
+    // sstables deleted by compaction in parallel, a race condition which could
+    // easily result in failure.
+    seastar::semaphore _sstable_deletion_sem = {1};
    // There are situations in which we need to stop writing sstables. Flushers will take
    // the read lock, and the ones that wish to stop that process will take the write lock.
    rwlock _sstables_lock;
@@ -482,6 +497,13 @@ private:
    utils::phased_barrier _pending_writes_phaser;
    // Corresponding phaser for in-progress reads.
    utils::phased_barrier _pending_reads_phaser;
+public:
+    future<> add_sstable_and_update_cache(sstables::shared_sstable sst);
+    void move_sstable_from_staging_in_thread(sstables::shared_sstable sst);
+    sstables::shared_sstable get_staging_sstable(uint64_t generation) { // looks `generation` up in _sstables_staging
+        auto it = _sstables_staging.find(generation);
+        return it != _sstables_staging.end() ? it->second : nullptr; // nullptr when the map has no entry for that key
+    }
 private:
    void update_stats_for_new_sstable(uint64_t disk_space_used_by_sstable, const std::vector<unsigned>& shards_for_the_sstable) noexcept;
    // Adds new sstable to the set of sstables
@@ -534,7 +556,7 @@ private:
    void rebuild_statistics();

    // This function replaces new sstables by their ancestors, which are sstables that needed resharding.
-    void replace_ancestors_needed_rewrite(std::vector<sstables::shared_sstable> new_sstables);
+    void replace_ancestors_needed_rewrite(std::unordered_set<uint64_t> ancestors, std::vector<sstables::shared_sstable> new_sstables);
    void remove_ancestors_needed_rewrite(std::unordered_set<uint64_t> ancestors);
 private:
    mutation_source_opt _virtual_reader;
@@ -615,6 +637,14 @@ public:
            tracing::trace_state_ptr trace_state = nullptr,
            streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
            mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const;
+    flat_mutation_reader make_reader_excluding_sstable(schema_ptr schema,
+            sstables::shared_sstable sst,
+            const dht::partition_range& range,
+            const query::partition_slice& slice,
+            const io_priority_class& pc = default_priority_class(),
+            tracing::trace_state_ptr trace_state = nullptr,
+            streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
+            mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const;

    flat_mutation_reader make_reader(schema_ptr schema, const dht::partition_range& range = query::full_partition_range) const {
        auto& full_slice = schema->full_slice();
@@ -629,7 +659,13 @@ public:
    flat_mutation_reader make_streaming_reader(schema_ptr schema,
            const dht::partition_range_vector& ranges) const;

+    sstables::shared_sstable make_streaming_sstable_for_write(std::optional<sstring> subdir = {});
+    sstables::shared_sstable make_streaming_staging_sstable() { // shorthand for make_streaming_sstable_for_write with subdir "staging"
+        return make_streaming_sstable_for_write("staging");
+    }
+
    mutation_source as_mutation_source() const;
+    mutation_source as_mutation_source_excluding(sstables::shared_sstable sst) const;

    void set_virtual_reader(mutation_source virtual_reader) {
        _virtual_reader = std::move(virtual_reader);
@@ -683,7 +719,7 @@ public:
        query::result_memory_limiter& memory_limiter,
        uint64_t max_result_size,
        db::timeout_clock::time_point timeout = db::no_timeout,
-        querier_cache_context cache_ctx = { });
+        query::querier_cache_context cache_ctx = { });

    void start();
    future<> stop();
@@ -837,6 +873,8 @@ public:
    void clear_views();
    const std::vector<view_ptr>& views() const;
    future<row_locker::lock_holder> push_view_replica_updates(const schema_ptr& s, const frozen_mutation& fm, db::timeout_clock::time_point timeout) const;
+    future<row_locker::lock_holder> push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout) const;
+    future<row_locker::lock_holder> stream_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, sstables::shared_sstable excluded_sstable) const;
    void add_coordinator_read_latency(utils::estimated_histogram::duration latency);
    std::chrono::milliseconds get_coordinator_read_latency_percentile(double percentile);

@@ -854,13 +892,17 @@ public:
            dht::token base_token,
            flat_mutation_reader&&);

+    reader_concurrency_semaphore& read_concurrency_semaphore() { // exposes the semaphore referenced by this table's config
+        return *_config.read_concurrency_semaphore; // NOTE(review): dereferences unconditionally - assumes the config member is always set; confirm
+    }
+
 private:
+    future<row_locker::lock_holder> do_push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, mutation_source&& source) const;
    std::vector<view_ptr> affected_views(const schema_ptr& base, const mutation& update) const;
    future<> generate_and_propagate_view_updates(const schema_ptr& base,
            std::vector<view_ptr>&& views,
            mutation&& m,
-            flat_mutation_reader_opt existings,
-            db::timeout_clock::time_point timeout) const;
+            flat_mutation_reader_opt existings) const;

    mutable row_locker _row_locker;
    future<row_locker::lock_holder> local_base_lock(
@@ -1029,6 +1071,7 @@ public:
 class keyspace {
 public:
    struct config {
+        std::vector<sstring> all_datadirs;
        sstring datadir;
        bool enable_commitlog = true;
        bool enable_disk_reads = true;
@@ -1049,6 +1092,7 @@ public:
        seastar::scheduling_group streaming_scheduling_group;
        bool enable_metrics_reporting = false;
        db::timeout_semaphore* view_update_concurrency_semaphore = nullptr;
+        size_t view_update_concurrency_semaphore_limit;
    };
 private:
    std::unique_ptr<locator::abstract_replication_strategy> _replication_strategy;
@@ -1106,6 +1150,7 @@ public:
        return _config.datadir;
    }

+    sstring column_family_directory(const sstring& base_path, const sstring& name, utils::UUID uuid) const;
    sstring column_family_directory(const sstring& name, utils::UUID uuid) const;
 };

@@ -1149,6 +1194,7 @@ private:
    static const size_t max_count_system_concurrent_reads{10};
    size_t max_memory_system_concurrent_reads() { return _dbcfg.available_memory * 0.02; };
    static constexpr size_t max_concurrent_sstable_loads() { return 3; }
+    size_t max_memory_pending_view_updates() const { return _dbcfg.available_memory * 0.1; } // 10% of available_memory; used as the initial count of _view_update_concurrency_sem

    struct db_stats {
        uint64_t total_writes = 0;
@@ -1160,6 +1206,11 @@ private:

        uint64_t short_data_queries = 0;
        uint64_t short_mutation_queries = 0;
+
+        uint64_t multishard_query_unpopped_fragments = 0;
+        uint64_t multishard_query_unpopped_bytes = 0;
+        uint64_t multishard_query_failed_reader_stops = 0;
+        uint64_t multishard_query_failed_reader_saves = 0;
    };

    lw_shared_ptr<db_stats> _stats;
@@ -1180,11 +1231,11 @@ private:

    semaphore _sstable_load_concurrency_sem{max_concurrent_sstable_loads()};

-    db::timeout_semaphore _view_update_concurrency_sem{100}; // Stand-in hack for #2538
+    db::timeout_semaphore _view_update_concurrency_sem{max_memory_pending_view_updates()};

    cache_tracker _row_cache_tracker;

-    concrete_execution_stage<future<lw_shared_ptr<query::result>>,
+    inheriting_concrete_execution_stage<future<lw_shared_ptr<query::result>>,
        column_family*,
        schema_ptr,
        const query::read_command&,
@@ -1194,10 +1245,17 @@ private:
        query::result_memory_limiter&,
        uint64_t,
        db::timeout_clock::time_point,
-        querier_cache_context> _data_query_stage;
+        query::querier_cache_context> _data_query_stage;

    mutation_query_stage _mutation_query_stage;

+    inheriting_concrete_execution_stage<
+            future<>,
+            database*,
+            schema_ptr,
+            const frozen_mutation&,
+            db::timeout_clock::time_point> _apply_stage;
+
    std::unordered_map<sstring, keyspace> _keyspaces;
    std::unordered_map<utils::UUID, lw_shared_ptr<column_family>> _column_families;
    std::unordered_map<std::pair<sstring, sstring>, utils::UUID, utils::tuple_hash> _ks_cf_to_uuid;
@@ -1208,7 +1266,7 @@ private:
    seastar::metrics::metric_groups _metrics;
    bool _enable_incremental_backups = false;

-    querier_cache _querier_cache;
+    query::querier_cache _querier_cache;

    std::unique_ptr<db::large_partition_handler> _large_partition_handler;

@@ -1380,6 +1438,12 @@ public:
    std::unordered_set<sstring> get_initial_tokens();
    std::experimental::optional<gms::inet_address> get_replace_address();
    bool is_replacing();
+    reader_concurrency_semaphore& user_read_concurrency_sem() {
+        return _read_concurrency_sem;
+    }
+    reader_concurrency_semaphore& streaming_read_concurrency_sem() {
+        return _streaming_concurrency_sem;
+    }
    reader_concurrency_semaphore& system_keyspace_read_concurrency_sem() {
        return _system_read_concurrency_sem;
    }
@@ -1396,15 +1460,25 @@ public:
        _querier_cache.set_entry_ttl(entry_ttl);
    }

-    const querier_cache::stats& get_querier_cache_stats() const {
+    const query::querier_cache::stats& get_querier_cache_stats() const {
        return _querier_cache.get_stats();
    }

+    query::querier_cache& get_querier_cache() {
+        return _querier_cache;
+    }
+
+    db::view::update_backlog get_view_update_backlog() const { // built from the view-update semaphore's state
+        return {max_memory_pending_view_updates() - _view_update_concurrency_sem.current(), max_memory_pending_view_updates()}; // {budget minus units currently available, total budget}
+    }
+
    friend class distributed_loader;
 };

 future<> update_schema_version_and_announce(distributed<service::storage_proxy>& proxy);

+bool is_internal_keyspace(const sstring& name);
+
 class distributed_loader {
 public:
    static void reshard(distributed<database>& db, sstring ks_name, sstring cf_name);
--- a/db/batchlog_manager.cc
+++ b/db/batchlog_manager.cc
@@ -76,8 +76,7 @@ const uint32_t db::batchlog_manager::replay_interval;
 const uint32_t db::batchlog_manager::page_size;

 db::batchlog_manager::batchlog_manager(cql3::query_processor& qp)
-        : _qp(qp)
-        , _e1(_rd()) {
+        : _qp(qp) {
    namespace sm = seastar::metrics;

    _metrics.add_group("batchlog_manager", {
@@ -117,7 +116,7 @@ future<> db::batchlog_manager::start() {
    // round-robin scheduling.
    if (engine().cpu_id() == 0) {
        _timer.set_callback([this] {
-            return do_batch_log_replay().handle_exception([] (auto ep) {
+            do_batch_log_replay().handle_exception([] (auto ep) {
                blogger.error("Exception in batch replay: {}", ep);
            }).finally([this] {
                _timer.arm(lowres_clock::now() + std::chrono::milliseconds(replay_interval));
@@ -268,7 +267,7 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
                // send to partially or wholly fail in actually sending stuff. Since we don't
                // have hints (yet), send with CL=ALL, and hope we can re-do this soon.
                // See below, we use retry on write failure.
-                return _qp.proxy().mutate(mutations, db::consistency_level::ALL, nullptr);
+                return _qp.proxy().mutate(mutations, db::consistency_level::ALL, db::no_timeout, nullptr);
            });
        }).then_wrapped([this, id](future<> batch_result) {
            try {
--- a/db/batchlog_manager.hh
+++ b/db/batchlog_manager.hh
@@ -75,8 +75,7 @@ private:
    unsigned _cpu = 0;
    bool _stop = false;

-    std::random_device _rd;
-    std::default_random_engine _e1;
+    std::default_random_engine _e1{std::random_device{}()};

    future<> replay_all_failed_batches();
 public:
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -107,6 +107,11 @@ public:
    void process_bytes(const char* data, size_t size) {
        return _c.process(reinterpret_cast<const uint8_t*>(data), size);
    }
+    template<typename FragmentedBuffer>
+    GCC6_CONCEPT(requires FragmentRange<FragmentedBuffer>)
+    void process_fragmented(const FragmentedBuffer& buffer) { // fragmented-buffer counterpart of process_bytes
+        return _c.process_fragmented(buffer); // pure delegation to the wrapped _c object
+    }
 };

 class db::cf_holder {
@@ -308,10 +313,9 @@ public:
    uint64_t get_num_dirty_segments() const;
    uint64_t get_num_active_segments() const;

-    using buffer_type = temporary_buffer<char>;
+    using buffer_type = fragmented_temporary_buffer;

    buffer_type acquire_buffer(size_t s);
-    void release_buffer(buffer_type&&);

    future<std::vector<descriptor>> list_descriptors(sstring dir);

@@ -333,7 +337,6 @@ private:
    segment_id_type _ids = 0;
    std::vector<sseg_ptr> _segments;
    queue<sseg_ptr> _reserve_segments;
-    std::vector<buffer_type> _temp_buffers;
    std::unordered_map<flush_handler_id, flush_handler> _flush_handlers;
    flush_handler_id _flush_ids = 0;
    replay_position _flush_position;
@@ -344,6 +347,12 @@ private:
    uint64_t _new_counter = 0;
 };

+template<typename T, typename Output>
+static void write(Output& out, T value) { // writes `value` to `out` as sizeof(T) raw bytes
+    auto v = net::hton(value); // host-to-network byte order conversion before emitting
+    out.write(reinterpret_cast<const char*>(&v), sizeof(v)); // emit the object representation of the converted value
+}
+
 /*
 * A single commit log file on disk. Manages creation of the file and writing mutations to disk,
 * as well as tracking the last mutation position of any "dirty" CFs covered by the segment file. Segment
@@ -398,7 +407,6 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c

    uint64_t _file_pos = 0;
    uint64_t _flush_pos = 0;
-    uint64_t _buf_pos = 0;
    bool _closed = false;

    using buffer_type = segment_manager::buffer_type;
@@ -407,6 +415,7 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
    using time_point = segment_manager::time_point;

    buffer_type _buffer;
+    fragmented_temporary_buffer::ostream _buffer_ostream;
    std::unordered_map<cf_id_type, uint64_t> _cf_dirty;
    time_point _sync_time;
    seastar::gate _gate;
@@ -420,6 +429,10 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
    friend std::ostream& operator<<(std::ostream&, const segment&);
    friend class segment_manager;

+    size_t buffer_position() const { // stands in for the _buf_pos counter this patch removes
+        return _buffer.size_bytes() - _buffer_ostream.size(); // NOTE(review): presumably ostream::size() is the space not yet written, making this the bytes used so far - confirm
+    }
+
    future<> begin_flush() {
        // This is maintaining the semantica of only using the write-lock
        // as a gate for flushing, i.e. once we've begun a flush for position X
@@ -466,7 +479,7 @@ public:
            clogger.debug("Segment {} is no longer active and will submitted for delete now", *this);
            ++_segment_manager->totals.segments_destroyed;
            _segment_manager->totals.total_size_on_disk -= size_on_disk();
-            _segment_manager->totals.total_size -= (size_on_disk() + _buffer.size());
+            _segment_manager->totals.total_size -= (size_on_disk() + _buffer.size_bytes());
            _segment_manager->add_file_to_delete(_file_name, _desc);
        } else {
            clogger.warn("Segment {} is dirty and is left on disk.", *this);
@@ -607,29 +620,16 @@ public:
        auto a = align_up(s + overhead, alignment);
        auto k = std::max(a, default_size);

-        for (;;) {
-            try {
-                _buffer = _segment_manager->acquire_buffer(k);
-                break;
-            } catch (std::bad_alloc&) {
-                clogger.warn("Could not allocate {} k bytes output buffer ({} k required)", k / 1024, a / 1024);
-                if (k > a) {
-                    k = std::max(a, k / 2);
-                    clogger.debug("Trying reduced size: {} k", k / 1024);
-                    continue;
-                }
-                throw;
-            }
-        }
-        _buf_pos = overhead;
-        auto * p = reinterpret_cast<uint32_t *>(_buffer.get_write());
-        std::fill(p, p + overhead, 0);
+        _buffer = _segment_manager->acquire_buffer(k);
+        _buffer_ostream = _buffer.get_ostream();
+        auto out = _buffer_ostream.write_substream(overhead);
+        out.fill('\0', overhead);
        _segment_manager->totals.total_size += k;
    }

    bool buffer_is_empty() const {
-        return _buf_pos <= segment_overhead_size
-                        || (_file_pos == 0 && _buf_pos <= (segment_overhead_size + descriptor_header_size));
+        return buffer_position() <= segment_overhead_size
+                        || (_file_pos == 0 && buffer_position() <= (segment_overhead_size + descriptor_header_size));
    }
    /**
     * Send any buffer contents to disk and get a new tmp buffer
@@ -641,35 +641,32 @@ public:
        }

        auto size = clear_buffer_slack();
-        auto buf = std::move(_buffer);
+        auto buf = std::exchange(_buffer, { });
        auto off = _file_pos;
        auto top = off + size;
        auto num = _num_allocs;

        _file_pos = top;
-        _buf_pos = 0;
+        _buffer_ostream = { };
        _num_allocs = 0;

        auto me = shared_from_this();
        assert(me.use_count() > 1);

-        auto * p = buf.get_write();
-        assert(std::count(p, p + 2 * sizeof(uint32_t), 0) == 2 * sizeof(uint32_t));
-
-        data_output out(p, p + buf.size());
+        auto out = buf.get_ostream();

        auto header_size = 0;

        if (off == 0) {
            // first block. write file header.
-            out.write(segment_magic);
-            out.write(_desc.ver);
-            out.write(_desc.id);
+            write(out, segment_magic);
+            write(out, _desc.ver);
+            write(out, _desc.id);
            crc32_nbo crc;
            crc.process(_desc.ver);
            crc.process<int32_t>(_desc.id & 0xffffffff);
            crc.process<int32_t>(_desc.id >> 32);
-            out.write(crc.checksum());
+            write(out, crc.checksum());
            header_size = descriptor_header_size;
        }

@@ -679,8 +676,8 @@ public:
        crc.process<int32_t>(_desc.id >> 32);
        crc.process(uint32_t(off + header_size));

-        out.write(uint32_t(_file_pos));
-        out.write(crc.checksum());
+        write(out, uint32_t(_file_pos));
+        write(out, crc.checksum());

        forget_schema_versions();

@@ -690,25 +687,32 @@ public:

        // The write will be allowed to start now, but flush (below) must wait for not only this,
        // but all previous write/flush pairs.
-        return _pending_ops.run_with_ordered_post_op(rp, [this, size, off, buf = std::move(buf)]() mutable {
-                auto written = make_lw_shared<size_t>(0);
-                auto p = buf.get();
-                return repeat([this, size, off, written, p]() mutable {
+        return _pending_ops.run_with_ordered_post_op(rp, [this, size, off, buf = std::move(buf)]() mutable {
+            auto view = fragmented_temporary_buffer::view(buf);
+            view.remove_suffix(buf.size_bytes() - size);
+            assert(size == view.size_bytes());
+            return do_with(off, view, [&] (uint64_t& off, fragmented_temporary_buffer::view& view) {
+                if (view.empty()) {
+                    return make_ready_future<>();
+                }
+                return repeat([this, size, &off, &view] {
                    auto&& priority_class = service::get_local_commitlog_priority();
-                    return _file.dma_write(off + *written, p + *written, size - *written, priority_class).then_wrapped([this, size, written](future<size_t>&& f) {
+                    auto current = *view.begin();
+                    return _file.dma_write(off, current.data(), current.size(), priority_class).then_wrapped([this, size, &off, &view](future<size_t>&& f) {
                        try {
                            auto bytes = std::get<0>(f.get());
-                            *written += bytes;
                            _segment_manager->totals.bytes_written += bytes;
                            _segment_manager->totals.total_size_on_disk += bytes;
                            ++_segment_manager->totals.cycle_count;
-                            if (*written == size) {
+                            if (bytes == view.size_bytes()) {
                                return make_ready_future<stop_iteration>(stop_iteration::yes);
                            }
                            // gah, partial write. should always get here with dma chunk sized
                            // "bytes", but lets make sure...
-                            clogger.debug("Partial write {}: {}/{} bytes", *this, *written, size);
-                            *written = align_down(*written, alignment);
+                            bytes = align_down(bytes, alignment);
+                            off += bytes;
+                            view.remove_prefix(bytes);
+                            clogger.debug("Partial write {}: {}/{} bytes", *this, size - view.size_bytes(), size);
                            return make_ready_future<stop_iteration>(stop_iteration::no);
                            // TODO: retry/ignore/fail/stop - optional behaviour in origin.
                            // we fast-fail the whole commit.
@@ -717,10 +721,10 @@ public:
                            throw;
                        }
                    });
-                }).finally([this, buf = std::move(buf), size]() mutable {
-                    _segment_manager->release_buffer(std::move(buf));
-                    _segment_manager->notify_memory_written(size);
                });
+            }).finally([this, buf = std::move(buf), size] {
+                    _segment_manager->notify_memory_written(size);
+            });
        }, [me, flush_after, top, rp] { // lambda instead of bind, so we keep "me" alive.
            assert(me->_pending_ops.has_operation(rp));
            return flush_after ? me->do_flush(top) : make_ready_future<sseg_ptr>(me);
@@ -786,7 +790,7 @@ public:
            return finish_and_get_new(timeout).then([id, writer = std::move(writer), permit = std::move(permit), timeout] (auto new_seg) mutable {
                return new_seg->allocate(id, std::move(writer), std::move(permit), timeout);
            });
-        } else if (!_buffer.empty() && (s > (_buffer.size() - _buf_pos))) {  // enough data?
+        } else if (!_buffer.empty() && (s > _buffer_ostream.size())) {  // enough data?
            if (_segment_manager->cfg.mode == sync_mode::BATCH) {
                // TODO: this could cause starvation if we're really unlucky.
                // If we run batch mode and find ourselves not fit in a non-empty
@@ -805,7 +809,7 @@ public:
        size_t buf_memory = s;
        if (_buffer.empty()) {
            new_buffer(s);
-            buf_memory += _buf_pos;
+            buf_memory += buffer_position();
        }

        _gate.enter(); // this might throw. I guess we accept this?
@@ -813,29 +817,26 @@ public:
        _segment_manager->account_memory_usage(buf_memory);

        replay_position rp(_desc.id, position());
-        auto pos = _buf_pos;
-        _buf_pos += s;
        _cf_dirty[id]++; // increase use count for cf.

        rp_handle h(static_pointer_cast<cf_holder>(shared_from_this()), std::move(id), rp);

-        auto * p = _buffer.get_write() + pos;
-        auto * e = _buffer.get_write() + pos + s - sizeof(uint32_t);
-
-        data_output out(p, e);
+        auto out = _buffer_ostream.write_substream(s);
        crc32_nbo crc;

-        out.write(uint32_t(s));
+        write<uint32_t>(out, s);
        crc.process(uint32_t(s));
-        out.write(crc.checksum());
+        write<uint32_t>(out, crc.checksum());

        // actual data
-        writer->write(*this, out);
+        auto entry_out = out.write_substream(size);
+        auto entry_data = entry_out.to_input_stream();
+        writer->write(*this, entry_out);
+        entry_data.with_stream([&] (auto data_str) {
+            crc.process_fragmented(ser::buffer_view<typename std::vector<temporary_buffer<char>>::iterator>(data_str));
+        });

-        crc.process_bytes(p + 2 * sizeof(uint32_t), size);
-
-        out = data_output(e, sizeof(uint32_t));
-        out.write(crc.checksum());
+        write<uint32_t>(out, crc.checksum());

        ++_segment_manager->totals.allocation_count;
        ++_num_allocs;
@@ -850,7 +851,7 @@ public:
            // If this buffer alone is too big, potentially bigger than the maximum allowed size,
            // then no other request will be allowed in to force the cycle()ing of this buffer. We
            // have to do it ourselves.
-            if ((_buf_pos >= (db::commitlog::segment::default_size))) {
+            if ((buffer_position() >= (db::commitlog::segment::default_size))) {
                cycle().discard_result().handle_exception([] (auto ex) {
                    clogger.error("Failed to flush commits to disk: {}", ex);
                });
@@ -860,7 +861,7 @@ public:
    }

    position_type position() const {
-        return position_type(_file_pos + _buf_pos);
+        return position_type(_file_pos + buffer_position());
    }

    size_t size_on_disk() const {
@@ -870,11 +871,12 @@ public:
    // ensures no more of this segment is writeable, by allocating any unused section at the end and marking it discarded
    // a.k.a. zero the tail.
    size_t clear_buffer_slack() {
-        auto size = align_up(_buf_pos, alignment);
-        std::fill(_buffer.get_write() + _buf_pos, _buffer.get_write() + size,
-                0);
-        _segment_manager->totals.bytes_slack += (size - _buf_pos);
-        _segment_manager->account_memory_usage(size - _buf_pos);
+        auto buf_pos = buffer_position();
+        auto size = align_up(buf_pos, alignment);
+        auto fill_size = size - buf_pos;
+        _buffer_ostream.fill('\0', fill_size);
+        _segment_manager->totals.bytes_slack += fill_size;
+        _segment_manager->account_memory_usage(fill_size);
        return size;
    }
    void mark_clean(const cf_id_type& id, uint64_t count) {
@@ -1514,41 +1516,20 @@ uint64_t db::commitlog::segment_manager::get_num_active_segments() const {


 db::commitlog::segment_manager::buffer_type db::commitlog::segment_manager::acquire_buffer(size_t s) {
-    auto i = _temp_buffers.begin();
-    auto e = _temp_buffers.end();
+    s = align_up(s, segment::default_size);
+    auto fragment_count = s / segment::default_size;

-    while (i != e) {
-        if (i->size() >= s) {
-            auto r = std::move(*i);
-            _temp_buffers.erase(i);
-            totals.buffer_list_bytes -= r.size();
-            return r;
+    std::vector<temporary_buffer<char>> buffers;
+    buffers.reserve(fragment_count);
+    while (buffers.size() < fragment_count) {
+        auto a = ::memalign(segment::alignment, segment::default_size);
+        if (a == nullptr) {
+            throw std::bad_alloc();
        }
-        ++i;
-    }
-    auto a = ::memalign(segment::alignment, s);
-    if (a == nullptr) {
-        throw std::bad_alloc();
+        buffers.emplace_back(static_cast<char*>(a), segment::default_size, make_free_deleter(a));
    }
    clogger.trace("Allocated {} k buffer", s / 1024);
-    return buffer_type(reinterpret_cast<char *>(a), s, make_free_deleter(a));
-}
-
-void db::commitlog::segment_manager::release_buffer(buffer_type&& b) {
-    _temp_buffers.emplace_back(std::move(b));
-    std::sort(_temp_buffers.begin(), _temp_buffers.end(), [](const buffer_type& b1, const buffer_type& b2) {
-        return b1.size() < b2.size();
-    });
-
-    constexpr const size_t max_temp_buffers = 4;
-
-    if (_temp_buffers.size() > max_temp_buffers) {
-        clogger.trace("Deleting {} buffers", _temp_buffers.size() - max_temp_buffers);
-        _temp_buffers.erase(_temp_buffers.begin() + max_temp_buffers, _temp_buffers.end());
-    }
-    totals.buffer_list_bytes = boost::accumulate(
-	    _temp_buffers | boost::adaptors::transformed(std::mem_fn(&buffer_type::size)),
-            size_t(0), std::plus<size_t>());
+    return fragmented_temporary_buffer(std::move(buffers), s);
 }

 /**
@@ -1694,14 +1675,14 @@ const db::commitlog::config& db::commitlog::active_config() const {
 // No commit_io_check needed in the log reader since the database will fail
 // on error at startup if required
 future<std::unique_ptr<subscription<temporary_buffer<char>, db::replay_position>>>
-db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func next, position_type off, const db::extensions* exts) {
+db::commitlog::read_log_file(const sstring& filename, seastar::io_priority_class read_io_prio_class, commit_load_reader_func next, position_type off, const db::extensions* exts) {
    struct work {
    private:
-        file_input_stream_options make_file_input_stream_options() {
+        file_input_stream_options make_file_input_stream_options(seastar::io_priority_class read_io_prio_class) {
            file_input_stream_options fo;
            fo.buffer_size = db::commitlog::segment::default_size;
            fo.read_ahead = 10;
-            fo.io_priority_class = service::get_local_commitlog_priority();
+            fo.io_priority_class = read_io_prio_class;
            return fo;
        }
    public:
@@ -1720,8 +1701,8 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne
        bool header = true;
        bool failed = false;

-        work(file f, position_type o = 0)
-                : f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options())), start_off(o) {
+        work(file f, seastar::io_priority_class read_io_prio_class, position_type o = 0)
+                : f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options(read_io_prio_class))), start_off(o) {
        }
        work(work&&) = default;

@@ -1939,9 +1920,9 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne
        return fut;
    });

-    return fut.then([off, next](file f) {
+    return fut.then([off, next, read_io_prio_class] (file f) {
        f = make_checked_file(commit_error_handler, std::move(f));
-        auto w = make_lw_shared<work>(std::move(f), off);
+        auto w = make_lw_shared<work>(std::move(f), read_io_prio_class, off);
        auto ret = w->s.listen(next);

        w->s.started().then(std::bind(&work::read_file, w.get())).then([w] {
--- a/db/commitlog/commitlog.hh
+++ b/db/commitlog/commitlog.hh
@@ -42,7 +42,6 @@

 #include <memory>

-#include "utils/data_output.hh"
 #include "core/future.hh"
 #include "core/shared_ptr.hh"
 #include "core/stream.hh"
@@ -176,7 +175,7 @@ public:
     * of data to be written. (See add).
     * Don't write less, absolutely don't write more...
     */
-    using output = data_output;
+    using output = fragmented_temporary_buffer::ostream;
    using serializer_func = std::function<void(output&)>;

    /**
@@ -356,7 +355,7 @@ public:
    };

    static future<std::unique_ptr<subscription<temporary_buffer<char>, replay_position>>> read_log_file(
-            const sstring&, commit_load_reader_func, position_type = 0, const db::extensions* = nullptr);
+            const sstring&, seastar::io_priority_class read_io_prio_class, commit_load_reader_func, position_type = 0, const db::extensions* = nullptr);
 private:
    commitlog(config);

--- a/db/commitlog/commitlog_entry.cc
+++ b/db/commitlog/commitlog_entry.cc
@@ -51,9 +51,8 @@ void commitlog_entry_writer::compute_size() {
    _size = ms.size();
 }

-void commitlog_entry_writer::write(data_output& out) const {
-    seastar::simple_output_stream str(out.reserve(size()), size());
-    serialize(str);
+void commitlog_entry_writer::write(typename seastar::memory_output_stream<std::vector<temporary_buffer<char>>::iterator>& out) const {
+    serialize(out);
 }

 commitlog_entry_reader::commitlog_entry_reader(const temporary_buffer<char>& buffer)
--- a/db/commitlog/commitlog_entry.hh
+++ b/db/commitlog/commitlog_entry.hh
@@ -25,7 +25,6 @@

 #include "frozen_mutation.hh"
 #include "schema.hh"
-#include "utils/data_output.hh"
 #include "stdx.hh"

 class commitlog_entry {
@@ -35,7 +34,8 @@ public:
    commitlog_entry(stdx::optional<column_mapping> mapping, frozen_mutation&& mutation)
        : _mapping(std::move(mapping)), _mutation(std::move(mutation)) { }
    const stdx::optional<column_mapping>& mapping() const { return _mapping; }
-    const frozen_mutation& mutation() const { return _mutation; }
+    const frozen_mutation& mutation() const & { return _mutation; }
+    frozen_mutation&& mutation() && { return std::move(_mutation); }
 };

 class commitlog_entry_writer {
@@ -72,7 +72,7 @@ public:
        return _mutation.representation().size();
    }

-    void write(data_output& out) const;
+    void write(typename seastar::memory_output_stream<std::vector<temporary_buffer<char>>::iterator>& out) const;
 };

 class commitlog_entry_reader {
@@ -81,5 +81,6 @@ public:
    commitlog_entry_reader(const temporary_buffer<char>& buffer);

    const stdx::optional<column_mapping>& get_column_mapping() const { return _ce.mapping(); }
-    const frozen_mutation& mutation() const { return _ce.mutation(); }
+    const frozen_mutation& mutation() const & { return _ce.mutation(); }
+    frozen_mutation&& mutation() && { return std::move(_ce).mutation(); }
 };
--- a/db/commitlog/commitlog_replayer.cc
+++ b/db/commitlog/commitlog_replayer.cc
@@ -58,6 +58,7 @@
 #include "converting_mutation_partition_applier.hh"
 #include "schema_registry.hh"
 #include "commitlog_entry.hh"
+#include "service/priority_manager.hh"

 static logging::logger rlogger("commitlog_replayer");

@@ -163,7 +164,7 @@ future<> db::commitlog_replayer::impl::init() {
                // Get all truncation records for the CF and initialize max rps if
                // present. Cannot do this on demand, as there may be no sstables to
                // mark the CF as "needed".
-                return db::system_keyspace::get_truncated_position(uuid).then([&map, &uuid](std::vector<db::replay_position> tpps) {
+                return db::system_keyspace::get_truncated_position(uuid).then([&map, uuid](std::vector<db::replay_position> tpps) {
                    for (auto& p : tpps) {
                        rlogger.trace("CF {} truncated at {}", uuid, p);
                        auto& pp = map[p.shard_id()][uuid];
@@ -223,7 +224,7 @@ db::commitlog_replayer::impl::recover(sstring file, const sstring& fname_prefix)
    auto s = make_lw_shared<stats>();
    auto& exts = _qp.local().db().local().get_config().extensions();

-    return db::commitlog::read_log_file(file,
+    return db::commitlog::read_log_file(file, service::get_local_commitlog_priority(),
            std::bind(&impl::process, this, s.get(), std::placeholders::_1,
                    std::placeholders::_2), p, &exts).then([](auto s) {
        auto f = s->done();
--- a/db/config.cc
+++ b/db/config.cc
@@ -102,6 +102,8 @@ db::config::config()
 db::config::~config()
 {}

+const sstring db::config::default_tls_priority("SECURE128:-VERS-TLS1.0");
+
 namespace utils {

 template<>
--- a/db/config.hh
+++ b/db/config.hh
@@ -155,6 +155,9 @@ public:
    val(hints_directory, sstring, "/var/lib/scylla/hints", Used,   \
            "The directory where hints files are stored if hinted handoff is enabled."   \
    )                                           \
+    val(view_hints_directory, sstring, "/var/lib/scylla/view_hints", Used,   \
+            "The directory where materialized-view updates are stored while a view replica is unreachable."   \
+    )                                           \
    val(saved_caches_directory, sstring, "/var/lib/scylla/saved_caches", Unused, \
            "The directory location where table key and row caches are stored."  \
    )                                                   \
@@ -453,7 +456,7 @@ public:
            "The maximum number of tombstones a query can scan before aborting."  \
    )   \
    /* Network timeout settings */  \
-    val(range_request_timeout_in_ms, uint32_t, 10000, Unused,     \
+    val(range_request_timeout_in_ms, uint32_t, 10000, Used,     \
            "The time in milliseconds that the coordinator waits for sequential or index scans to complete."  \
    )   \
    val(read_request_timeout_in_ms, uint32_t, 5000, Used,     \
@@ -472,7 +475,7 @@ public:
            "The time in milliseconds that the coordinator waits for write operations to complete.\n"  \
            "Related information: About hinted handoff writes"  \
    )   \
-    val(request_timeout_in_ms, uint32_t, 10000, Unused,     \
+    val(request_timeout_in_ms, uint32_t, 10000, Used,     \
            "The default timeout for other, miscellaneous operations.\n"  \
            "Related information: About hinted handoff writes"  \
    )   \
@@ -578,8 +581,8 @@ public:
    val(dynamic_snitch_update_interval_in_ms, uint32_t, 100, Unused,     \
            "The time interval for how often the snitch calculates node scores. Because score calculation is CPU intensive, be careful when reducing this interval."  \
    )   \
-    val(hinted_handoff_enabled, sstring, "false", Used,     \
-            "Experimental: enable or disable hinted handoff. To enable per data center, add data center list. For example: hinted_handoff_enabled: DC1,DC2. A hint indicates that the write needs to be replayed to an unavailable node. " \
+    val(hinted_handoff_enabled, sstring, "true", Used,     \
+            "Enable or disable hinted handoff. To enable per data center, add data center list. For example: hinted_handoff_enabled: DC1,DC2. A hint indicates that the write needs to be replayed to an unavailable node. " \
            "Related information: About hinted handoff writes"  \
    )   \
    val(hinted_handoff_throttle_in_kb, uint32_t, 1024, Unused,     \
@@ -621,7 +624,7 @@ public:
    val(thrift_framed_transport_size_in_mb, uint32_t, 15, Unused,     \
            "Frame size (maximum field length) for Thrift. The frame is the row or part of the row the application is inserting."  \
    )   \
-    val(thrift_max_message_length_in_mb, uint32_t, 16, Unused,     \
+    val(thrift_max_message_length_in_mb, uint32_t, 16, Used,     \
            "The maximum length of a Thrift message in megabytes, including all fields and internal Thrift overhead (1 byte of overhead for each frame). Message length is usually used in conjunction with batches. A frame length greater than or equal to 24 accommodates a batch with four inserts, each of which is 24 bytes. The required message length is greater than or equal to 24+24+24+24+4 (number of frames)."  \
    )   \
    /* Security properties */   \
@@ -728,7 +731,7 @@ public:
    val(prometheus_address, sstring, "0.0.0.0", Used, "Prometheus listening address") \
    val(prometheus_prefix, sstring, "scylla", Used, "Set the prefix of the exported Prometheus metrics. Changing this will break Scylla's dashboard compatibility, do not change unless you know what you are doing.") \
    val(abort_on_lsa_bad_alloc, bool, false, Used, "Abort when allocation in LSA region fails") \
-    val(murmur3_partitioner_ignore_msb_bits, unsigned, 0, Used, "Number of most siginificant token bits to ignore in murmur3 partitioner; increase for very large clusters") \
+    val(murmur3_partitioner_ignore_msb_bits, unsigned, 12, Used, "Number of most siginificant token bits to ignore in murmur3 partitioner; increase for very large clusters") \
    val(virtual_dirty_soft_limit, double, 0.6, Used, "Soft limit of virtual dirty memory expressed as a portion of the hard limit") \
    val(sstable_summary_ratio, double, 0.0005, Used, "Enforces that 1 byte of summary is written for every N (2000 by default) " \
        "bytes written to data file. Value must be between 0 and 1.") \
@@ -739,6 +742,7 @@ public:
        " Performance is affected to some extent as a result. Useful to help debugging problems that may arise at another layers.") \
    val(cpu_scheduler, bool, true, Used, "Enable cpu scheduling") \
    val(view_building, bool, true, Used, "Enable view building; should only be set to false when the node is experience issues due to view building") \
+    val(enable_sstables_mc_format, bool, false, Used, "Enable SSTables 'mc' format to be used as the default file format") \
    /* done! */

 #define _make_value_member(name, type, deflt, status, desc, ...)    \
@@ -752,6 +756,8 @@ public:
    add_options(boost::program_options::options_description_easy_init&);

    const db::extensions& extensions() const;
+
+    static const sstring default_tls_priority;
 private:
    template<typename T>
    struct log_legacy_value : public named_value<T, value_status::Used> {
--- a/db/consistency_level.cc
+++ b/db/consistency_level.cc
@@ -253,8 +253,12 @@ filter_for_query(consistency_level cl,
    return selected_endpoints;
 }

-std::vector<gms::inet_address> filter_for_query(consistency_level cl, keyspace& ks, std::vector<gms::inet_address>& live_endpoints, column_family* cf) {
-    return filter_for_query(cl, ks, live_endpoints, {}, read_repair_decision::NONE, nullptr, cf);
+std::vector<gms::inet_address> filter_for_query(consistency_level cl,
+        keyspace& ks,
+        std::vector<gms::inet_address>& live_endpoints,
+        const std::vector<gms::inet_address>& preferred_endpoints,
+        column_family* cf) {
+    return filter_for_query(cl, ks, live_endpoints, preferred_endpoints, read_repair_decision::NONE, nullptr, cf);
 }

 bool
--- a/db/consistency_level.hh
+++ b/db/consistency_level.hh
@@ -84,7 +84,11 @@ filter_for_query(consistency_level cl,
                 gms::inet_address* extra,
                 column_family* cf);

-std::vector<gms::inet_address> filter_for_query(consistency_level cl, keyspace& ks, std::vector<gms::inet_address>& live_endpoints, column_family* cf);
+std::vector<gms::inet_address> filter_for_query(consistency_level cl,
+        keyspace& ks,
+        std::vector<gms::inet_address>& live_endpoints,
+        const std::vector<gms::inet_address>& preferred_endpoints,
+        column_family* cf);

 struct dc_node_count {
    size_t live = 0;
--- a/db/cql_type_parser.cc
+++ b/db/cql_type_parser.cc
@@ -49,7 +49,10 @@
 #include "types.hh"

 static ::shared_ptr<cql3::cql3_type::raw> parse_raw(const sstring& str) {
-    return cql3::util::do_with_parser(str,  std::mem_fn(&cql3_parser::CqlParser::comparatorType));
+    return cql3::util::do_with_parser(str,
+        [] (cql3_parser::CqlParser& parser) {
+            return parser.comparator_type(true);
+        });
 }

 data_type db::cql_type_parser::parse(const sstring& keyspace, const sstring& str, lw_shared_ptr<user_types_metadata> user_types) {
--- a/db/heat_load_balance.cc
+++ b/db/heat_load_balance.cc
@@ -28,8 +28,7 @@ logging::logger hr_logger("heat_load_balance");
 // Return a uniformly-distributed random number in [0,1)
 // We use per-thread state for thread safety.  We seed the random number generator
 // once with a real random value, if available,
-static thread_local std::random_device r;
-static thread_local std::default_random_engine random_engine(r());
+static thread_local std::default_random_engine random_engine{std::random_device{}()};
 float
 rand_float() {
    static thread_local std::uniform_real_distribution<float> u(0, 1);
--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -20,9 +20,11 @@
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

+#include <algorithm>
 #include <seastar/core/future.hh>
 #include <seastar/core/seastar.hh>
 #include <seastar/core/gate.hh>
+#include <boost/range/adaptors.hpp>
 #include "service/storage_service.hh"
 #include "utils/div_ceil.hh"
 #include "db/config.hh"
@@ -33,6 +35,9 @@
 #include "disk-error-handler.hh"
 #include "lister.hh"
 #include "db/timeout_clock.hh"
+#include "service/priority_manager.hh"
+
+using namespace std::literals::chrono_literals;

 namespace db {
 namespace hints {
@@ -74,6 +79,9 @@ void manager::register_metrics(const sstring& group_name) {

        sm::make_derive("sent", _stats.sent,
                        sm::description("Number of sent hints.")),
+
+        sm::make_derive("discarded", _stats.discarded,
+                        sm::description("Number of hints that were discarded during sending (too old, schema changed, etc.).")),
    });
 }

@@ -91,6 +99,7 @@ future<> manager::start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr
        return compute_hints_dir_device_id();
    }).then([this] {
        _strorage_service_anchor->register_subscriber(this);
+        set_started();
    });
 }

@@ -101,7 +110,7 @@ future<> manager::stop() {
        _strorage_service_anchor->unregister_subscriber(this);
    }

-    _stopping = true;
+    set_stopping();

    return _draining_eps_gate.close().finally([this] {
        return parallel_for_each(_ep_managers, [] (auto& pair) {
@@ -273,7 +282,7 @@ inline bool manager::have_ep_manager(ep_key_type ep) const noexcept {
 }

 bool manager::store_hint(ep_key_type ep, schema_ptr s, lw_shared_ptr<const frozen_mutation> fm, tracing::trace_state_ptr tr_state) noexcept {
-    if (_stopping || !can_hint_for(ep)) {
+    if (stopping() || !started() || !can_hint_for(ep)) {
        manager_logger.trace("Can't store a hint to {}", ep);
        ++_stats.dropped;
        return false;
@@ -376,7 +385,7 @@ future<timespec> manager::end_point_hints_manager::sender::get_last_file_modific
    });
 }

-future<> manager::end_point_hints_manager::sender::do_send_one_mutation(mutation m, const std::vector<gms::inet_address>& natural_endpoints) noexcept {
+future<> manager::end_point_hints_manager::sender::do_send_one_mutation(frozen_mutation_and_schema m, const std::vector<gms::inet_address>& natural_endpoints) noexcept {
    return futurize_apply([this, m = std::move(m), &natural_endpoints] () mutable -> future<> {
        // The fact that we send with CL::ALL in both cases below ensures that new hints are not going
        // to be generated as a result of hints sending.
@@ -385,7 +394,11 @@ future<> manager::end_point_hints_manager::sender::do_send_one_mutation(mutation
            return _proxy.send_to_endpoint(std::move(m), end_point_key(), { }, write_type::SIMPLE);
        } else {
            manager_logger.trace("Endpoints set has changed and {} is no longer a replica. Mutating from scratch...", end_point_key());
-            return _proxy.mutate({std::move(m)}, consistency_level::ALL, nullptr);
+            // FIXME: 1h is used here as a stand-in for an infinite timeout. If a node is
+            // down, we should get an unavailable exception.
+            auto timeout = db::timeout_clock::now() + 1h;
+            // FIXME: Add the required frozen_mutation overloads (presumably so the unfreeze() on the next line can be dropped).
+            return _proxy.mutate({m.fm.unfreeze(m.s)}, consistency_level::ALL, timeout, nullptr);
        }
    });
 }
@@ -411,21 +424,19 @@ bool manager::end_point_hints_manager::sender::can_send() noexcept {
    }
 }

-mutation manager::end_point_hints_manager::sender::get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf) {
+frozen_mutation_and_schema manager::end_point_hints_manager::sender::get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf) {
    hint_entry_reader hr(buf);
    auto& fm = hr.mutation();
    auto& cm = get_column_mapping(std::move(ctx_ptr), fm, hr);
-    auto& cf = _db.find_column_family(fm.column_family_id());
+    auto schema = _db.find_schema(fm.column_family_id());

-    if (cf.schema()->version() != fm.schema_version()) {
-        mutation m(cf.schema(), fm.decorated_key(*cf.schema()));
-        converting_mutation_partition_applier v(cm, *cf.schema(), m.partition());
+    if (schema->version() != fm.schema_version()) {
+        mutation m(schema, fm.decorated_key(*schema));
+        converting_mutation_partition_applier v(cm, *schema, m.partition());
        fm.partition().accept(cm, v);
-
-        return std::move(m);
-    } else {
-        return fm.unfreeze(cf.schema());
+        return {freeze(m), std::move(schema)};
    }
+    return {std::move(hr).mutation(), std::move(schema)};
 }

 const column_mapping& manager::end_point_hints_manager::sender::get_column_mapping(lw_shared_ptr<send_one_file_ctx> ctx_ptr, const frozen_mutation& fm, const hint_entry_reader& hr) {
@@ -495,7 +506,7 @@ bool manager::check_dc_for(ep_key_type ep) const noexcept {
 }

 void manager::drain_for(gms::inet_address endpoint) {
-    if (_stopping) {
+    if (stopping()) {
        return;
    }

@@ -536,6 +547,7 @@ manager::end_point_hints_manager::sender::sender(end_point_hints_manager& parent
    , _resource_manager(_shard_manager._resource_manager)
    , _proxy(local_storage_proxy)
    , _db(local_db)
+    , _hints_cpu_sched_group(_db.get_streaming_scheduling_group())
    , _gossiper(local_gossiper)
    , _file_update_mutex(_ep_manager.file_update_mutex())
 {}
@@ -548,6 +560,7 @@ manager::end_point_hints_manager::sender::sender(const sender& other, end_point_
    , _resource_manager(_shard_manager._resource_manager)
    , _proxy(other._proxy)
    , _db(other._db)
+    , _hints_cpu_sched_group(other._hints_cpu_sched_group)
    , _gossiper(other._gossiper)
    , _file_update_mutex(_ep_manager.file_update_mutex())
 {}
@@ -603,7 +616,10 @@ manager::end_point_hints_manager::sender::clock::duration manager::end_point_hin
 }

 void manager::end_point_hints_manager::sender::start() {
-    _stopped = seastar::async([this] {
+    seastar::thread_attributes attr;
+
+    attr.sched_group = _hints_cpu_sched_group;
+    _stopped = seastar::async(std::move(attr), [this] {
        manager_logger.trace("ep_manager({})::sender: started", end_point_key());
        while (!stopping()) {
            try {
@@ -623,10 +639,11 @@ void manager::end_point_hints_manager::sender::start() {
    });
 }

-future<> manager::end_point_hints_manager::sender::send_one_mutation(mutation m) {
-    keyspace& ks = _db.find_keyspace(m.schema()->ks_name());
+future<> manager::end_point_hints_manager::sender::send_one_mutation(frozen_mutation_and_schema m) {
+    keyspace& ks = _db.find_keyspace(m.s->ks_name());
    auto& rs = ks.get_replication_strategy();
-    std::vector<gms::inet_address> natural_endpoints = rs.get_natural_endpoints(m.token());
+    auto token = dht::global_partitioner().get_token(*m.s, m.fm.key(*m.s));
+    std::vector<gms::inet_address> natural_endpoints = rs.get_natural_endpoints(std::move(token));

    return do_send_one_mutation(std::move(m), natural_endpoints);
 }
@@ -644,8 +661,8 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
                    return make_ready_future<>();
                }

-                mutation m = this->get_mutation(ctx_ptr, buf);
-                gc_clock::duration gc_grace_sec = m.schema()->gc_grace_seconds();
+                auto m = this->get_mutation(ctx_ptr, buf);
+                gc_clock::duration gc_grace_sec = m.s->gc_grace_seconds();

                // The hint is too old - drop it.
                //
@@ -666,10 +683,13 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
            // ignore these errors and move on - probably this hint is too old and the KS/CF has been deleted...
            } catch (no_such_column_family& e) {
                manager_logger.debug("send_hints(): no_such_column_family: {}", e.what());
+                ++this->shard_stats().discarded;
            } catch (no_such_keyspace& e) {
                manager_logger.debug("send_hints(): no_such_keyspace: {}", e.what());
+                ++this->shard_stats().discarded;
            } catch (no_column_mapping& e) {
-                manager_logger.debug("send_hints(): {}: {}", fname, e.what());
+                manager_logger.debug("send_hints(): {} at {}: {}", fname, rp, e.what());
+                ++this->shard_stats().discarded;
            }
            return make_ready_future<>();
        }).finally([units = std::move(units), ctx_ptr] {});
@@ -683,10 +703,10 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
 bool manager::end_point_hints_manager::sender::send_one_file(const sstring& fname) {
    timespec last_mod = get_last_file_modification(fname).get0();
    gc_clock::duration secs_since_file_mod = std::chrono::seconds(last_mod.tv_sec);
-    lw_shared_ptr<send_one_file_ctx> ctx_ptr = make_lw_shared<send_one_file_ctx>();
+    lw_shared_ptr<send_one_file_ctx> ctx_ptr = make_lw_shared<send_one_file_ctx>(_last_schema_ver_to_column_mapping);

    try {
-        auto s = commitlog::read_log_file(fname, [this, secs_since_file_mod, &fname, ctx_ptr] (temporary_buffer<char> buf, db::replay_position rp) mutable {
+        auto s = commitlog::read_log_file(fname, service::get_local_streaming_read_priority(), [this, secs_since_file_mod, &fname, ctx_ptr] (temporary_buffer<char> buf, db::replay_position rp) mutable {
            // Check that we can still send the next hint. Don't try to send it if the destination host
            // is DOWN or if we have already failed to send some of the previous hints.
            if (!draining() && ctx_ptr->state.contains(send_state::segment_replay_failed)) {
@@ -740,6 +760,7 @@ bool manager::end_point_hints_manager::sender::send_one_file(const sstring& fnam

    // clear the replay position - we are going to send the next segment...
    _last_not_complete_rp = replay_position();
+    _last_schema_ver_to_column_mapping.clear();
    manager_logger.trace("send_one_file(): segment {} was sent in full and deleted", fname);
    return true;
 }
@@ -752,7 +773,7 @@ void manager::end_point_hints_manager::sender::send_hints_maybe() noexcept {
    int replayed_segments_count = 0;

    try {
-        while (have_segments()) {
+        while (replay_allowed() && have_segments()) {
            if (!send_one_file(*_segments_to_replay.begin())) {
                break;
            }
@@ -777,5 +798,175 @@ void manager::end_point_hints_manager::sender::send_hints_maybe() noexcept {
    manager_logger.trace("send_hints(): we handled {} segments", replayed_segments_count);
 }

+/// \brief Scan \p hints_directory and invoke \p f for every sub-directory whose name parses as a shard ID.
+///
+/// Sub-directories whose names can not be converted to an integer are logged at the debug level and skipped.
+///
+/// \param hints_directory directory to scan (only entries of the "directory" type are visited)
+/// \param f callback invoked as f(lister::path dir, directory_entry de, <parsed shard ID>); has to return a future<>
+/// \return the future returned by lister::scan_dir()
+template<typename Func>
+static future<> scan_for_hints_dirs(const sstring& hints_directory, Func&& f) {
+    return lister::scan_dir(hints_directory, { directory_entry_type::directory }, [f = std::forward<Func>(f)] (lister::path dir, directory_entry de) {
+        // Parse the shard ID before "de" is moved from: the evaluation order of function arguments is unspecified,
+        // so reading de.name inside the very call expression that moves "de" may observe a moved-from string.
+        int shard_id;
+        try {
+            shard_id = std::stoi(de.name.c_str());
+        } catch (std::invalid_argument& ex) {
+            // The name is not a number.
+            manager_logger.debug("Ignore invalid directory {}", de.name);
+            return make_ready_future<>();
+        } catch (std::out_of_range& ex) {
+            // The name is a number that doesn't fit into an int - it can't be a shard ID either.
+            manager_logger.debug("Ignore invalid directory {}", de.name);
+            return make_ready_future<>();
+        }
+        return f(std::move(dir), std::move(de), shard_id);
+    });
+}
+
+// Builds the map "end point -> shard -> hints segments" from the on-disk layout of the hints directory.
+// runs in seastar::async context
+manager::hints_segments_map manager::get_current_hints_segments(const sstring& hints_directory) {
+    hints_segments_map segments;
+
+    // Level 1: per-shard directories
+    auto scan_done = scan_for_hints_dirs(hints_directory, [&segments] (lister::path shards_root, directory_entry shard_de, unsigned shard_id) {
+        manager_logger.trace("shard_id = {}", shard_id);
+        // Level 2: per-end-point (IP) directories
+        return lister::scan_dir(shards_root / shard_de.name.c_str(), { directory_entry_type::directory }, [&segments, shard_id] (lister::path shard_dir, directory_entry ep_de) {
+            manager_logger.trace("\tIP: {}", ep_de.name);
+            // Level 3: hints files
+            return lister::scan_dir(shard_dir / ep_de.name.c_str(), { directory_entry_type::regular }, [&segments, shard_id, ep_addr = ep_de.name] (lister::path ep_dir, directory_entry file_de) {
+                manager_logger.trace("\t\tfile: {}", file_de.name);
+                segments[ep_addr][shard_id].emplace_back(ep_dir / file_de.name.c_str());
+                return make_ready_future<>();
+            });
+        });
+    });
+    // Block until the whole tree has been walked - "segments" is captured by reference above.
+    scan_done.get();
+
+    return segments;
+}
+
+// Evenly redistributes the hints segments of every destination end point among the currently present shards.
+//
+// segments_map is the map built by get_current_hints_segments(); it is updated in place to reflect the new
+// distribution while the files themselves are moved by rebalance_segments_for().
+//
+// runs in seastar::async context
+void manager::rebalance_segments(const sstring& hints_directory, hints_segments_map& segments_map) {
+    // Count how many hints segments to each destination we have.
+    // The sum is kept in a size_t: list sizes are size_t values and must not be squeezed through an int.
+    std::unordered_map<sstring, size_t> per_ep_hints;
+    for (auto& ep_info : segments_map) {
+        size_t total = 0;
+        for (auto& shard_segments : ep_info.second | boost::adaptors::map_values) {
+            total += shard_segments.size();
+        }
+        per_ep_hints[ep_info.first] = total;
+        manager_logger.trace("{}: total files: {}", ep_info.first, total);
+    }
+
+    // Create a map of lists of segments that we will move (for each destination end point): if a shard has segments
+    // then we will NOT move q = int(N/S) segments out of them, where N is a total number of segments to the current
+    // destination and S is a current number of shards.
+    std::unordered_map<sstring, std::list<lister::path>> segments_to_move;
+    for (auto& [ep, ep_segments] : segments_map) {
+        size_t q = per_ep_hints[ep] / smp::count;
+        auto& current_segments_to_move = segments_to_move[ep];
+
+        for (auto& [shard_id, shard_segments] : ep_segments) {
+            // Move all segments from the shards that are no longer relevant (re-sharding to the lower number of shards)
+            if (shard_id >= smp::count) {
+                current_segments_to_move.splice(current_segments_to_move.end(), shard_segments);
+            } else if (shard_segments.size() > q) {
+                // Keep the first q segments where they are and give away the rest.
+                current_segments_to_move.splice(current_segments_to_move.end(), shard_segments, std::next(shard_segments.begin(), q), shard_segments.end());
+            }
+        }
+    }
+
+    // Since N (a total number of segments to a specific destination) may be not a multiple of S (a current number of
+    // shards) we will distribute files in two passes:
+    //    * if N = S * q + r, then
+    //       * one pass for segments_per_shard = q
+    //       * another one for segments_per_shard = q + 1.
+    //
+    // This way we will ensure as close to the perfect distribution as possible.
+    //
+    // Right till this point we haven't moved any segments. However we have created a logical separation of segments
+    // into two groups:
+    //    * Segments that are not going to be moved: segments in the segments_map.
+    //    * Segments that are going to be moved: segments in the segments_to_move.
+    //
+    // rebalance_segments_for() is going to consume segments from segments_to_move and move them to corresponding
+    // lists in the segments_map AND actually move segments to the corresponding shard's sub-directory till the requested
+    // segments_per_shard level is reached (see more details in the description of rebalance_segments_for()).
+    for (auto& [ep, N] : per_ep_hints) {
+        size_t q = N / smp::count;
+        size_t r = N - q * smp::count;
+        auto& current_segments_to_move = segments_to_move[ep];
+        auto& current_segments_map = segments_map[ep];
+
+        if (q) {
+            rebalance_segments_for(ep, q, hints_directory, current_segments_map, current_segments_to_move);
+        }
+
+        if (r) {
+            rebalance_segments_for(ep, q + 1, hints_directory, current_segments_map, current_segments_to_move);
+        }
+    }
+}
+
+// Hands out segments from segments_to_move to the present shards (in the shard ID order) until every shard owns
+// segments_per_shard segments of the given end point or until there is nothing left to hand out.
+//
+// runs in seastar::async context
+void manager::rebalance_segments_for(
+        const sstring& ep,
+        size_t segments_per_shard,
+        const sstring& hints_directory,
+        hints_ep_segments_map& ep_segments,
+        std::list<lister::path>& segments_to_move)
+{
+    manager_logger.trace("{}: segments_per_shard: {}, total number of segments to move: {}", ep, segments_per_shard, segments_to_move.size());
+
+    // Nothing to do if the requested level is zero or there are no segments to hand out.
+    if (!segments_per_shard || segments_to_move.empty()) {
+        return;
+    }
+
+    const lister::path hints_root(hints_directory.c_str());
+
+    for (unsigned shard = 0; shard < smp::count; ++shard) {
+        if (segments_to_move.empty()) {
+            break;
+        }
+
+        const lister::path target_dir = hints_root / seastar::format("{:d}", shard).c_str() / ep.c_str();
+        auto& shard_segments = ep_segments[shard];
+
+        // The destination directory may not exist yet - create it if needed.
+        io_check(recursive_touch_directory, target_dir.c_str()).get();
+
+        while (!segments_to_move.empty() && shard_segments.size() < segments_per_shard) {
+            auto next = segments_to_move.begin();
+            const lister::path destination = target_dir / next->filename();
+
+            if (*next == destination) {
+                // The file is already in its place - renaming it onto itself is pointless.
+                manager_logger.trace("skipping: {}", *next);
+            } else {
+                manager_logger.trace("going to move: {} -> {}", *next, destination);
+                io_check(rename_file, next->native(), destination.native()).get();
+            }
+            // Account the segment to the current shard (moves exactly one list node).
+            shard_segments.splice(shard_segments.end(), segments_to_move, next);
+        }
+    }
+}
+
+// Deletes the sub-directories of the shards whose ID is not below the current smp::count.
+//
+// runs in seastar::async context
+void manager::remove_irrelevant_shards_directories(const sstring& hints_directory) {
+    // shards level
+    auto cleanup_done = scan_for_hints_dirs(hints_directory, [] (lister::path dir, directory_entry de, unsigned shard_id) {
+        // Directories of the shards that are still present are left untouched.
+        if (shard_id < smp::count) {
+            return make_ready_future<>();
+        }
+
+        const lister::path stale_shard_dir = dir / de.name.c_str();
+        // IPs level: remove every entry (hidden ones included) and then the shard directory itself.
+        return lister::scan_dir(stale_shard_dir, { directory_entry_type::directory, directory_entry_type::regular }, lister::show_hidden::yes, [] (lister::path parent, directory_entry entry) {
+            return io_check(remove_file, (parent / entry.name.c_str()).native());
+        }).then([stale_shard_dir] {
+            return io_check(remove_file, stale_shard_dir.native());
+        });
+    });
+    cleanup_done.get();
+}
+
+// Runs the three rebalancing steps one after another inside a seastar::async context.
+future<> manager::rebalance(sstring hints_directory) {
+    return seastar::async([root = std::move(hints_directory)] {
+        // Step 1: collect the hints segments that are currently on the disk.
+        auto segments = get_current_hints_segments(root);
+
+        // Step 2: move the segments so that they are spread evenly among all present shards.
+        rebalance_segments(root, segments);
+
+        // Step 3: drop the directories of the shards that don't exist anymore - they should be empty by now.
+        remove_irrelevant_shards_directories(root);
+    });
+}
+
+// Records the reported backlog figures and allows or forbids hinting depending on whether the backlog is still
+// below the maximum.
+void manager::update_backlog(size_t backlog, size_t max_backlog) {
+    _backlog_size = backlog;
+    _max_backlog_size = max_backlog;
+
+    const bool quota_exhausted = backlog >= max_backlog;
+    if (quota_exhausted) {
+        forbid_hints_for_eps_with_pending_hints();
+        return;
+    }
+    allow_hints();
+}
+
 }
 }
--- a/db/hints/manager.hh
+++ b/db/hints/manager.hh
@@ -31,6 +31,7 @@
 #include <seastar/core/timer.hh>
 #include <seastar/core/lowres_clock.hh>
 #include <seastar/core/shared_mutex.hh>
+#include "lister.hh"
 #include "gms/gossiper.hh"
 #include "locator/snitch_base.hh"
 #include "service/endpoint_lifecycle_subscriber.hh"
@@ -58,11 +59,19 @@ private:
        uint64_t errors = 0;
        uint64_t dropped = 0;
        uint64_t sent = 0;
+        uint64_t discarded = 0;
    };

+    // map: shard -> segments
+    using hints_ep_segments_map = std::unordered_map<unsigned, std::list<lister::path>>;
+    // map: IP -> map: shard -> segments
+    using hints_segments_map = std::unordered_map<sstring, hints_ep_segments_map>;
+
    class drain_tag {};
    using drain = seastar::bool_class<drain_tag>;

+    friend class space_watchdog;
+
 public:
    class end_point_hints_manager {
    public:
@@ -94,7 +103,10 @@ public:
                send_state::restart_segment>>;

            struct send_one_file_ctx {
-                std::unordered_map<table_schema_version, column_mapping> schema_ver_to_column_mapping;
+                send_one_file_ctx(std::unordered_map<table_schema_version, column_mapping>& last_schema_ver_to_column_mapping)
+                    : schema_ver_to_column_mapping(last_schema_ver_to_column_mapping)
+                {}
+                std::unordered_map<table_schema_version, column_mapping>& schema_ver_to_column_mapping;
                seastar::gate file_send_gate;
                std::unordered_set<db::replay_position> rps_set; // number of elements in this set is never going to be greater than the maximum send queue length
                send_state_set state;
@@ -103,6 +115,7 @@ public:
        private:
            std::list<sstring> _segments_to_replay;
            replay_position _last_not_complete_rp;
+            std::unordered_map<table_schema_version, column_mapping> _last_schema_ver_to_column_mapping;
            state_set _state;
            future<> _stopped;
            clock::time_point _next_flush_tp;
@@ -113,6 +126,7 @@ public:
            resource_manager& _resource_manager;
            service::storage_proxy& _proxy;
            database& _db;
+            seastar::scheduling_group _hints_cpu_sched_group;
            gms::gossiper& _gossiper;
            seastar::shared_mutex& _file_update_mutex;

@@ -173,6 +187,10 @@ public:
                return _state.contains(state::stopping);
            }

+            bool replay_allowed() const noexcept { // forwards the question to the owning end point manager
+                return _ep_manager.replay_allowed();
+            }
+
            /// \brief Try to send one hint read from the file.
            ///  - Limit the maximum memory size of hints "in the air" and the maximum total number of hints "in the air".
            ///  - Discard the hints that are older than the grace seconds value of the corresponding table.
@@ -204,7 +222,7 @@ public:
            /// \param ctx_ptr pointer to the send context
            /// \param buf hints file entry
            /// \return The mutation object representing the original mutation stored in the hints file.
-            mutation get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf);
+            frozen_mutation_and_schema get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf);

            /// \brief Get a reference to the column_mapping object for a given frozen mutation.
            /// \param ctx_ptr pointer to the send context
@@ -221,13 +239,13 @@ public:
            /// \param m mutation to send
            /// \param natural_endpoints current replicas for the given mutation
            /// \return future that resolves when the operation is complete
-            future<> do_send_one_mutation(mutation m, const std::vector<gms::inet_address>& natural_endpoints) noexcept;
+            future<> do_send_one_mutation(frozen_mutation_and_schema m, const std::vector<gms::inet_address>& natural_endpoints) noexcept;

            /// \brief Send one mutation out.
            ///
            /// \param m mutation to send
            /// \return future that resolves when the mutation sending processing is complete.
-            future<> send_one_mutation(mutation m);
+            future<> send_one_mutation(frozen_mutation_and_schema m);

            /// \brief Get the last modification time stamp for a given file.
            /// \param fname File name
@@ -322,6 +340,10 @@ public:
            return _hints_in_progress;
        }

+        bool replay_allowed() const noexcept { // forwards the question to the shard hints manager
+            return _shard_manager.replay_allowed();
+        }
+
        bool can_hint() const noexcept {
            return _state.contains(state::can_hint);
        }
@@ -387,6 +409,17 @@ public:
        }
    };

+    enum class state {
+        started,                // hinting is currently allowed (start() call is complete)
+        replay_allowed,         // replaying (hints sending) is allowed
+        stopping                // hinting is not allowed - stopping is in progress (stop() method has been called)
+    };
+
+    using state_set = enum_set<super_enum<state,
+        state::started,
+        state::replay_allowed,
+        state::stopping>>;
+
 private:
    using ep_key_type = typename end_point_hints_manager::key_type;
    using ep_managers_map_type = std::unordered_map<ep_key_type, end_point_hints_manager>;
@@ -397,6 +430,7 @@ public:
    static const std::chrono::seconds hint_file_write_timeout;

 private:
+    state_set _state;
    const boost::filesystem::path _hints_dir;
    dev_t _hints_dir_device_id = 0;

@@ -408,7 +442,7 @@ private:
    locator::snitch_ptr& _local_snitch_ptr;
    int64_t _max_hint_window_us = 0;
    database& _local_db;
-    bool _stopping = false;
+
    seastar::gate _draining_eps_gate; // gate used to control the progress of ep_managers stopping not in the context of manager::stop() call

    resource_manager& _resource_manager;
@@ -418,9 +452,14 @@ private:
    seastar::metrics::metric_groups _metrics;
    std::unordered_set<ep_key_type> _eps_with_pending_hints;

+    size_t _max_backlog_size = 1;
+    size_t _backlog_size = 0;
+
 public:
    manager(sstring hints_directory, std::vector<sstring> hinted_dcs, int64_t max_hint_window_ms, resource_manager&res_manager, distributed<database>& db);
    virtual ~manager();
+    manager(manager&&) = delete;
+    manager& operator=(manager&&) = delete;
    void register_metrics(const sstring& group_name);
    future<> start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
    future<> stop();
@@ -497,23 +536,101 @@ public:
    void forbid_hints();
    void forbid_hints_for_eps_with_pending_hints();

-
-    static future<> rebalance() {
-        // TODO
-        return make_ready_future<>();
+    size_t max_backlog_size() const { // the max_backlog value of the last update_backlog() call (1 until then)
+        return _max_backlog_size;
    }

+    size_t backlog_size() const { // the backlog value of the last update_backlog() call (0 until then)
+        return _backlog_size;
+    }
+
+    void allow_replaying() noexcept { // sets the flag that replay_allowed() reports
+        _state.set(state::replay_allowed);
+    }
+
+    /// \brief Rebalance hints segments among all present shards.
+    ///
+    /// For each destination end point, the difference between the number of segments on any two shards will be not
+    /// greater than 1 after the rebalancing.
+    ///
+    /// Removes the sub-directories of \ref hints_directory that correspond to shards that are not relevant any more
+    /// (re-sharding to a lower shards number case).
+    ///
+    /// Complexity: O(N+K), where N is a total number of present hints' segments and
+    ///                           K = <number of shards during the previous boot> * <number of end points for which hints were ever created>
+    ///
+    /// \param hints_directory A hints directory to rebalance
+    /// \return A future that resolves when the operation is complete.
+    static future<> rebalance(sstring hints_directory);
+
    virtual void on_join_cluster(const gms::inet_address& endpoint) override {}
    virtual void on_leave_cluster(const gms::inet_address& endpoint) override {
        drain_for(endpoint);
    };
    virtual void on_up(const gms::inet_address& endpoint) override {}
    virtual void on_down(const gms::inet_address& endpoint) override {}
-    virtual void on_move(const gms::inet_address& endpoint) override {}

 private:
    future<> compute_hints_dir_device_id();

+    /// \brief Scan the given hints directory and build the map of all present hints segments.
+    ///
+    /// Complexity: O(N+K), where N is a total number of present hints' segments and
+    ///                           K = <number of shards during the previous boot> * <number of end points for which hints were ever created>
+    ///
+    /// \note Should be called from a seastar::thread context.
+    ///
+    /// \param hints_directory directory to scan
+    /// \return a map: ep -> map: shard -> segments (full paths)
+    static hints_segments_map get_current_hints_segments(const sstring& hints_directory);
+
+    /// \brief Rebalance hints segments for a given (destination) end point
+    ///
+    /// This method is going to consume files from the \ref segments_to_move and distribute them between the present
+    /// shards (taking into account the \ref ep_segments state - there may be zero or more segments that belong to a
+    /// particular shard in it) until we either achieve the requested \ref segments_per_shard level on each shard
+    /// or until we are out of files to move.
+    ///
+    /// As a result (in addition to the actual state on the disk) both \ref ep_segments and \ref segments_to_move are going
+    /// to be modified.
+    ///
+    /// Complexity: O(N), where N is a total number of present hints' segments for the \ref ep end point (as a destination).
+    ///
+    /// \note Should be called from a seastar::thread context.
+    ///
+    /// \param ep destination end point ID (a string with its IP address)
+    /// \param segments_per_shard number of hints segments per-shard we want to achieve
+    /// \param hints_directory a root hints directory
+    /// \param ep_segments a map that was originally built by get_current_hints_segments() for this end point
+    /// \param segments_to_move a list of segments we are allowed to move
+    static void rebalance_segments_for(
+            const sstring& ep,
+            size_t segments_per_shard,
+            const sstring& hints_directory,
+            hints_ep_segments_map& ep_segments,
+            std::list<lister::path>& segments_to_move);
+
+    /// \brief Rebalance all present hints segments.
+    ///
+    /// For each destination end point, the difference between the number of segments on any two shards will be not
+    /// greater than 1 after the rebalancing.
+    ///
+    /// Complexity: O(N), where N is a total number of present hints' segments.
+    ///
+    /// \note Should be called from a seastar::thread context.
+    ///
+    /// \param hints_directory a root hints directory
+    /// \param segments_map a map that was built by get_current_hints_segments()
+    static void rebalance_segments(const sstring& hints_directory, hints_segments_map& segments_map);
+
+    /// \brief Remove sub-directories of shards that are not relevant any more (re-sharding to a lower number of shards case).
+    ///
+    /// Complexity: O(S*E), where S is a number of shards during the previous boot and
+    ///                           E is a number of end points for which hints were ever created.
+    ///
+    /// \param hints_directory a root hints directory
+    static void remove_irrelevant_shards_directories(const sstring& hints_directory);
+
    node_to_hint_store_factory_type& store_factory() noexcept {
        return _store_factory;
    }
@@ -544,6 +661,28 @@ private:
    /// \param endpoint node that left the cluster
    void drain_for(gms::inet_address endpoint);

+    void update_backlog(size_t backlog, size_t max_backlog); // stores both values; allows hints when backlog < max_backlog, otherwise forbids them for end points with pending hints
+
+    bool stopping() const noexcept { // true once set_stopping() has been called
+        return _state.contains(state::stopping);
+    }
+
+    void set_stopping() noexcept { // raises the state::stopping flag in _state
+        _state.set(state::stopping);
+    }
+
+    bool started() const noexcept { // true once set_started() has been called
+        return _state.contains(state::started);
+    }
+
+    void set_started() noexcept { // raises the state::started flag in _state
+        _state.set(state::started);
+    }
+
+    bool replay_allowed() const noexcept { // true once allow_replaying() has been called
+        return _state.contains(state::replay_allowed);
+    }
+
 public:
    ep_managers_map_type::iterator find_ep_manager(ep_key_type ep_key) noexcept {
        return _ep_managers.find(ep_key);
--- a/db/hints/resource_manager.cc
+++ b/db/hints/resource_manager.cc
@@ -27,6 +27,7 @@
 #include "lister.hh"
 #include "disk-error-handler.hh"
 #include "seastarx.hh"
+#include <seastar/core/sleep.hh>

 namespace db {
 namespace hints {
@@ -65,19 +66,28 @@ const std::chrono::seconds space_watchdog::_watchdog_period = std::chrono::secon
 space_watchdog::space_watchdog(shard_managers_set& managers, per_device_limits_map& per_device_limits_map)
    : _shard_managers(managers)
    , _per_device_limits_map(per_device_limits_map)
-    , _timer([this] { on_timer(); })
 {}

 void space_watchdog::start() {
-    _timer.arm(timer_clock_type::now());
+    _started = seastar::async([this] { // on_timer() waits on futures with get(), hence the seastar::async context
+        while (!_as.abort_requested()) { // stop() is the one that requests the abort
+            try {
+                on_timer();
+            } catch (...) {
+                resource_manager_logger.trace("space_watchdog: unexpected exception - stop all hints generators");
+                // Stop all hint generators if space_watchdog callback failed
+                for (manager& shard_manager : _shard_managers) {
+                    shard_manager.forbid_hints();
+                }
+            }
+            seastar::sleep_abortable(_watchdog_period, _as).get(); // pause for one watchdog period; interrupted by the abort request
+        }
+    }).handle_exception_type([] (const seastar::sleep_aborted& ignored) { }); // an aborted sleep is not reported as a failure of _started
 }

 future<> space_watchdog::stop() noexcept {
-    try {
-        return _gate.close().finally([this] { _timer.cancel(); });
-    } catch (...) {
-        return make_exception_future<>(std::current_exception());
-    }
+    _as.request_abort(); // makes the loop launched by start() exit
+    return std::move(_started); // hand out the future of the loop launched by start()
 }

 future<> space_watchdog::scan_one_ep_dir(boost::filesystem::path path, manager& shard_manager, ep_key_type ep_key) {
@@ -94,83 +104,62 @@ future<> space_watchdog::scan_one_ep_dir(boost::filesystem::path path, manager&
    });
 }

+// Called from the context of a seastar::thread.
 void space_watchdog::on_timer() {
-    with_gate(_gate, [this] {
-        return futurize_apply([this] {
-            _total_size = 0;
+    // The hints directories are organized as follows:
+    // <hints root>
+    //    |- <shard1 ID>
+    //    |  |- <EP1 address>
+    //    |     |- <hints file1>
+    //    |     |- <hints file2>
+    //    |     |- ...
+    //    |  |- <EP2 address>
+    //    |     |- ...
+    //    |  |-...
+    //    |- <shard2 ID>
+    //    |  |- ...
+    //    ...
+    //    |- <shardN ID>
+    //    |  |- ...
+    //

-            return do_for_each(_shard_managers, [this] (manager& shard_manager) {
-                shard_manager.clear_eps_with_pending_hints();
-
-                // The hints directories are organized as follows:
-                // <hints root>
-                //    |- <shard1 ID>
-                //    |  |- <EP1 address>
-                //    |     |- <hints file1>
-                //    |     |- <hints file2>
-                //    |     |- ...
-                //    |  |- <EP2 address>
-                //    |     |- ...
-                //    |  |-...
-                //    |- <shard2 ID>
-                //    |  |- ...
-                //    ...
-                //    |- <shardN ID>
-                //    |  |- ...
+    for (auto& per_device_limits : _per_device_limits_map | boost::adaptors::map_values) {
+        _total_size = 0;
+        for (manager& shard_manager : per_device_limits.managers) {
+            shard_manager.clear_eps_with_pending_hints();
+            lister::scan_dir(shard_manager.hints_dir(), {directory_entry_type::directory}, [this, &shard_manager] (lister::path dir, directory_entry de) {
+                _files_count = 0;
+                // Let's scan per-end-point directories and enumerate hints files...
                //
-                return lister::scan_dir(shard_manager.hints_dir(), {directory_entry_type::directory}, [this, &shard_manager] (lister::path dir, directory_entry de) {
-                    _files_count = 0;
-                    // Let's scan per-end-point directories and enumerate hints files...
-                    //
-                    // Let's check if there is a corresponding end point manager (may not exist if the corresponding DC is
-                    // not hintable).
-                    // If exists - let's take a file update lock so that files are not changed under our feet. Otherwise, simply
-                    // continue to enumeration - there is no one to change them.
-                    auto it = shard_manager.find_ep_manager(de.name);
-                    if (it != shard_manager.ep_managers_end()) {
-                        return with_lock(it->second.file_update_mutex(), [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)]() mutable {
-                             return scan_one_ep_dir(dir / ep_name.c_str(), shard_manager, ep_key_type(ep_name));
-                        });
-                    } else {
-                        return scan_one_ep_dir(dir / de.name.c_str(), shard_manager, ep_key_type(de.name));
-                    }
-                });
-            }).then([this] {
-                return do_for_each(_per_device_limits_map, [this](per_device_limits_map::value_type& per_device_limits_entry) {
-                    space_watchdog::per_device_limits& per_device_limits = per_device_limits_entry.second;
-
-                    size_t adjusted_quota = 0;
-                    size_t delta = boost::accumulate(per_device_limits.managers, 0, [] (size_t sum, manager& shard_manager) {
-                        return sum + shard_manager.ep_managers_size() * resource_manager::hint_segment_size_in_mb * 1024 * 1024;
+                // Let's check if there is a corresponding end point manager (may not exist if the corresponding DC is
+                // not hintable).
+                // If exists - let's take a file update lock so that files are not changed under our feet. Otherwise, simply
+                // continue to enumeration - there is no one to change them.
+                auto it = shard_manager.find_ep_manager(de.name);
+                if (it != shard_manager.ep_managers_end()) {
+                    return with_lock(it->second.file_update_mutex(), [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)]() mutable {
+                        return scan_one_ep_dir(dir / ep_name.c_str(), shard_manager, ep_key_type(ep_name));
                    });
-                    if (per_device_limits.max_shard_disk_space_size > delta) {
-                        adjusted_quota = per_device_limits.max_shard_disk_space_size - delta;
-                    }
+                } else {
+                    return scan_one_ep_dir(dir / de.name.c_str(), shard_manager, ep_key_type(de.name));
+                }
+            }).get();
+        }

-                    bool can_hint = _total_size < adjusted_quota;
-                    resource_manager_logger.trace("space_watchdog: total_size ({}) {} max_shard_disk_space_size ({})", _total_size, can_hint ? "<" : ">=", adjusted_quota);
-
-                    if (!can_hint) {
-                        for (manager& shard_manager : per_device_limits.managers) {
-                            shard_manager.forbid_hints_for_eps_with_pending_hints();
-                        }
-                    } else {
-                        for (manager& shard_manager : per_device_limits.managers) {
-                            shard_manager.allow_hints();
-                        }
-    }
-                });
-            });
-        }).handle_exception([this] (auto eptr) {
-            resource_manager_logger.trace("space_watchdog: unexpected exception - stop all hints generators");
-            // Stop all hint generators if space_watchdog callback failed
-            for (manager& shard_manager : _shard_managers) {
-                shard_manager.forbid_hints();
-            }
-        }).finally([this] {
-            _timer.arm(_watchdog_period);
+        // Adjust the quota to take into account the space we guarantee to every end point manager
+        size_t adjusted_quota = 0;
+        // The initial value is a size_t on purpose: accumulate() keeps the running sum in the type of the initial
+        // value, so a plain "0" would squeeze the byte count through an int and break it once it exceeds INT_MAX.
+        size_t delta = boost::accumulate(per_device_limits.managers, size_t(0), [] (size_t sum, manager& shard_manager) {
+            return sum + shard_manager.ep_managers_size() * resource_manager::hint_segment_size_in_mb * 1024 * 1024;
        });
-    });
+        if (per_device_limits.max_shard_disk_space_size > delta) {
+            adjusted_quota = per_device_limits.max_shard_disk_space_size - delta;
+        }
+
+        resource_manager_logger.trace("space_watchdog: consuming {}/{} bytes", _total_size, adjusted_quota);
+        for (manager& shard_manager : per_device_limits.managers) {
+            shard_manager.update_backlog(_total_size, adjusted_quota);
+        }
+    }
 }

 future<> resource_manager::start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr) {
@@ -183,6 +172,10 @@ future<> resource_manager::start(shared_ptr<service::storage_proxy> proxy_ptr, s
    });
 }

+void resource_manager::allow_replaying() noexcept {
+    for (manager& shard_manager : _shard_managers) { shard_manager.allow_replaying(); } // forward the call to every shard manager
+}
+
 future<> resource_manager::stop() noexcept {
    return parallel_for_each(_shard_managers, [](manager& m) {
        return m.stop();
@@ -201,14 +194,18 @@ future<> resource_manager::prepare_per_device_limits() {
        auto it = _per_device_limits_map.find(device_id);
        if (it == _per_device_limits_map.end()) {
            return is_mountpoint(shard_manager.hints_dir().parent_path()).then([this, device_id, &shard_manager](bool is_mountpoint) {
-                // By default, give each group of managers 10% of the available disk space. Give each shard an equal share of the available space.
-                size_t max_size = boost::filesystem::space(shard_manager.hints_dir().c_str()).capacity / (10 * smp::count);
-                // If hints directory is a mountpoint, we assume it's on dedicated (i.e. not shared with data/commitlog/etc) storage.
-                // Then, reserve 90% of all space instead of 10% above.
-                if (is_mountpoint) {
-                    max_size *= 9;
+                auto [it, inserted] = _per_device_limits_map.emplace(device_id, space_watchdog::per_device_limits{});
+                // Since we possibly deferred, we need to recheck the _per_device_limits_map.
+                if (inserted) {
+                    // By default, give each group of managers 10% of the available disk space. Give each shard an equal share of the available space.
+                    it->second.max_shard_disk_space_size = boost::filesystem::space(shard_manager.hints_dir().c_str()).capacity / (10 * smp::count);
+                    // If hints directory is a mountpoint, we assume it's on dedicated (i.e. not shared with data/commitlog/etc) storage.
+                    // Then, reserve 90% of all space instead of 10% above.
+                    if (is_mountpoint) {
+                        it->second.max_shard_disk_space_size *= 9;
+                    }
                }
-                _per_device_limits_map.emplace(device_id, space_watchdog::per_device_limits{{std::ref(shard_manager)}, max_size});
+                it->second.managers.emplace_back(std::ref(shard_manager));
            });
        } else {
            it->second.managers.emplace_back(std::ref(shard_manager));
--- a/Show More
+++ b/Show More