gossip: Fix tokens assignment in assassinate_endpoint

The tokens vector is defined a few lines above and is needed outside the if block. Do not redefine it in the if block, otherwise the tokens will be empty. Found by code inspection. Fixes #3551. Message-Id: <c7a06375c65c950e94236571127f533e5a60cbfd.1530002177.git.asias@scylladb.com> (cherry picked from commit c3b5a2ecd5)
locator::ec2_multi_region_snitch: don't call for ec2_snitch::gossiper_starting()
2026-04-26 19:35:12 +00:00 · 2018-06-27 12:01:19 +03:00 · 2018-06-12 19:02:48 +03:00 · 2018-05-24 12:02:15 +03:00 · 2018-05-24 11:14:20 +03:00 · 2018-05-24 11:08:13 +03:00
88 changed files with 3079 additions and 1168 deletions
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 #!/bin/sh

-VERSION=2.0.1
+VERSION=2.0.4

 if test -f version
 then
--- a/auth/auth.cc
+++ b/auth/auth.cc
@@ -114,7 +114,7 @@ struct hash<auth::authenticated_user> {

 class auth::auth::permissions_cache {
 public:
-    typedef utils::loading_cache<std::pair<authenticated_user, data_resource>, permission_set, utils::tuple_hash> cache_type;
+    typedef utils::loading_cache<std::pair<authenticated_user, data_resource>, permission_set, utils::loading_cache_reload_enabled::yes, utils::simple_entry_size<permission_set>, utils::tuple_hash> cache_type;
    typedef typename cache_type::key_type key_type;

    permissions_cache()
--- a/clustering_ranges_walker.hh
+++ b/clustering_ranges_walker.hh
@@ -70,7 +70,7 @@ public:
    {
        if (!with_static_row) {
            if (_current == _end) {
-                _current_start = _current_end = position_in_partition_view::after_all_clustered_rows();
+                _current_start = position_in_partition_view::before_all_clustered_rows();
            } else {
                _current_start = position_in_partition_view::for_range_start(*_current);
                _current_end = position_in_partition_view::for_range_end(*_current);
--- a/compound_compat.hh
+++ b/compound_compat.hh
@@ -241,7 +241,7 @@ public:
    using component_view = std::pair<bytes_view, eoc>;
 private:
    template<typename Value, typename = std::enable_if_t<!std::is_same<const data_value, std::decay_t<Value>>::value>>
-    static size_t size(Value& val) {
+    static size_t size(const Value& val) {
        return val.size();
    }
    static size_t size(const data_value& val) {
@@ -445,17 +445,16 @@ public:
        return _is_compound;
    }

-    // The following factory functions assume this composite is a compound value.
    template <typename ClusteringElement>
    static composite from_clustering_element(const schema& s, const ClusteringElement& ce) {
-        return serialize_value(ce.components(s));
+        return serialize_value(ce.components(s), s.is_compound());
    }

-    static composite from_exploded(const std::vector<bytes_view>& v, eoc marker = eoc::none) {
+    static composite from_exploded(const std::vector<bytes_view>& v, bool is_compound, eoc marker = eoc::none) {
        if (v.size() == 0) {
-            return composite(bytes(size_t(1), bytes::value_type(marker)));
+            return composite(bytes(size_t(1), bytes::value_type(marker)), is_compound);
        }
-        return serialize_value(v, true, marker);
+        return serialize_value(v, is_compound, marker);
    }

    static composite static_prefix(const schema& s) {
--- a/configure.py
+++ b/configure.py
@@ -238,6 +238,7 @@ scylla_tests = [
    'tests/view_schema_test',
    'tests/counter_test',
    'tests/cell_locker_test',
+    'tests/loading_cache_test',
 ]

 apps = [
@@ -730,6 +731,9 @@ if not try_compile(compiler=args.cxx, source='''\
    print('Installed boost version too old.  Please update {}.'.format(pkgname("boost-devel")))
    sys.exit(1)

+
+has_sanitize_address_use_after_scope = try_compile(compiler=args.cxx, flags=['-fsanitize-address-use-after-scope'], source='int f() {}')
+
 defines = ' '.join(['-D' + d for d in defines])

 globals().update(vars(args))
@@ -863,7 +867,7 @@ with open(buildfile, 'w') as f:
        f.write(textwrap.dedent('''\
            cxxflags_{mode} = -I. -I $builddir/{mode}/gen -I seastar -I seastar/build/{mode}/gen
            rule cxx.{mode}
-              command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags $cxxflags_{mode} -c -o $out $in
+              command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags $cxxflags_{mode} $obj_cxxflags -c -o $out $in
              description = CXX $out
              depfile = $out.d
            rule link.{mode}
@@ -881,7 +885,16 @@ with open(buildfile, 'w') as f:
                command = thrift -gen cpp:cob_style -out $builddir/{mode}/gen $in
                description = THRIFT $in
            rule antlr3.{mode}
-                command = sed -e '/^#if 0/,/^#endif/d' $in > $builddir/{mode}/gen/$in && antlr3 $builddir/{mode}/gen/$in && sed -i 's/^\\( *\)\\(ImplTraits::CommonTokenType\\* [a-zA-Z0-9_]* = NULL;\\)$$/\\1const \\2/' build/{mode}/gen/${{stem}}Parser.cpp
+                # We replace many local `ExceptionBaseType* ex` variables with a single function-scope one.
+                # Because we add such a variable to every function, and because `ExceptionBaseType` is not a global
+                # name, we also add a global typedef to avoid compilation errors. 
+                command = sed -e '/^#if 0/,/^#endif/d' $in > $builddir/{mode}/gen/$in $
+                     && antlr3 $builddir/{mode}/gen/$in $
+                     && sed -i -e 's/^\\( *\)\\(ImplTraits::CommonTokenType\\* [a-zA-Z0-9_]* = NULL;\\)$$/\\1const \\2/' $
+                        -e '1i using ExceptionBaseType = int;' $
+                        -e 's/^{{/{{ ExceptionBaseType\* ex = nullptr;/; $
+                            s/ExceptionBaseType\* ex = new/ex = new/' $
+                        build/{mode}/gen/${{stem}}Parser.cpp
                description = ANTLR3 $in
            ''').format(mode = mode, **modeval))
        f.write('build {mode}: phony {artifacts}\n'.format(mode = mode,
@@ -998,6 +1011,9 @@ with open(buildfile, 'w') as f:
            for cc in grammar.sources('$builddir/{}/gen'.format(mode)):
                obj = cc.replace('.cpp', '.o')
                f.write('build {}: cxx.{} {} || {}\n'.format(obj, mode, cc, ' '.join(serializers)))
+                if cc.endswith('Parser.cpp') and has_sanitize_address_use_after_scope:
+                    # Parsers end up using huge amounts of stack space and overflowing their stack 
+                    f.write('  obj_cxxflags = -fno-sanitize-address-use-after-scope\n')
        f.write('build seastar/build/{mode}/libseastar.a seastar/build/{mode}/apps/iotune/iotune seastar/build/{mode}/gen/http/request_parser.hh seastar/build/{mode}/gen/http/http_response_parser.hh: ninja {seastar_deps}\n'
                .format(**locals()))
        f.write('  pool = seastar_pool\n')
--- a/cql3/prepared_statements_cache.hh
+++ b/cql3/prepared_statements_cache.hh
@@ -0,0 +1,171 @@
+/*
+ * Copyright (C) 2017 ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "utils/loading_cache.hh"
+#include "cql3/statements/prepared_statement.hh"
+
+namespace cql3 {
+
+using prepared_cache_entry = std::unique_ptr<statements::prepared_statement>;
+
+struct prepared_cache_entry_size {
+    size_t operator()(const prepared_cache_entry& val) {
+        // TODO: improve the size approximation
+        return 10000;
+    }
+};
+
+typedef bytes cql_prepared_id_type;
+typedef int32_t thrift_prepared_id_type;
+
+/// \brief The key of the prepared statements cache
+///
+/// We are going to store the CQL and Thrift prepared statements in the same cache, therefore we need to generate a key
+/// that is going to be unique in both cases. Thrift uses int32_t as a prepared statement ID, CQL uses an MD5 digest.
+///
+/// We are going to use an std::pair<CQL_PREP_ID_TYPE, int64_t> as a key. For CQL statements we will use {CQL_PREP_ID, std::numeric_limits<int64_t>::max()} as a key
+/// and for Thrift - {CQL_PREP_ID_TYPE(0), THRIFT_PREP_ID}. This way CQL and Thrift keys' values will never collide.
+class prepared_cache_key_type {
+public:
+    using cache_key_type = std::pair<cql_prepared_id_type, int64_t>;
+
+private:
+    cache_key_type _key;
+
+public:
+    prepared_cache_key_type() = default;
+    explicit prepared_cache_key_type(cql_prepared_id_type cql_id) : _key(std::move(cql_id), std::numeric_limits<int64_t>::max()) {}
+    explicit prepared_cache_key_type(thrift_prepared_id_type thrift_id) : _key(cql_prepared_id_type(), thrift_id) {}
+
+    cache_key_type& key() { return _key; }
+    const cache_key_type& key() const { return _key; }
+
+    static const cql_prepared_id_type& cql_id(const prepared_cache_key_type& key) {
+        return key.key().first;
+    }
+    static thrift_prepared_id_type thrift_id(const prepared_cache_key_type& key) {
+        return key.key().second;
+    }
+};
+
+class prepared_statements_cache {
+public:
+    struct stats {
+        uint64_t prepared_cache_evictions = 0;
+    };
+
+    static stats& shard_stats() {
+        static thread_local stats _stats;
+        return _stats;
+    }
+
+    struct prepared_cache_stats_updater {
+        static void inc_hits() noexcept {}
+        static void inc_misses() noexcept {}
+        static void inc_blocks() noexcept {}
+        static void inc_evictions() noexcept {
+            ++shard_stats().prepared_cache_evictions;
+        }
+    };
+
+private:
+    using cache_key_type = typename prepared_cache_key_type::cache_key_type;
+    using cache_type = utils::loading_cache<cache_key_type, prepared_cache_entry, utils::loading_cache_reload_enabled::no, prepared_cache_entry_size, utils::tuple_hash, std::equal_to<cache_key_type>, prepared_cache_stats_updater>;
+    using cache_value_ptr = typename cache_type::value_ptr;
+    using cache_iterator = typename cache_type::iterator;
+    using checked_weak_ptr = typename statements::prepared_statement::checked_weak_ptr;
+    struct value_extractor_fn {
+        checked_weak_ptr operator()(prepared_cache_entry& e) const {
+            return e->checked_weak_from_this();
+        }
+    };
+
+    static const std::chrono::minutes entry_expiry;
+
+public:
+    using key_type = prepared_cache_key_type;
+    using value_type = checked_weak_ptr;
+    using statement_is_too_big = typename cache_type::entry_is_too_big;
+    /// \note both iterator::reference and iterator::value_type are checked_weak_ptr
+    using iterator = boost::transform_iterator<value_extractor_fn, cache_iterator>;
+
+private:
+    cache_type _cache;
+    value_extractor_fn _value_extractor_fn;
+
+public:
+    prepared_statements_cache(logging::logger& logger)
+        : _cache(memory::stats().total_memory() / 256, entry_expiry, logger)
+    {}
+
+    template <typename LoadFunc>
+    future<value_type> get(const key_type& key, LoadFunc&& load) {
+        return _cache.get_ptr(key.key(), [load = std::forward<LoadFunc>(load)] (const cache_key_type&) { return load(); }).then([] (cache_value_ptr v_ptr) {
+            return make_ready_future<value_type>((*v_ptr)->checked_weak_from_this());
+        });
+    }
+
+    iterator find(const key_type& key) {
+        return boost::make_transform_iterator(_cache.find(key.key()), _value_extractor_fn);
+    }
+
+    iterator end() {
+        return boost::make_transform_iterator(_cache.end(), _value_extractor_fn);
+    }
+
+    iterator begin() {
+        return boost::make_transform_iterator(_cache.begin(), _value_extractor_fn);
+    }
+
+    template <typename Pred>
+    void remove_if(Pred&& pred) {
+        static_assert(std::is_same<bool, std::result_of_t<Pred(::shared_ptr<cql_statement>)>>::value, "Bad Pred signature");
+
+        _cache.remove_if([&pred] (const prepared_cache_entry& e) {
+            return pred(e->statement);
+        });
+    }
+
+    size_t size() const {
+        return _cache.size();
+    }
+
+    size_t memory_footprint() const {
+        return _cache.memory_footprint();
+    }
+};
+}
+
+namespace std { // for prepared_statements_cache log printouts
+inline std::ostream& operator<<(std::ostream& os, const typename cql3::prepared_cache_key_type::cache_key_type& p) {
+    os << "{cql_id: " << p.first << ", thrift_id: " << p.second << "}";
+    return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const cql3::prepared_cache_key_type& p) {
+    os << p.key();
+    return os;
+}
+}
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -57,11 +57,14 @@ using namespace statements;
 using namespace cql_transport::messages;

 logging::logger log("query_processor");
+logging::logger prep_cache_log("prepared_statements_cache");

 distributed<query_processor> _the_query_processor;

 const sstring query_processor::CQL_VERSION = "3.3.1";

+const std::chrono::minutes prepared_statements_cache::entry_expiry = std::chrono::minutes(60);
+
 class query_processor::internal_state {
    service::query_state _qs;
 public:
@@ -95,6 +98,7 @@ query_processor::query_processor(distributed<service::storage_proxy>& proxy,
    , _proxy(proxy)
    , _db(db)
    , _internal_state(new internal_state())
+    , _prepared_cache(prep_cache_log)
 {
    namespace sm = seastar::metrics;

@@ -130,6 +134,15 @@ query_processor::query_processor(distributed<service::storage_proxy>& proxy,

        sm::make_derive("batches_unlogged_from_logged", _cql_stats.batches_unlogged_from_logged,
                        sm::description("Counts a total number of LOGGED batches that were executed as UNLOGGED batches.")),
+
+        sm::make_derive("prepared_cache_evictions", [] { return prepared_statements_cache::shard_stats().prepared_cache_evictions; },
+                        sm::description("Counts a number of prepared statements cache entries evictions.")),
+
+        sm::make_gauge("prepared_cache_size", [this] { return _prepared_cache.size(); },
+                        sm::description("A number of entries in the prepared statements cache.")),
+
+        sm::make_gauge("prepared_cache_memory_footprint", [this] { return _prepared_cache.memory_footprint(); },
+                        sm::description("Size (in bytes) of the prepared statements cache.")),
    });

    service::get_local_migration_manager().register_listener(_migration_subscriber.get());
@@ -197,31 +210,21 @@ query_processor::process_statement(::shared_ptr<cql_statement> statement,
 }

 future<::shared_ptr<cql_transport::messages::result_message::prepared>>
-query_processor::prepare(const std::experimental::string_view& query_string, service::query_state& query_state)
+query_processor::prepare(sstring query_string, service::query_state& query_state)
 {
    auto& client_state = query_state.get_client_state();
-    return prepare(query_string, client_state, client_state.is_thrift());
+    return prepare(std::move(query_string), client_state, client_state.is_thrift());
 }

 future<::shared_ptr<cql_transport::messages::result_message::prepared>>
-query_processor::prepare(const std::experimental::string_view& query_string,
-                         const service::client_state& client_state,
-                         bool for_thrift)
+query_processor::prepare(sstring query_string, const service::client_state& client_state, bool for_thrift)
 {
-    auto existing = get_stored_prepared_statement(query_string, client_state.get_raw_keyspace(), for_thrift);
-    if (existing) {
-        return make_ready_future<::shared_ptr<cql_transport::messages::result_message::prepared>>(existing);
+    using namespace cql_transport::messages;
+    if (for_thrift) {
+        return prepare_one<result_message::prepared::thrift>(std::move(query_string), client_state, compute_thrift_id, prepared_cache_key_type::thrift_id);
+    } else {
+        return prepare_one<result_message::prepared::cql>(std::move(query_string), client_state, compute_id, prepared_cache_key_type::cql_id);
    }
-
-    return futurize<::shared_ptr<cql_transport::messages::result_message::prepared>>::apply([this, &query_string, &client_state, for_thrift] {
-        auto prepared = get_statement(query_string, client_state);
-        auto bound_terms = prepared->statement->get_bound_terms();
-        if (bound_terms > std::numeric_limits<uint16_t>::max()) {
-            throw exceptions::invalid_request_exception(sprint("Too many markers(?). %d markers exceed the allowed maximum of %d", bound_terms, std::numeric_limits<uint16_t>::max()));
-        }
-        assert(bound_terms == prepared->bound_names.size());
-        return store_prepared_statement(query_string, client_state.get_raw_keyspace(), std::move(prepared), for_thrift);
-    });
 }

 ::shared_ptr<cql_transport::messages::result_message::prepared>
@@ -229,50 +232,11 @@ query_processor::get_stored_prepared_statement(const std::experimental::string_v
                                               const sstring& keyspace,
                                               bool for_thrift)
 {
+    using namespace cql_transport::messages;
    if (for_thrift) {
-        auto statement_id = compute_thrift_id(query_string, keyspace);
-        auto it = _thrift_prepared_statements.find(statement_id);
-        if (it == _thrift_prepared_statements.end()) {
-            return ::shared_ptr<result_message::prepared>();
-        }
-        return ::make_shared<result_message::prepared::thrift>(statement_id, it->second->checked_weak_from_this());
+        return get_stored_prepared_statement_one<result_message::prepared::thrift>(query_string, keyspace, compute_thrift_id, prepared_cache_key_type::thrift_id);
    } else {
-        auto statement_id = compute_id(query_string, keyspace);
-        auto it = _prepared_statements.find(statement_id);
-        if (it == _prepared_statements.end()) {
-            return ::shared_ptr<result_message::prepared>();
-        }
-        return ::make_shared<result_message::prepared::cql>(statement_id, it->second->checked_weak_from_this());
-    }
-}
-
-future<::shared_ptr<cql_transport::messages::result_message::prepared>>
-query_processor::store_prepared_statement(const std::experimental::string_view& query_string,
-                                          const sstring& keyspace,
-                                          std::unique_ptr<statements::prepared_statement> prepared,
-                                          bool for_thrift)
-{
-#if 0
-    // Concatenate the current keyspace so we don't mix prepared statements between keyspace (#5352).
-    // (if the keyspace is null, queryString has to have a fully-qualified keyspace so it's fine.
-    long statementSize = measure(prepared.statement);
-    // don't execute the statement if it's bigger than the allowed threshold
-    if (statementSize > MAX_CACHE_PREPARED_MEMORY)
-        throw new InvalidRequestException(String.format("Prepared statement of size %d bytes is larger than allowed maximum of %d bytes.",
-                                                        statementSize,
-                                                        MAX_CACHE_PREPARED_MEMORY));
-#endif
-    prepared->raw_cql_statement = query_string.data();
-    if (for_thrift) {
-        auto statement_id = compute_thrift_id(query_string, keyspace);
-        auto msg = ::make_shared<result_message::prepared::thrift>(statement_id, prepared->checked_weak_from_this());
-        _thrift_prepared_statements.emplace(statement_id, std::move(prepared));
-        return make_ready_future<::shared_ptr<result_message::prepared>>(std::move(msg));
-    } else {
-        auto statement_id = compute_id(query_string, keyspace);
-        auto msg = ::make_shared<result_message::prepared::cql>(statement_id, prepared->checked_weak_from_this());
-        _prepared_statements.emplace(statement_id, std::move(prepared));
-        return make_ready_future<::shared_ptr<result_message::prepared>>(std::move(msg));
+        return get_stored_prepared_statement_one<result_message::prepared::cql>(query_string, keyspace, compute_id, prepared_cache_key_type::cql_id);
    }
 }

@@ -289,19 +253,19 @@ static sstring hash_target(const std::experimental::string_view& query_string, c
    return keyspace + query_string.to_string();
 }

-bytes query_processor::compute_id(const std::experimental::string_view& query_string, const sstring& keyspace)
+prepared_cache_key_type query_processor::compute_id(const std::experimental::string_view& query_string, const sstring& keyspace)
 {
-    return md5_calculate(hash_target(query_string, keyspace));
+    return prepared_cache_key_type(md5_calculate(hash_target(query_string, keyspace)));
 }

-int32_t query_processor::compute_thrift_id(const std::experimental::string_view& query_string, const sstring& keyspace)
+prepared_cache_key_type query_processor::compute_thrift_id(const std::experimental::string_view& query_string, const sstring& keyspace)
 {
    auto target = hash_target(query_string, keyspace);
    uint32_t h = 0;
    for (auto&& c : hash_target(query_string, keyspace)) {
        h = 31*h + c;
    }
-    return static_cast<int32_t>(h);
+    return prepared_cache_key_type(static_cast<int32_t>(h));
 }

 std::unique_ptr<prepared_statement>
@@ -527,7 +491,7 @@ void query_processor::migration_subscriber::on_drop_view(const sstring& ks_name,

 void query_processor::migration_subscriber::remove_invalid_prepared_statements(sstring ks_name, std::experimental::optional<sstring> cf_name)
 {
-    _qp->invalidate_prepared_statements([&] (::shared_ptr<cql_statement> stmt) {
+    _qp->_prepared_cache.remove_if([&] (::shared_ptr<cql_statement> stmt) {
        return this->should_invalidate(ks_name, cf_name, stmt);
    });
 }
--- a/cql3/query_processor.hh
+++ b/cql3/query_processor.hh
@@ -57,6 +57,7 @@
 #include "statements/prepared_statement.hh"
 #include "transport/messages/result_message.hh"
 #include "untyped_result_set.hh"
+#include "prepared_statements_cache.hh"

 namespace cql3 {

@@ -64,9 +65,32 @@ namespace statements {
 class batch_statement;
 }

+class prepared_statement_is_too_big : public std::exception {
+public:
+    static constexpr int max_query_prefix = 100;
+
+private:
+    sstring _msg;
+
+public:
+    prepared_statement_is_too_big(const sstring& query_string)
+        : _msg(seastar::format("Prepared statement is too big: {}", query_string.substr(0, max_query_prefix)))
+    {
+        // mark that we clipped the query string
+        if (query_string.size() > max_query_prefix) {
+            _msg += "...";
+        }
+    }
+
+    virtual const char* what() const noexcept override {
+        return _msg.c_str();
+    }
+};
+
 class query_processor {
 public:
    class migration_subscriber;
+
 private:
    std::unique_ptr<migration_subscriber> _migration_subscriber;
    distributed<service::storage_proxy>& _proxy;
@@ -127,9 +151,7 @@ private:
        }
    };
 #endif
-
-    std::unordered_map<bytes, std::unique_ptr<statements::prepared_statement>> _prepared_statements;
-    std::unordered_map<int32_t, std::unique_ptr<statements::prepared_statement>> _thrift_prepared_statements;
+    prepared_statements_cache _prepared_cache;
    std::unordered_map<sstring, std::unique_ptr<statements::prepared_statement>> _internal_statements;
 #if 0

@@ -221,21 +243,14 @@ private:
    }
 #endif
 public:
-    statements::prepared_statement::checked_weak_ptr get_prepared(const bytes& id) {
-        auto it = _prepared_statements.find(id);
-        if (it == _prepared_statements.end()) {
+    statements::prepared_statement::checked_weak_ptr get_prepared(const prepared_cache_key_type& key) {
+        auto it = _prepared_cache.find(key);
+        if (it == _prepared_cache.end()) {
            return statements::prepared_statement::checked_weak_ptr();
        }
-        return it->second->checked_weak_from_this();
+        return *it;
    }

-    statements::prepared_statement::checked_weak_ptr get_prepared_for_thrift(int32_t id) {
-        auto it = _thrift_prepared_statements.find(id);
-        if (it == _thrift_prepared_statements.end()) {
-            return statements::prepared_statement::checked_weak_ptr();
-        }
-        return it->second->checked_weak_from_this();
-    }
 #if 0
    public static void validateKey(ByteBuffer key) throws InvalidRequestException
    {
@@ -435,42 +450,61 @@ public:
 #endif

    future<::shared_ptr<cql_transport::messages::result_message::prepared>>
-    prepare(const std::experimental::string_view& query_string, service::query_state& query_state);
+    prepare(sstring query_string, service::query_state& query_state);

    future<::shared_ptr<cql_transport::messages::result_message::prepared>>
-    prepare(const std::experimental::string_view& query_string, const service::client_state& client_state, bool for_thrift);
+    prepare(sstring query_string, const service::client_state& client_state, bool for_thrift);

-    static bytes compute_id(const std::experimental::string_view& query_string, const sstring& keyspace);
-    static int32_t compute_thrift_id(const std::experimental::string_view& query_string, const sstring& keyspace);
+    static prepared_cache_key_type compute_id(const std::experimental::string_view& query_string, const sstring& keyspace);
+    static prepared_cache_key_type compute_thrift_id(const std::experimental::string_view& query_string, const sstring& keyspace);

 private:
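+    /// \brief Looks the statement up in the prepared statements cache, passing a loader that builds it from the query string.
+    /// \tparam ResultMsgType type of the returned result message (CQL or Thrift)
+    /// \tparam PreparedKeyGenerator a function that generates the prepared statement cache key for given query and keyspace
+    /// \tparam IdGetter a function that returns the corresponding prepared statement ID (CQL or Thrift) for a given prepared statement cache key
+    /// \param query_string text of the query to prepare; it is also stored as the raw_cql_statement of a newly built statement
+    /// \param client_state client state handed to get_statement(); its raw keyspace is passed to id_gen
+    /// \param id_gen prepared ID generator, called before the first deferring
+    /// \param id_getter prepared ID getter, passed to deferred context by reference. The caller must ensure its liveness.
+    /// \return a future with a ResultMsgType message built from the statement ID and the cached statement; fails with prepared_statement_is_too_big when the cache reports statement_is_too_big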
+    template <typename ResultMsgType, typename PreparedKeyGenerator, typename IdGetter>
+    future<::shared_ptr<cql_transport::messages::result_message::prepared>>
+    prepare_one(sstring query_string, const service::client_state& client_state, PreparedKeyGenerator&& id_gen, IdGetter&& id_getter) {
+        return do_with(id_gen(query_string, client_state.get_raw_keyspace()), std::move(query_string), [this, &client_state, &id_getter] (const prepared_cache_key_type& key, const sstring& query_string) {
+            return _prepared_cache.get(key, [this, &query_string, &client_state] {
+                auto prepared = get_statement(query_string, client_state);
+                auto bound_terms = prepared->statement->get_bound_terms();
+                if (bound_terms > std::numeric_limits<uint16_t>::max()) {
+                    throw exceptions::invalid_request_exception(sprint("Too many markers(?). %d markers exceed the allowed maximum of %d", bound_terms, std::numeric_limits<uint16_t>::max()));
+                }
+                assert(bound_terms == prepared->bound_names.size());
+                prepared->raw_cql_statement = query_string;
+                return make_ready_future<std::unique_ptr<statements::prepared_statement>>(std::move(prepared));
+            }).then([&key, &id_getter] (auto prep_ptr) {
+                return make_ready_future<::shared_ptr<cql_transport::messages::result_message::prepared>>(::make_shared<ResultMsgType>(id_getter(key), std::move(prep_ptr)));
+            }).handle_exception_type([&query_string] (typename prepared_statements_cache::statement_is_too_big&) {
+                return make_exception_future<::shared_ptr<cql_transport::messages::result_message::prepared>>(prepared_statement_is_too_big(query_string));
+            });
+        });
+    };
+
+    template <typename ResultMsgType, typename KeyGenerator, typename IdGetter>
+    ::shared_ptr<cql_transport::messages::result_message::prepared>
+    get_stored_prepared_statement_one(const std::experimental::string_view& query_string, const sstring& keyspace, KeyGenerator&& key_gen, IdGetter&& id_getter)
+    {
+        auto cache_key = key_gen(query_string, keyspace);
+        auto it = _prepared_cache.find(cache_key);
+        if (it == _prepared_cache.end()) {
+            return ::shared_ptr<cql_transport::messages::result_message::prepared>();
+        }
+
+        return ::make_shared<ResultMsgType>(id_getter(cache_key), *it);
+    }
+
    ::shared_ptr<cql_transport::messages::result_message::prepared>
    get_stored_prepared_statement(const std::experimental::string_view& query_string, const sstring& keyspace, bool for_thrift);

-    future<::shared_ptr<cql_transport::messages::result_message::prepared>>
-    store_prepared_statement(const std::experimental::string_view& query_string, const sstring& keyspace, std::unique_ptr<statements::prepared_statement> prepared, bool for_thrift);
-
-    // Erases the statements for which filter returns true.
-    template <typename Pred>
-    void invalidate_prepared_statements(Pred filter) {
-        static_assert(std::is_same<bool, std::result_of_t<Pred(::shared_ptr<cql_statement>)>>::value,
-                      "bad Pred signature");
-        for (auto it = _prepared_statements.begin(); it != _prepared_statements.end(); ) {
-            if (filter(it->second->statement)) {
-                it = _prepared_statements.erase(it);
-            } else {
-                ++it;
-            }
-        }
-        for (auto it = _thrift_prepared_statements.begin(); it != _thrift_prepared_statements.end(); ) {
-            if (filter(it->second->statement)) {
-                it = _thrift_prepared_statements.erase(it);
-            } else {
-                ++it;
-            }
-        }
-    }
-
 #if 0
    public ResultMessage processPrepared(CQLStatement statement, QueryState queryState, QueryOptions options)
    throws RequestExecutionException, RequestValidationException
--- a/cql3/restrictions/single_column_primary_key_restrictions.hh
+++ b/cql3/restrictions/single_column_primary_key_restrictions.hh
@@ -101,6 +101,10 @@ public:
        return boost::algorithm::all_of(_restrictions->restrictions(), [b] (auto&& r) { return r.second->has_bound(b); });
    }

+    virtual bool is_inclusive(statements::bound b) const override {
+        return boost::algorithm::all_of(_restrictions->restrictions(), [b] (auto&& r) { return r.second->is_inclusive(b); });
+    }
+
    virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override {
        return _restrictions->uses_function(ks_name, function_name);
    }
--- a/cql3/statements/create_user_statement.cc
+++ b/cql3/statements/create_user_statement.cc
@@ -75,7 +75,7 @@ cql3::statements::create_user_statement::execute(distributed<service::storage_pr
                throw exceptions::invalid_request_exception(sprint("User %s already exists", _username));
            }
            if (exists && _if_not_exists) {
-                make_ready_future<::shared_ptr<cql_transport::messages::result_message>>();
+                return make_ready_future<::shared_ptr<cql_transport::messages::result_message>>();
            }
            return auth::authenticator::get().create(_username, _opts->options()).then([this] {
                return auth::auth::insert_user(_username, _superuser).then([] {
--- a/cql3/statements/delete_statement.cc
+++ b/cql3/statements/delete_statement.cc
@@ -106,6 +106,9 @@ delete_statement::prepare_internal(database& db, schema_ptr schema, shared_ptr<v
            || !stmt->restrictions()->get_clustering_columns_restrictions()->has_bound(bound::END)) {
        throw exceptions::invalid_request_exception("A range deletion operation needs to specify both bounds");
    }
+    if (!schema->is_compound() && stmt->restrictions()->get_clustering_columns_restrictions()->is_slice()) {
+        throw exceptions::invalid_request_exception("Range deletions on \"compact storage\" schemas are not supported");
+    }
    return stmt;
 }

--- a/database.cc
+++ b/database.cc
@@ -886,7 +886,8 @@ column_family::seal_active_streaming_memtable_immediate() {
                        return old->clear_gently();
                    }
                });
-            }).handle_exception([old] (auto ep) {
+            }).handle_exception([old, newtab] (auto ep) {
+                newtab->mark_for_deletion();
                dblog.error("failed to write streamed sstable: {}", ep);
                return make_exception_future<>(ep);
            });
@@ -924,7 +925,8 @@ future<> column_family::seal_active_streaming_memtable_big(streaming_memtable_bi
                auto&& priority = service::get_local_streaming_write_priority();
                return write_memtable_to_sstable(*old, newtab, incremental_backups_enabled(), priority, true, _config.background_writer_scheduling_group).then([this, newtab, old, &smb] {
                    smb.sstables.emplace_back(newtab);
-                }).handle_exception([] (auto ep) {
+                }).handle_exception([newtab] (auto ep) {
+                    newtab->mark_for_deletion();
                    dblog.error("failed to write streamed sstable: {}", ep);
                    return make_exception_future<>(ep);
                });
--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -64,8 +64,11 @@
 #include "db/config.hh"
 #include "md5_hasher.hh"

+#include <seastar/util/noncopyable_function.hh>
+
 #include <boost/algorithm/string/predicate.hpp>
 #include <boost/range/algorithm/copy.hpp>
+#include <boost/range/algorithm/transform.hpp>
 #include <boost/range/adaptor/map.hpp>
 #include <boost/range/join.hpp>

@@ -126,7 +129,11 @@ static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
    std::map<qualified_name, schema_mutations>&& views_before,
    std::map<qualified_name, schema_mutations>&& views_after);

-static void merge_types(distributed<service::storage_proxy>& proxy,
+struct user_types_to_drop final {
+    seastar::noncopyable_function<void()> drop;
+};
+
+static user_types_to_drop merge_types(distributed<service::storage_proxy>& proxy,
    schema_result&& before,
    schema_result&& after);

@@ -832,7 +839,7 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std:
 #endif

       std::set<sstring> keyspaces_to_drop = merge_keyspaces(proxy, std::move(old_keyspaces), std::move(new_keyspaces)).get0();
-       merge_types(proxy, std::move(old_types), std::move(new_types));
+       auto types_to_drop = merge_types(proxy, std::move(old_types), std::move(new_types));
       merge_tables_and_views(proxy,
            std::move(old_column_families), std::move(new_column_families),
            std::move(old_views), std::move(new_views));
@@ -840,6 +847,8 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std:
       mergeFunctions(oldFunctions, newFunctions);
       mergeAggregates(oldAggregates, newAggregates);
 #endif
+       types_to_drop.drop();
+
       proxy.local().get_db().invoke_on_all([keyspaces_to_drop = std::move(keyspaces_to_drop)] (database& db) {
           // it is safe to drop a keyspace only when all nested ColumnFamilies where deleted
           return do_for_each(keyspaces_to_drop, [&db] (auto keyspace_to_drop) {
@@ -996,30 +1005,37 @@ static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
    }).get();
 }

-static inline void collect_types(std::set<sstring>& keys, schema_result& result, std::vector<user_type>& to)
+struct naked_user_type {
+    const sstring keyspace;
+    const sstring qualified_name;
+};
+
+static inline void collect_types(std::set<sstring>& keys, schema_result& result, std::vector<naked_user_type>& to)
 {
    for (auto&& key : keys) {
        auto&& value = result[key];
        auto types = create_types_from_schema_partition(schema_result_value_type{key, std::move(value)});
-        std::move(types.begin(), types.end(), std::back_inserter(to));
+        boost::transform(types, std::back_inserter(to), [] (user_type type) {
+            return naked_user_type{std::move(type->_keyspace), std::move(type->name())};
+        });
    }
 }

- // see the comments for merge_keyspaces()
-static void merge_types(distributed<service::storage_proxy>& proxy, schema_result&& before, schema_result&& after)
+// see the comments for merge_keyspaces()
+static user_types_to_drop merge_types(distributed<service::storage_proxy>& proxy, schema_result&& before, schema_result&& after)
 {
-    std::vector<user_type> created, altered, dropped;
+    std::vector<naked_user_type> created, altered, dropped;

    auto diff = difference(before, after, indirect_equal_to<lw_shared_ptr<query::result_set>>());

    collect_types(diff.entries_only_on_left, before, dropped); // Keyspaces with no more types
    collect_types(diff.entries_only_on_right, after, created); // New keyspaces with types

-    for (auto&& key : diff.entries_differing) {
+    for (auto&& keyspace : diff.entries_differing) {
        // The user types of this keyspace differ, so diff the current types with the updated ones
-        auto current_types = proxy.local().get_db().local().find_keyspace(key).metadata()->user_types()->get_all_types();
+        auto current_types = proxy.local().get_db().local().find_keyspace(keyspace).metadata()->user_types()->get_all_types();
        decltype(current_types) updated_types;
-        auto ts = create_types_from_schema_partition(schema_result_value_type{key, std::move(after[key])});
+        auto ts = create_types_from_schema_partition(schema_result_value_type{keyspace, std::move(after[keyspace])});
        updated_types.reserve(ts.size());
        for (auto&& type : ts) {
            updated_types[type->_name] = std::move(type);
@@ -1027,36 +1043,46 @@ static void merge_types(distributed<service::storage_proxy>& proxy, schema_resul

        auto delta = difference(current_types, updated_types, indirect_equal_to<user_type>());

-        for (auto&& key : delta.entries_only_on_left) {
-            dropped.emplace_back(current_types[key]);
+        for (auto&& type_name : delta.entries_only_on_left) {
+            dropped.emplace_back(naked_user_type{keyspace, current_types[type_name]->name()});
        }
-        for (auto&& key : delta.entries_only_on_right) {
-            created.emplace_back(std::move(updated_types[key]));
+        for (auto&& type_name : delta.entries_only_on_right) {
+            created.emplace_back(naked_user_type{keyspace, updated_types[type_name]->name()});
        }
-        for (auto&& key : delta.entries_differing) {
-            altered.emplace_back(std::move(updated_types[key]));
+        for (auto&& type_name : delta.entries_differing) {
+            altered.emplace_back(naked_user_type{keyspace, updated_types[type_name]->name()});
        }
    }

-    proxy.local().get_db().invoke_on_all([&created, &dropped, &altered] (database& db) {
+    // Create and update user types before any tables/views are created that potentially
+    // use those types. Similarly, defer dropping until after tables/views that may use
+    // some of these user types are dropped.
+
+    proxy.local().get_db().invoke_on_all([&created, &altered] (database& db) {
        return seastar::async([&] {
            for (auto&& type : created) {
-                auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type->name()));
+                auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type.qualified_name));
                db.find_keyspace(user_type->_keyspace).add_user_type(user_type);
                service::get_local_migration_manager().notify_create_user_type(user_type).get();
            }
-            for (auto&& type : dropped) {
-                auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type->name()));
-                db.find_keyspace(user_type->_keyspace).remove_user_type(user_type);
-                service::get_local_migration_manager().notify_drop_user_type(user_type).get();
-            }
            for (auto&& type : altered) {
-                auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type->name()));
+                auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type.qualified_name));
                db.find_keyspace(user_type->_keyspace).add_user_type(user_type);
                service::get_local_migration_manager().notify_update_user_type(user_type).get();
            }
        });
    }).get();
+
+    return user_types_to_drop{[&proxy, dropped = std::move(dropped)] {
+        proxy.local().get_db().invoke_on_all([dropped = std::move(dropped)](database& db) {
+            return do_for_each(dropped, [&db](auto& user_type_to_drop) {
+                auto user_type = dynamic_pointer_cast<const user_type_impl>(
+                        parse_type(std::move(user_type_to_drop.qualified_name)));
+                db.find_keyspace(user_type->_keyspace).remove_user_type(user_type);
+                return service::get_local_migration_manager().notify_drop_user_type(user_type);
+            });
+        }).get();
+    }};
 }

 #if 0
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -194,13 +194,13 @@ public:
            : _view(std::move(view))
            , _view_info(*_view->view_info())
            , _base(std::move(base))
-            , _updates(8, partition_key::hashing(*_base), partition_key::equality(*_base)) {
+            , _updates(8, partition_key::hashing(*_view), partition_key::equality(*_view)) {
    }

    void move_to(std::vector<mutation>& mutations) && {
        auto& partitioner = dht::global_partitioner();
        std::transform(_updates.begin(), _updates.end(), std::back_inserter(mutations), [&, this] (auto&& m) {
-            return mutation(_view, partitioner.decorate_key(*_base, std::move(m.first)), std::move(m.second));
+            return mutation(_view, partitioner.decorate_key(*_view, std::move(m.first)), std::move(m.second));
        });
    }

--- a/dht/boot_strapper.cc
+++ b/dht/boot_strapper.cc
@@ -59,14 +59,11 @@ future<> boot_strapper::bootstrap() {
        streamer->add_ranges(keyspace_name, ranges);
    }

-    return streamer->fetch_async().then_wrapped([streamer] (auto&& f) {
-        try {
-            auto state = f.get0();
-        } catch (...) {
-            throw std::runtime_error(sprint("Error during boostrap: %s", std::current_exception()));
-        }
+    return streamer->stream_async().then([streamer] () {
        service::get_local_storage_service().finish_bootstrapping();
-        return make_ready_future<>();
+    }).handle_exception([streamer] (std::exception_ptr eptr) {
+        blogger.warn("Eror during bootstrap: {}", eptr);
+        return make_exception_future<>(std::move(eptr));
    });
 }

--- a/dht/range_streamer.cc
+++ b/dht/range_streamer.cc
@@ -210,7 +210,36 @@ bool range_streamer::use_strict_sources_for_ranges(const sstring& keyspace_name)
           && _metadata.get_all_endpoints().size() != strat.get_replication_factor();
 }

+// Registers ranges to send to each endpoint for keyspace_name; throws before changing any state.
+void range_streamer::add_tx_ranges(const sstring& keyspace_name, std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint, std::vector<sstring> column_families) {
+    if (_nr_rx_added) {
+        throw std::runtime_error("Mixed sending and receiving is not supported");
+    }
+    if (!_column_families.emplace(keyspace_name, std::move(column_families)).second) {
+        throw std::runtime_error("Can not add column_families for the same keyspace more than once");
+    }
+    _nr_tx_added++;
+    _to_stream.emplace(keyspace_name, std::move(ranges_per_endpoint));
+}
+
+// Registers ranges to request from each endpoint for keyspace_name; throws before changing any state.
+void range_streamer::add_rx_ranges(const sstring& keyspace_name, std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint, std::vector<sstring> column_families) {
+    if (_nr_tx_added) {
+        throw std::runtime_error("Mixed sending and receiving is not supported");
+    }
+    if (!_column_families.emplace(keyspace_name, std::move(column_families)).second) {
+        throw std::runtime_error("Can not add column_families for the same keyspace more than once");
+    }
+    _nr_rx_added++;
+    _to_stream.emplace(keyspace_name, std::move(ranges_per_endpoint));
+}
+
+// TODO: This is the legacy range_streamer interface; like add_rx_ranges, it adds rx ranges.
 void range_streamer::add_ranges(const sstring& keyspace_name, dht::token_range_vector ranges) {
+    if (_nr_tx_added) {
+        throw std::runtime_error("Mixed sending and receiving is not supported");
+    }
+    _nr_rx_added++;
    auto ranges_for_keyspace = use_strict_sources_for_ranges(keyspace_name)
        ? get_all_ranges_with_strict_sources_for(keyspace_name, ranges)
        : get_all_ranges_with_sources_for(keyspace_name, ranges);
@@ -231,26 +260,114 @@ void range_streamer::add_ranges(const sstring& keyspace_name, dht::token_range_v
            logger.debug("{} : range {} from source {} for keyspace {}", _description, x.second, x.first, keyspace_name);
        }
    }
-    _to_fetch.emplace(keyspace_name, std::move(range_fetch_map));
+    _to_stream.emplace(keyspace_name, std::move(range_fetch_map));
 }

-future<streaming::stream_state> range_streamer::fetch_async() {
-    for (auto& fetch : _to_fetch) {
-        const auto& keyspace = fetch.first;
-        for (auto& x : fetch.second) {
-            auto& source = x.first;
-            auto& ranges = x.second;
-            /* Send messages to respective folks to stream data over to me */
-            if (logger.is_enabled(logging::log_level::debug)) {
-                logger.debug("{}ing from {} ranges {}", _description, source, ranges);
+future<> range_streamer::stream_async() { // runs do_stream_async() in a seastar thread, retrying on failure
+    return seastar::async([this] {
+        int sleep_time = 60; // seconds; multiplied by 1.5 after every failed pass
+        for (;;) {
+            try {
+                do_stream_async().get();
+                break;
+            } catch (...) {
+                if (++_nr_retried >= _nr_max_retry) { // budget spent: rethrow now, no "will retry" log or sleep
+                    throw;
+                }
+                logger.warn("{} failed to stream. Will retry in {} seconds ...", _description, sleep_time);
+                sleep_abortable(std::chrono::seconds(sleep_time)).get();
+                sleep_time *= 1.5;
             }
-            _stream_plan.request_ranges(source, keyspace, ranges);
+        }
+    });
+}
+
+future<> range_streamer::do_stream_async() {
+    auto nr_ranges_remaining = nr_ranges_to_stream();
+    logger.info("{} starts, nr_ranges_remaining={}", _description, nr_ranges_remaining);
+    auto start = lowres_clock::now();
+    return do_for_each(_to_stream, [this, start, description = _description] (auto& stream) {
+        const auto& keyspace = stream.first;
+        auto& ip_range_vec = stream.second;
+        // Fetch from or send to peer node in parallel
+        return parallel_for_each(ip_range_vec, [this, description, keyspace] (auto& ip_range) {
+            auto& source = ip_range.first;
+            auto& range_vec = ip_range.second;
+            return seastar::async([this, description, keyspace, source, &range_vec] () mutable {
+                // TODO: It would be better to use a fiber instead of a thread here, because
+                // creating a thread per peer can consume significant memory in a large cluster.
+                auto start_time = lowres_clock::now();
+                unsigned sp_index = 0;
+                unsigned nr_ranges_streamed = 0;
+                size_t nr_ranges_total = range_vec.size();
+                size_t nr_ranges_per_stream_plan = nr_ranges_total / 10;
+                dht::token_range_vector ranges_to_stream;
+                auto do_streaming = [&] {
+                    auto sp = stream_plan(sprint("%s-%s-index-%d", description, keyspace, sp_index++));
+                    logger.info("{} with {} for keyspace={}, {} out of {} ranges: ranges = {}",
+                            description, source, keyspace, nr_ranges_streamed, nr_ranges_total, ranges_to_stream.size());
+                    if (_nr_rx_added) {
+                        sp.request_ranges(source, keyspace, ranges_to_stream, _column_families[keyspace]);
+                    } else if (_nr_tx_added) {
+                        sp.transfer_ranges(source, keyspace, ranges_to_stream, _column_families[keyspace]);
+                    }
+                    sp.execute().discard_result().get();
+                    ranges_to_stream.clear();
+                };
+                try {
+                    for (auto it = range_vec.begin(); it < range_vec.end();) {
+                        ranges_to_stream.push_back(*it);
+                        it = range_vec.erase(it);
+                        nr_ranges_streamed++;
+                        if (ranges_to_stream.size() < nr_ranges_per_stream_plan) {
+                            continue;
+                        } else {
+                            do_streaming();
+                        }
+                    }
+                    if (ranges_to_stream.size() > 0) {
+                        do_streaming();
+                    }
+                } catch (...) {
+                    for (auto& range : ranges_to_stream) {
+                        range_vec.push_back(range);
+                    }
+                    auto t = std::chrono::duration_cast<std::chrono::seconds>(lowres_clock::now() - start_time).count();
+                    logger.warn("{} with {} for keyspace={} failed, took {} seconds: {}", description, source, keyspace, t, std::current_exception());
+                    throw;
+                }
+                auto t = std::chrono::duration_cast<std::chrono::seconds>(lowres_clock::now() - start_time).count();
+                logger.info("{} with {} for keyspace={} succeeded, took {} seconds", description, source, keyspace, t);
+            });
+
+        });
+    }).finally([this, start] {
+        auto t = std::chrono::duration_cast<std::chrono::seconds>(lowres_clock::now() - start).count();
+        auto nr_ranges_remaining = nr_ranges_to_stream();
+        if (nr_ranges_remaining) {
+            logger.warn("{} failed, took {} seconds, nr_ranges_remaining={}", _description, t, nr_ranges_remaining);
+        } else {
+            logger.info("{} succeeded, took {} seconds, nr_ranges_remaining={}", _description, t, nr_ranges_remaining);
+        }
+    });
+}
+
+size_t range_streamer::nr_ranges_to_stream() {
+    size_t nr_ranges_remaining = 0;
+    for (auto& fetch : _to_stream) {
+        const auto& keyspace = fetch.first;
+        auto& ip_range_vec = fetch.second;
+        for (auto& ip_range : ip_range_vec) {
+            auto& source = ip_range.first;
+            auto& range_vec = ip_range.second;
+            nr_ranges_remaining += range_vec.size();
+            logger.debug("Remaining: keyspace={}, source={}, ranges={}", keyspace, source, range_vec);
        }
    }
-
-    return _stream_plan.execute();
+    return nr_ranges_remaining;
 }

+
 std::unordered_multimap<inet_address, dht::token_range>
 range_streamer::get_work_map(const std::unordered_multimap<dht::token_range, inet_address>& ranges_with_source_target,
             const sstring& keyspace) {
--- a/dht/range_streamer.hh
+++ b/dht/range_streamer.hh
@@ -119,6 +119,8 @@ public:
    }

    void add_ranges(const sstring& keyspace_name, dht::token_range_vector ranges);
+    void add_tx_ranges(const sstring& keyspace_name, std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint, std::vector<sstring> column_families = {});
+    void add_rx_ranges(const sstring& keyspace_name, std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint, std::vector<sstring> column_families = {});
 private:
    bool use_strict_sources_for_ranges(const sstring& keyspace_name);
    /**
@@ -159,16 +161,25 @@ public:
    }
 #endif
 public:
-    future<streaming::stream_state> fetch_async();
+    future<> stream_async();
+    future<> do_stream_async();
+    size_t nr_ranges_to_stream();
 private:
    distributed<database>& _db;
    token_metadata& _metadata;
    std::unordered_set<token> _tokens;
    inet_address _address;
    sstring _description;
-    std::unordered_multimap<sstring, std::unordered_map<inet_address, dht::token_range_vector>> _to_fetch;
+    std::unordered_multimap<sstring, std::unordered_map<inet_address, dht::token_range_vector>> _to_stream;
    std::unordered_set<std::unique_ptr<i_source_filter>> _source_filters;
    stream_plan _stream_plan;
+    std::unordered_map<sstring, std::vector<sstring>> _column_families;
+    // Retry the stream plan _nr_max_retry times
+    unsigned _nr_retried = 0;
+    unsigned _nr_max_retry = 5;
+    // Number of tx and rx ranges added
+    unsigned _nr_tx_added = 0;
+    unsigned _nr_rx_added = 0;
 };

 } // dht
--- a/dist/ami/files/scylla-ami
+++ b/dist/ami/files/scylla-ami
--- a/dist/common/modprobe.d/scylla-raid0.conf
+++ b/dist/common/modprobe.d/scylla-raid0.conf
@@ -1 +0,0 @@
-options raid0 devices_discard_performance=Y
--- a/dist/common/systemd/scylla-housekeeping-daily.service.in
+++ b/dist/common/systemd/scylla-housekeeping-daily.service.in
@@ -6,7 +6,7 @@ After=network.target
 Type=simple
 User=scylla
 Group=scylla
-ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid --repo-files '/etc/yum.repos.d/scylla*.repo' -q -c /etc/scylla.d/housekeeping.cfg version --mode d
+ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files @@REPOFILES@@ version --mode d

 [Install]
 WantedBy=multi-user.target
--- a/dist/common/systemd/scylla-housekeeping-restart.service.in
+++ b/dist/common/systemd/scylla-housekeeping-restart.service.in
@@ -6,7 +6,7 @@ After=network.target
 Type=simple
 User=scylla
 Group=scylla
-ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q --repo-files '/etc/yum.repos.d/scylla*.repo' -c /etc/scylla.d/housekeeping.cfg version --mode r
+ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files @@REPOFILES@@ version --mode r

 [Install]
 WantedBy=multi-user.target
--- a/dist/debian/build_deb.sh
+++ b/dist/debian/build_deb.sh
@@ -196,8 +196,10 @@ else
 fi
 cp dist/common/systemd/scylla-server.service.in debian/scylla-server.service
 sed -i -e "s#@@SYSCONFDIR@@#/etc/default#g" debian/scylla-server.service
-cp dist/common/systemd/scylla-housekeeping-daily.service debian/scylla-server.scylla-housekeeping-daily.service
-cp dist/common/systemd/scylla-housekeeping-restart.service debian/scylla-server.scylla-housekeeping-restart.service
+cp dist/common/systemd/scylla-housekeeping-daily.service.in debian/scylla-server.scylla-housekeeping-daily.service
+sed -i -e "s#@@REPOFILES@@#'/etc/apt/sources.list.d/scylla*.list'#g" debian/scylla-server.scylla-housekeeping-daily.service
+cp dist/common/systemd/scylla-housekeeping-restart.service.in debian/scylla-server.scylla-housekeeping-restart.service
+sed -i -e "s#@@REPOFILES@@#'/etc/apt/sources.list.d/scylla*.list'#g" debian/scylla-server.scylla-housekeeping-restart.service
 cp dist/common/systemd/node-exporter.service debian/scylla-server.node-exporter.service

 if [ $REBUILD -eq 1 ]; then
--- a/dist/debian/control.in
+++ b/dist/debian/control.in
@@ -40,7 +40,7 @@ Description: Scylla kernel tuning configuration
 Package: scylla
 Section: metapackages
 Architecture: any
-Depends: scylla-server, scylla-jmx, scylla-tools, scylla-kernel-conf
+Depends: scylla-server, scylla-jmx, scylla-tools, scylla-tools-core, scylla-kernel-conf
 Description: Scylla database metapackage
 Scylla is a highly scalable, eventually consistent, distributed,
 partitioned row DB.
--- a/dist/redhat/build_rpm.sh
+++ b/dist/redhat/build_rpm.sh
@@ -104,9 +104,9 @@ fi


 if [ $JOBS -gt 0 ]; then
-    SRPM_OPTS="$SRPM_OPTS --define='_smp_mflags -j$JOBS'"
+    RPM_JOBS_OPTS=(--define="_smp_mflags -j$JOBS")
 fi
-sudo mock --buildsrpm --root=$TARGET --resultdir=`pwd`/build/srpms --spec=build/scylla.spec --sources=build/scylla-$VERSION.tar $SRPM_OPTS
+sudo mock --buildsrpm --root=$TARGET --resultdir=`pwd`/build/srpms --spec=build/scylla.spec --sources=build/scylla-$VERSION.tar $SRPM_OPTS "${RPM_JOBS_OPTS[@]}"
 if [ "$TARGET" = "epel-7-x86_64" ] && [ $REBUILD = 1 ]; then
    ./dist/redhat/centos_dep/build_dependency.sh
    sudo mock --init --root=$TARGET
@@ -116,4 +116,4 @@ elif [ "$TARGET" = "epel-7-x86_64" ] && [ $REBUILD = 0 ]; then
    TARGET=scylla-$TARGET
    RPM_OPTS="$RPM_OPTS --configdir=dist/redhat/mock"
 fi
-sudo mock --rebuild --root=$TARGET --resultdir=`pwd`/build/rpms $RPM_OPTS build/srpms/scylla-$VERSION*.src.rpm
+sudo mock --rebuild --root=$TARGET --resultdir=`pwd`/build/rpms $RPM_OPTS "${RPM_JOBS_OPTS[@]}" build/srpms/scylla-$VERSION*.src.rpm
--- a/dist/redhat/centos_dep/binutils.diff
+++ b/dist/redhat/centos_dep/binutils.diff
@@ -33,8 +33,8 @@
 Requires(post): coreutils
 -Requires(post): %{_sbindir}/alternatives
 -Requires(preun): %{_sbindir}/alternatives
-+Requires(post): /sbin/alternatives
-+Requires(preun): /sbin/alternatives
+Requires(post): /usr/sbin/alternatives
+Requires(preun): /usr/sbin/alternatives
 %endif
 
 # On ARM EABI systems, we do want -gnueabi to be part of the
@@ -58,13 +58,13 @@
 %if "%{build_gold}" == "both"
 %__rm -f %{_bindir}/%{?cross}ld
 -%{_sbindir}/alternatives --install %{_bindir}/%{?cross}ld %{?cross}ld \
-+/sbin/alternatives --install %{_bindir}/%{?cross}ld %{?cross}ld \
+/usr/sbin/alternatives --install %{_bindir}/%{?cross}ld %{?cross}ld \
   %{_bindir}/%{?cross}ld.bfd %{ld_bfd_priority}
 -%{_sbindir}/alternatives --install %{_bindir}/%{?cross}ld %{?cross}ld \
-+/sbin/alternatives --install %{_bindir}/%{?cross}ld %{?cross}ld \
+/usr/sbin/alternatives --install %{_bindir}/%{?cross}ld %{?cross}ld \
   %{_bindir}/%{?cross}ld.gold %{ld_gold_priority}
 -%{_sbindir}/alternatives --auto %{?cross}ld 
-+/sbin/alternatives --auto %{?cross}ld 
+/usr/sbin/alternatives --auto %{?cross}ld 
 %endif
 %if %{isnative}
 /sbin/ldconfig
@@ -74,8 +74,8 @@
 if [ $1 = 0 ]; then
 -  %{_sbindir}/alternatives --remove %{?cross}ld %{_bindir}/%{?cross}ld.bfd
 -  %{_sbindir}/alternatives --remove %{?cross}ld %{_bindir}/%{?cross}ld.gold
-+  /sbin/alternatives --remove %{?cross}ld %{_bindir}/%{?cross}ld.bfd
-+  /sbin/alternatives --remove %{?cross}ld %{_bindir}/%{?cross}ld.gold
+  /usr/sbin/alternatives --remove %{?cross}ld %{_bindir}/%{?cross}ld.bfd
+  /usr/sbin/alternatives --remove %{?cross}ld %{_bindir}/%{?cross}ld.gold
 fi
 %endif
 %if %{isnative}
--- a/dist/redhat/scylla.spec.in
+++ b/dist/redhat/scylla.spec.in
@@ -7,14 +7,14 @@ Group:          Applications/Databases
 License:        AGPLv3
 URL:            http://www.scylladb.com/
 Source0:        %{name}-@@VERSION@@-@@RELEASE@@.tar
-Requires:       scylla-server = @@VERSION@@ scylla-jmx = @@VERSION@@ scylla-tools = @@VERSION@@ scylla-kernel-conf = @@VERSION@@
+Requires:       scylla-server = @@VERSION@@ scylla-jmx = @@VERSION@@ scylla-tools = @@VERSION@@ scylla-tools-core = @@VERSION@@ scylla-kernel-conf = @@VERSION@@
 Obsoletes:	scylla-server < 1.1

 %description
 Scylla is a highly scalable, eventually consistent, distributed,
 partitioned row DB.
 This package installs all required packages for ScyllaDB,  including
-scylla-server, scylla-jmx, scylla-tools.
+scylla-server, scylla-jmx, scylla-tools, scylla-tools-core.

 # this is needed to prevent python compilation error on CentOS (#2235)
 %if 0%{?rhel}
@@ -78,6 +78,10 @@ python3.4 ./configure.py --enable-dpdk --mode=release --static-stdc++ --static-b
 ninja-build %{?_smp_mflags} build/release/scylla build/release/iotune
 cp dist/common/systemd/scylla-server.service.in build/scylla-server.service
 sed -i -e "s#@@SYSCONFDIR@@#/etc/sysconfig#g" build/scylla-server.service
+cp dist/common/systemd/scylla-housekeeping-restart.service.in build/scylla-housekeeping-restart.service
+sed -i -e "s#@@REPOFILES@@#'/etc/yum.repos.d/scylla*.repo'#g" build/scylla-housekeeping-restart.service
+cp dist/common/systemd/scylla-housekeeping-daily.service.in build/scylla-housekeeping-daily.service
+sed -i -e "s#@@REPOFILES@@#'/etc/yum.repos.d/scylla*.repo'#g" build/scylla-housekeeping-daily.service

 %install
 rm -rf $RPM_BUILD_ROOT
@@ -88,9 +92,6 @@ mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/security/limits.d/
 mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/collectd.d/
 mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
 mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/scylla.d/
-%if 0%{?rhel}
-mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/
-%endif
 mkdir -p $RPM_BUILD_ROOT%{_sysctldir}/
 mkdir -p $RPM_BUILD_ROOT%{_docdir}/scylla/
 mkdir -p $RPM_BUILD_ROOT%{_unitdir}
@@ -101,9 +102,6 @@ install -m644 dist/common/limits.d/scylla.conf $RPM_BUILD_ROOT%{_sysconfdir}/sec
 install -m644 dist/common/collectd.d/scylla.conf $RPM_BUILD_ROOT%{_sysconfdir}/collectd.d/
 install -m644 dist/common/scylla.d/*.conf $RPM_BUILD_ROOT%{_sysconfdir}/scylla.d/
 install -m644 dist/common/sysctl.d/*.conf $RPM_BUILD_ROOT%{_sysctldir}/
-%if 0%{?rhel}
-install -m644 dist/common/modprobe.d/*.conf $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/
-%endif
 install -d -m755 $RPM_BUILD_ROOT%{_sysconfdir}/scylla
 install -m644 conf/scylla.yaml $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
 install -m644 conf/cassandra-rackdc.properties $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
@@ -267,18 +265,9 @@ if Scylla is the main application on your server and you wish to optimize its la
 # We cannot use the sysctl_apply rpm macro because it is not present in 7.0
 # following is a "manual" expansion
 /usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
-# Write modprobe.d params when module already loaded
-%if 0%{?rhel}
-if [ -e /sys/module/raid0/parameters/devices_discard_performance ]; then
-    echo Y > /sys/module/raid0/parameters/devices_discard_performance
-fi
-%endif

 %files kernel-conf
 %defattr(-,root,root)
-%if 0%{?rhel}
-%config(noreplace) %{_sysconfdir}/modprobe.d/*.conf
-%endif
 %{_sysctldir}/*.conf

 %changelog
--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -461,7 +461,8 @@ future<> gossiper::apply_state_locally(std::map<inet_address, endpoint_state> ma
                    int local_generation = local_ep_state_ptr.get_heart_beat_state().get_generation();
                    int remote_generation = remote_state.get_heart_beat_state().get_generation();
                    logger.trace("{} local generation {}, remote generation {}", ep, local_generation, remote_generation);
-                    if (local_generation != 0 && remote_generation > local_generation + MAX_GENERATION_DIFFERENCE) {
+                    // A node that was removed with nodetool removenode can have a generation of 2
+                    if (local_generation > 2 && remote_generation > local_generation + MAX_GENERATION_DIFFERENCE) {
                        // assume some peer has corrupted memory and is broadcasting an unbelievable generation about another peer (or itself)
                        logger.warn("received an invalid gossip generation for peer {}; local generation = {}, received generation = {}",
                            ep, local_generation, remote_generation);
@@ -832,6 +833,7 @@ int gossiper::get_max_endpoint_state_version(endpoint_state state) {

 // Runs inside seastar::async context
 void gossiper::evict_from_membership(inet_address endpoint) {
+    auto permit = lock_endpoint(endpoint).get0();
    _unreachable_endpoints.erase(endpoint);
    container().invoke_on_all([endpoint] (auto& g) {
        g.endpoint_state_map.erase(endpoint);
@@ -982,7 +984,7 @@ future<> gossiper::assassinate_endpoint(sstring address) {
            logger.warn("Assassinating {} via gossip", endpoint);
            if (es) {
                auto& ss = service::get_local_storage_service();
-                auto tokens = ss.get_token_metadata().get_tokens(endpoint);
+                tokens = ss.get_token_metadata().get_tokens(endpoint);
                if (tokens.empty()) {
                    logger.warn("Unable to calculate tokens for {}.  Will use a random one", address);
                    throw std::runtime_error(sprint("Unable to calculate tokens for %s", endpoint));
--- a/locator/ec2_multi_region_snitch.cc
+++ b/locator/ec2_multi_region_snitch.cc
@@ -100,7 +100,6 @@ future<> ec2_multi_region_snitch::gossiper_starting() {
    // Note: currently gossiper "main" instance always runs on CPU0 therefore
    // this function will be executed on CPU0 only.
    //
-    ec2_snitch::gossiper_starting();

    using namespace gms;
    auto& g = get_local_gossiper();
--- a/locator/token_metadata.cc
+++ b/locator/token_metadata.cc
@@ -110,7 +110,11 @@ void token_metadata::update_normal_tokens(std::unordered_map<inet_address, std::
        inet_address endpoint = i.first;
        std::unordered_set<token>& tokens = i.second;

-        assert(!tokens.empty());
+        if (tokens.empty()) {
+            auto msg = sprint("tokens is empty in update_normal_tokens");
+            tlogger.error("{}", msg);
+            throw std::runtime_error(msg);
+        }

        for(auto it = _token_to_endpoint_map.begin(), ite = _token_to_endpoint_map.end(); it != ite;) {
            if(it->second == endpoint) {
@@ -141,7 +145,11 @@ void token_metadata::update_normal_tokens(std::unordered_map<inet_address, std::
 }

 size_t token_metadata::first_token_index(const token& start) const {
-    assert(_sorted_tokens.size() > 0);
+    if (_sorted_tokens.empty()) {
+        auto msg = sprint("sorted_tokens is empty in first_token_index!");
+        tlogger.error("{}", msg);
+        throw std::runtime_error(msg);
+    }
    auto it = std::lower_bound(_sorted_tokens.begin(), _sorted_tokens.end(), start);
    if (it == _sorted_tokens.end()) {
        return 0;
@@ -292,7 +300,11 @@ void token_metadata::add_bootstrap_tokens(std::unordered_set<token> tokens, inet
 }

 void token_metadata::remove_bootstrap_tokens(std::unordered_set<token> tokens) {
-    assert(!tokens.empty());
+    if (tokens.empty()) {
+        auto msg = sprint("tokens is empty in remove_bootstrap_tokens!");
+        tlogger.error("{}", msg);
+        throw std::runtime_error(msg);
+    }
    for (auto t : tokens) {
        _bootstrap_tokens.erase(t);
    }
@@ -320,7 +332,11 @@ void token_metadata::remove_from_moving(inet_address endpoint) {
 token token_metadata::get_predecessor(token t) {
    auto& tokens = sorted_tokens();
    auto it = std::lower_bound(tokens.begin(), tokens.end(), t);
-    assert(it != tokens.end() && *it == t);
+    if (it == tokens.end() || *it != t) {
+        auto msg = sprint("token error in get_predecessor!");
+        tlogger.error("{}", msg);
+        throw std::runtime_error(msg);
+    }
    if (it == tokens.begin()) {
        // If the token is the first element, its preprocessor is the last element
        return tokens.back();
--- a/message/messaging_service.cc
+++ b/message/messaging_service.cc
@@ -514,7 +514,6 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
    }();

    auto remote_addr = ipv4_addr(get_preferred_ip(id.addr).raw_addr(), must_encrypt ? _ssl_port : _port);
-    auto local_addr = ipv4_addr{_listen_address.raw_addr(), 0};

    rpc::client_options opts;
    // send keepalive messages each minute if connection is idle, drop connection after 10 failures
@@ -526,9 +525,9 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge

    auto client = must_encrypt ?
                    ::make_shared<rpc_protocol_client_wrapper>(*_rpc, std::move(opts),
-                                    remote_addr, local_addr, _credentials) :
+                                    remote_addr, ipv4_addr(), _credentials) :
                    ::make_shared<rpc_protocol_client_wrapper>(*_rpc, std::move(opts),
-                                    remote_addr, local_addr);
+                                    remote_addr);

    it = _clients[idx].emplace(id, shard_info(std::move(client))).first;
    uint32_t src_cpu_id = engine().cpu_id();
@@ -640,59 +639,6 @@ auto send_message_timeout(messaging_service* ms, messaging_verb verb, msg_addr i
    });
 }

-template <typename MsgIn, typename... MsgOut>
-auto send_message_timeout_and_retry(messaging_service* ms, messaging_verb verb, msg_addr id,
-        std::chrono::seconds timeout, int nr_retry, std::chrono::seconds wait, MsgOut... msg) {
-    using MsgInTuple = typename futurize_t<MsgIn>::value_type;
-    return do_with(int(nr_retry), std::move(msg)..., [ms, verb, id, timeout, wait, nr_retry] (auto& retry, const auto&... messages) {
-        return repeat_until_value([ms, verb, id, timeout, wait, nr_retry, &retry, &messages...] {
-            return send_message_timeout<MsgIn>(ms, verb, id, timeout, messages...).then_wrapped(
-                    [ms, verb, id, timeout, wait, nr_retry, &retry] (auto&& f) mutable {
-                auto vb = int(verb);
-                try {
-                    MsgInTuple ret = f.get();
-                    if (retry != nr_retry) {
-                        mlogger.info("Retry verb={} to {}, retry={}: OK", vb, id, retry);
-                    }
-                    return make_ready_future<stdx::optional<MsgInTuple>>(std::move(ret));
-                } catch (rpc::timeout_error) {
-                    mlogger.info("Retry verb={} to {}, retry={}: timeout in {} seconds", vb, id, retry, timeout.count());
-                    throw;
-                } catch (rpc::closed_error) {
-                    mlogger.info("Retry verb={} to {}, retry={}: {}", vb, id, retry, std::current_exception());
-                    // Stop retrying if retry reaches 0 or message service is shutdown
-                    // or the remote node is removed from gossip (on_remove())
-                    retry--;
-                    if (retry == 0) {
-                        mlogger.debug("Retry verb={} to {}, retry={}: stop retrying: retry == 0", vb, id, retry);
-                        throw;
-                    }
-                    if (ms->is_stopping()) {
-                        mlogger.debug("Retry verb={} to {}, retry={}: stop retrying: messaging_service is stopped",
-                                     vb, id, retry);
-                        throw;
-                    }
-                    if (!gms::get_local_gossiper().is_known_endpoint(id.addr)) {
-                        mlogger.debug("Retry verb={} to {}, retry={}: stop retrying: node is removed from the cluster",
-                                     vb, id, retry);
-                        throw;
-                    }
-                    return sleep_abortable(wait).then([] {
-                        return make_ready_future<stdx::optional<MsgInTuple>>(stdx::nullopt);
-                    }).handle_exception([vb, id, retry] (std::exception_ptr ep) {
-                        mlogger.debug("Retry verb={} to {}, retry={}: stop retrying: {}", vb, id, retry, ep);
-                        return make_exception_future<stdx::optional<MsgInTuple>>(ep);
-                    });
-                } catch (...) {
-                    throw;
-                }
-            });
-        }).then([ms = ms->shared_from_this()] (MsgInTuple result) {
-            return futurize<MsgIn>::from_tuple(std::move(result));
-        });
-    });
-}
-
 // Send one way message for verb
 template <typename... MsgOut>
 auto send_message_oneway(messaging_service* ms, messaging_verb verb, msg_addr id, MsgOut&&... msg) {
@@ -707,13 +653,6 @@ auto send_message_oneway_timeout(messaging_service* ms, Timeout timeout, messagi

 // Wrappers for verbs

-// Retransmission parameters for streaming verbs.
-// A stream plan gives up retrying in 10*30 + 10*60 seconds (15 minutes) at
-// most, 10*30 seconds (5 minutes) at least.
-static constexpr int streaming_nr_retry = 10;
-static constexpr std::chrono::seconds streaming_timeout{10*60};
-static constexpr std::chrono::seconds streaming_wait_before_retry{30};
-
 // PREPARE_MESSAGE
 void messaging_service::register_prepare_message(std::function<future<streaming::prepare_message> (const rpc::client_info& cinfo,
        streaming::prepare_message msg, UUID plan_id, sstring description)>&& func) {
@@ -721,8 +660,7 @@ void messaging_service::register_prepare_message(std::function<future<streaming:
 }
 future<streaming::prepare_message> messaging_service::send_prepare_message(msg_addr id, streaming::prepare_message msg, UUID plan_id,
        sstring description) {
-    return send_message_timeout_and_retry<streaming::prepare_message>(this, messaging_verb::PREPARE_MESSAGE, id,
-        streaming_timeout, streaming_nr_retry, streaming_wait_before_retry,
+    return send_message<streaming::prepare_message>(this, messaging_verb::PREPARE_MESSAGE, id,
        std::move(msg), plan_id, std::move(description));
 }

@@ -731,8 +669,7 @@ void messaging_service::register_prepare_done_message(std::function<future<> (co
    register_handler(this, messaging_verb::PREPARE_DONE_MESSAGE, std::move(func));
 }
 future<> messaging_service::send_prepare_done_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id) {
-    return send_message_timeout_and_retry<void>(this, messaging_verb::PREPARE_DONE_MESSAGE, id,
-        streaming_timeout, streaming_nr_retry, streaming_wait_before_retry,
+    return send_message<void>(this, messaging_verb::PREPARE_DONE_MESSAGE, id,
        plan_id, dst_cpu_id);
 }

@@ -741,8 +678,7 @@ void messaging_service::register_stream_mutation(std::function<future<> (const r
    register_handler(this, messaging_verb::STREAM_MUTATION, std::move(func));
 }
 future<> messaging_service::send_stream_mutation(msg_addr id, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, bool fragmented) {
-    return send_message_timeout_and_retry<void>(this, messaging_verb::STREAM_MUTATION, id,
-        streaming_timeout, streaming_nr_retry, streaming_wait_before_retry,
+    return send_message<void>(this, messaging_verb::STREAM_MUTATION, id,
        plan_id, std::move(fm), dst_cpu_id, fragmented);
 }

@@ -757,19 +693,17 @@ void messaging_service::register_stream_mutation_done(std::function<future<> (co
    });
 }
 future<> messaging_service::send_stream_mutation_done(msg_addr id, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id) {
-    return send_message_timeout_and_retry<void>(this, messaging_verb::STREAM_MUTATION_DONE, id,
-        streaming_timeout, streaming_nr_retry, streaming_wait_before_retry,
+    return send_message<void>(this, messaging_verb::STREAM_MUTATION_DONE, id,
        plan_id, std::move(ranges), cf_id, dst_cpu_id);
 }

 // COMPLETE_MESSAGE
-void messaging_service::register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func) {
+void messaging_service::register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id, rpc::optional<bool> failed)>&& func) {
    register_handler(this, messaging_verb::COMPLETE_MESSAGE, std::move(func));
 }
-future<> messaging_service::send_complete_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id) {
-    return send_message_timeout_and_retry<void>(this, messaging_verb::COMPLETE_MESSAGE, id,
-        streaming_timeout, streaming_nr_retry, streaming_wait_before_retry,
-        plan_id, dst_cpu_id);
+future<> messaging_service::send_complete_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id, bool failed) {
+    return send_message<void>(this, messaging_verb::COMPLETE_MESSAGE, id,
+        plan_id, dst_cpu_id, failed);
 }

 void messaging_service::register_gossip_echo(std::function<future<> ()>&& func) {
--- a/message/messaging_service.hh
+++ b/message/messaging_service.hh
@@ -249,8 +249,8 @@ public:
    void register_stream_mutation_done(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id)>&& func);
    future<> send_stream_mutation_done(msg_addr id, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id);

-    void register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func);
-    future<> send_complete_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id);
+    void register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id, rpc::optional<bool> failed)>&& func);
+    future<> send_complete_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id, bool failed = false);

    // Wrapper for REPAIR_CHECKSUM_RANGE verb
    void register_repair_checksum_range(std::function<future<partition_checksum> (sstring keyspace, sstring cf, dht::token_range range, rpc::optional<repair_checksum> hash_version)>&& func);
--- a/partition_version.cc
+++ b/partition_version.cc
@@ -545,13 +545,19 @@ lw_shared_ptr<partition_snapshot> partition_entry::read(schema_ptr entry_schema,
 std::vector<range_tombstone>
 partition_snapshot::range_tombstones(const schema& s, position_in_partition_view start, position_in_partition_view end)
 {
+    partition_version* v = &*version();
+    if (!v->next()) {
+        return boost::copy_range<std::vector<range_tombstone>>(
+            v->partition().row_tombstones().slice(s, start, end));
+    }
    range_tombstone_list list(s);
-    for (auto&& v : versions()) {
-        for (auto&& rt : v.partition().row_tombstones().slice(s, start, end)) {
+    while (v) {
+        for (auto&& rt : v->partition().row_tombstones().slice(s, start, end)) {
            list.apply(s, rt);
        }
+        v = v->next();
    }
-    return boost::copy_range<std::vector<range_tombstone>>(list);
+    return boost::copy_range<std::vector<range_tombstone>>(list.slice(s, start, end));
 }

 std::ostream& operator<<(std::ostream& out, partition_entry& e) {
--- a/range_tombstone_list.cc
+++ b/range_tombstone_list.cc
@@ -124,6 +124,7 @@ void range_tombstone_list::insert_from(const schema& s,
            if (less(end_bound, it->end_bound())) {
                end = it->end;
                end_kind = it->end_kind;
+                end_bound = bound_view(end, end_kind);
            }
            it = rev.erase(it);
        } else if (c > 0) {
--- a/2
+++ b/2
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -87,6 +87,7 @@ static const sstring COUNTERS_FEATURE = "COUNTERS";
 static const sstring INDEXES_FEATURE = "INDEXES";
 static const sstring CORRECT_COUNTER_ORDER_FEATURE = "CORRECT_COUNTER_ORDER";
 static const sstring SCHEMA_TABLES_V3 = "SCHEMA_TABLES_V3";
+static const sstring CORRECT_NON_COMPOUND_RANGE_TOMBSTONES = "CORRECT_NON_COMPOUND_RANGE_TOMBSTONES";

 distributed<storage_service> _the_storage_service;

@@ -129,7 +130,8 @@ sstring storage_service::get_config_supported_features() {
        LARGE_PARTITIONS_FEATURE,
        COUNTERS_FEATURE,
        CORRECT_COUNTER_ORDER_FEATURE,
-        SCHEMA_TABLES_V3
+        SCHEMA_TABLES_V3,
+        CORRECT_NON_COMPOUND_RANGE_TOMBSTONES,
    };
    if (service::get_local_storage_service()._db.local().get_config().experimental()) {
        features.push_back(MATERIALIZED_VIEWS_FEATURE);
@@ -339,6 +341,7 @@ void storage_service::register_features() {
    _counters_feature = gms::feature(COUNTERS_FEATURE);
    _correct_counter_order_feature = gms::feature(CORRECT_COUNTER_ORDER_FEATURE);
    _schema_tables_v3 = gms::feature(SCHEMA_TABLES_V3);
+    _correct_non_compound_range_tombstones = gms::feature(CORRECT_NON_COMPOUND_RANGE_TOMBSTONES);

    if (_db.local().get_config().experimental()) {
        _materialized_views_feature = gms::feature(MATERIALIZED_VIEWS_FEATURE);
@@ -926,7 +929,17 @@ void storage_service::handle_state_removing(inet_address endpoint, std::vector<s
                slogger.warn("{}", err);
                throw std::runtime_error(err);
            }
-            restore_replica_count(endpoint, ep.value()).get();
+            // Kick off streaming commands. No need to wait for
+            // restore_replica_count to complete, which can take a long time:
+            // when it completes, this node will send a notification to
+            // tell the removal_coordinator with IP address notify_endpoint
+            // that the restore process is finished on this node. This node
+            // will then be removed from _replicating_nodes on the
+            // removal_coordinator.
+            auto notify_endpoint = ep.value();
+            restore_replica_count(endpoint, notify_endpoint).handle_exception([endpoint, notify_endpoint] (auto ep) {
+                slogger.info("Failed to restore_replica_count for node {}, notify_endpoint={} : {}", endpoint, notify_endpoint, ep);
+            });
        }
    } else { // now that the gossiper has told us about this nonexistent member, notify the gossiper to remove it
        if (sstring(gms::versioned_value::REMOVED_TOKEN) == pieces[0]) {
@@ -978,6 +991,7 @@ void storage_service::on_change(inet_address endpoint, application_state state,
        boost::split(pieces, value.value, boost::is_any_of(sstring(versioned_value::DELIMITER_STR)));
        if (pieces.empty()) {
            slogger.warn("Fail to split status in on_change: endpoint={}, app_state={}, value={}", endpoint, state, value);
+            return;
        }
        sstring move_name = pieces[0];
        if (move_name == sstring(versioned_value::STATUS_BOOTSTRAPPING)) {
@@ -1026,8 +1040,8 @@ void storage_service::on_remove(gms::inet_address endpoint) {

 void storage_service::on_dead(gms::inet_address endpoint, gms::endpoint_state state) {
    slogger.debug("endpoint={} on_dead", endpoint);
-    netw::get_local_messaging_service().remove_rpc_client(netw::msg_addr{endpoint, 0});
    get_storage_service().invoke_on_all([endpoint] (auto&& ss) {
+        netw::get_local_messaging_service().remove_rpc_client(netw::msg_addr{endpoint, 0});
        for (auto&& subscriber : ss._lifecycle_subscribers) {
            try {
                subscriber->on_down(endpoint);
@@ -2345,15 +2359,12 @@ future<> storage_service::rebuild(sstring source_dc) {
        for (const auto& keyspace_name : ss._db.local().get_non_system_keyspaces()) {
            streamer->add_ranges(keyspace_name, ss.get_local_ranges(keyspace_name));
        }
-        return streamer->fetch_async().then_wrapped([streamer] (auto&& f) {
-            try {
-                auto state = f.get0();
-            } catch (...) {
-                // This is used exclusively through JMX, so log the full trace but only throw a simple RTE
-                slogger.error("Error while rebuilding node: {}", std::current_exception());
-                throw std::runtime_error(sprint("Error while rebuilding node: %s", std::current_exception()));
-            }
-            return make_ready_future<>();
+        return streamer->stream_async().then([streamer] {
+            slogger.info("Streaming for rebuild successful");
+        }).handle_exception([] (auto ep) {
+            // This is used exclusively through JMX, so log the failure and propagate the original exception
+            slogger.warn("Error while rebuilding node: {}", std::current_exception());
+            return make_exception_future<>(std::move(ep));
        });
    });
 }
@@ -2480,10 +2491,8 @@ void storage_service::unbootstrap() {
 }

 future<> storage_service::restore_replica_count(inet_address endpoint, inet_address notify_endpoint) {
-    std::unordered_multimap<sstring, std::unordered_map<inet_address, dht::token_range_vector>> ranges_to_fetch;
-
+    auto streamer = make_lw_shared<dht::range_streamer>(_db, get_token_metadata(), get_broadcast_address(), "Restore_replica_count");
    auto my_address = get_broadcast_address();
-
    auto non_system_keyspaces = _db.local().get_non_system_keyspaces();
    for (const auto& keyspace_name : non_system_keyspaces) {
        std::unordered_multimap<dht::token_range, inet_address> changed_ranges = get_changed_ranges_for_leaving(keyspace_name, endpoint);
@@ -2494,26 +2503,15 @@ future<> storage_service::restore_replica_count(inet_address endpoint, inet_addr
            }
        }
        std::unordered_multimap<inet_address, dht::token_range> source_ranges = get_new_source_ranges(keyspace_name, my_new_ranges);
-        std::unordered_map<inet_address, dht::token_range_vector> tmp;
+        std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint;
        for (auto& x : source_ranges) {
-            tmp[x.first].emplace_back(x.second);
+            ranges_per_endpoint[x.first].emplace_back(x.second);
        }
-        ranges_to_fetch.emplace(keyspace_name, std::move(tmp));
+        streamer->add_rx_ranges(keyspace_name, std::move(ranges_per_endpoint));
    }
-    auto sp = make_lw_shared<streaming::stream_plan>("Restore replica count");
-    for (auto& x: ranges_to_fetch) {
-        const sstring& keyspace_name = x.first;
-        std::unordered_map<inet_address, dht::token_range_vector>& maps = x.second;
-        for (auto& m : maps) {
-            auto source = m.first;
-            auto ranges = m.second;
-            slogger.debug("Requesting from {} ranges {}", source, ranges);
-            sp->request_ranges(source, keyspace_name, ranges);
-        }
-    }
-    return sp->execute().then_wrapped([this, sp, notify_endpoint] (auto&& f) {
+    return streamer->stream_async().then_wrapped([this, streamer, notify_endpoint] (auto&& f) {
        try {
-            auto state = f.get0();
+            f.get();
            return this->send_replication_notification(notify_endpoint);
        } catch (...) {
            slogger.warn("Streaming to restore replica count failed: {}", std::current_exception());
@@ -2605,8 +2603,7 @@ void storage_service::leave_ring() {

 future<>
 storage_service::stream_ranges(std::unordered_map<sstring, std::unordered_multimap<dht::token_range, inet_address>> ranges_to_stream_by_keyspace) {
-    // First, we build a list of ranges to stream to each host, per table
-    std::unordered_map<sstring, std::unordered_map<inet_address, dht::token_range_vector>> sessions_to_stream_by_keyspace;
+    auto streamer = make_lw_shared<dht::range_streamer>(_db, get_token_metadata(), get_broadcast_address(), "Unbootstrap");
    for (auto& entry : ranges_to_stream_by_keyspace) {
        const auto& keyspace = entry.first;
        auto& ranges_with_endpoints = entry.second;
@@ -2621,26 +2618,13 @@ storage_service::stream_ranges(std::unordered_map<sstring, std::unordered_multim
            inet_address endpoint = end_point_entry.second;
            ranges_per_endpoint[endpoint].emplace_back(r);
        }
-        sessions_to_stream_by_keyspace.emplace(keyspace, std::move(ranges_per_endpoint));
+        streamer->add_tx_ranges(keyspace, std::move(ranges_per_endpoint));
    }
-    auto sp = make_lw_shared<streaming::stream_plan>("Unbootstrap");
-    for (auto& entry : sessions_to_stream_by_keyspace) {
-        const auto& keyspace_name = entry.first;
-        // TODO: we can move to avoid copy of std::vector
-        auto& ranges_per_endpoint = entry.second;
-
-        for (auto& ranges_entry : ranges_per_endpoint) {
-            auto& ranges = ranges_entry.second;
-            auto new_endpoint = ranges_entry.first;
-            // TODO each call to transferRanges re-flushes, this is potentially a lot of waste
-            sp->transfer_ranges(new_endpoint, keyspace_name, ranges);
-        }
-    }
-    return sp->execute().discard_result().then([sp] {
+    return streamer->stream_async().then([streamer] {
        slogger.info("stream_ranges successful");
    }).handle_exception([] (auto ep) {
-        slogger.info("stream_ranges failed: {}", ep);
-        return make_exception_future(std::runtime_error("stream_ranges failed"));
+        slogger.warn("stream_ranges failed: {}", ep);
+        return make_exception_future<>(std::move(ep));
    });
 }

@@ -2674,16 +2658,18 @@ future<> storage_service::stream_hints() {
        // stream all hints -- range list will be a singleton of "the entire ring"
        dht::token_range_vector ranges = {dht::token_range::make_open_ended_both_sides()};
        slogger.debug("stream_hints: ranges={}", ranges);
+        std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint;
+        ranges_per_endpoint[hints_destination_host] = std::move(ranges);

-        auto sp = make_lw_shared<streaming::stream_plan>("Hints");
-        std::vector<sstring> column_families = { db::system_keyspace::HINTS };
+        auto streamer = make_lw_shared<dht::range_streamer>(_db, get_token_metadata(), get_broadcast_address(), "Hints");
        auto keyspace = db::system_keyspace::NAME;
-        sp->transfer_ranges(hints_destination_host, keyspace, ranges, column_families);
-        return sp->execute().discard_result().then([sp] {
+        std::vector<sstring> column_families = { db::system_keyspace::HINTS };
+        streamer->add_tx_ranges(keyspace, std::move(ranges_per_endpoint), column_families);
+        return streamer->stream_async().then([streamer] {
            slogger.info("stream_hints successful");
        }).handle_exception([] (auto ep) {
-            slogger.info("stream_hints failed: {}", ep);
-            return make_exception_future(std::runtime_error("stream_hints failed"));
+            slogger.warn("stream_hints failed: {}", ep);
+            return make_exception_future<>(std::move(ep));
        });
    }
 }
--- a/service/storage_service.hh
+++ b/service/storage_service.hh
@@ -266,6 +266,7 @@ private:
    gms::feature _indexes_feature;
    gms::feature _correct_counter_order_feature;
    gms::feature _schema_tables_v3;
+    gms::feature _correct_non_compound_range_tombstones;
 public:
    void enable_all_features() {
        _range_tombstones_feature.enable();
@@ -275,6 +276,7 @@ public:
        _indexes_feature.enable();
        _correct_counter_order_feature.enable();
        _schema_tables_v3.enable();
+        _correct_non_compound_range_tombstones.enable();
    }

    void finish_bootstrapping() {
@@ -2236,6 +2238,10 @@ public:
    const gms::feature& cluster_supports_schema_tables_v3() const {
        return _schema_tables_v3;
    }
+
+    bool cluster_supports_reading_correctly_serialized_range_tombstones() const {
+        return bool(_correct_non_compound_range_tombstones);
+    }
 };

 inline future<> init_storage_service(distributed<database>& db) {
--- a/sstables/index_reader.hh
+++ b/sstables/index_reader.hh
@@ -24,6 +24,7 @@
 #include "consumer.hh"
 #include "downsampling.hh"
 #include "sstables/shared_index_lists.hh"
+#include <seastar/util/bool_class.hh>

 namespace sstables {

@@ -47,12 +48,16 @@ public:
    }
 };

+// See #2993
+class trust_promoted_index_tag;
+using trust_promoted_index = bool_class<trust_promoted_index_tag>;
+
 // IndexConsumer is a concept that implements:
 //
 // bool should_continue();
 // void consume_entry(index_entry&& ie, uintt64_t offset);
 template <class IndexConsumer>
-class index_consume_entry_context: public data_consumer::continuous_data_consumer<index_consume_entry_context<IndexConsumer>> {
+class index_consume_entry_context : public data_consumer::continuous_data_consumer<index_consume_entry_context<IndexConsumer>> {
    using proceed = data_consumer::proceed;
    using continuous_data_consumer = data_consumer::continuous_data_consumer<index_consume_entry_context<IndexConsumer>>;
 private:
@@ -72,6 +77,8 @@ private:
    temporary_buffer<char> _key;
    temporary_buffer<char> _promoted;

+    trust_promoted_index _trust_pi;
+
 public:
    void verify_end_state() {
    }
@@ -117,6 +124,9 @@ public:
            }
        case state::CONSUME_ENTRY: {
            auto len = (_key.size() + _promoted.size() + 14);
+            if (_trust_pi == trust_promoted_index::no) {
+                _promoted = temporary_buffer<char>();
+            }
            _consumer.consume_entry(index_entry(std::move(_key), this->_u64, std::move(_promoted)), _entry_offset);
            _entry_offset += len;
            _state = state::START;
@@ -128,10 +138,10 @@ public:
        return proceed::yes;
    }

-    index_consume_entry_context(IndexConsumer& consumer,
+    index_consume_entry_context(IndexConsumer& consumer, trust_promoted_index trust_pi,
            input_stream<char>&& input, uint64_t start, uint64_t maxlen)
        : continuous_data_consumer(std::move(input), start, maxlen)
-        , _consumer(consumer), _entry_offset(start)
+        , _consumer(consumer), _entry_offset(start), _trust_pi(trust_pi)
    {}

    void reset(uint64_t offset) {
@@ -196,7 +206,9 @@ class index_reader {

        reader(shared_sstable sst, const io_priority_class& pc, uint64_t begin, uint64_t end, uint64_t quantity)
            : _consumer(quantity)
-            , _context(_consumer, create_file_input_stream(sst, pc, begin, end), begin, end - begin)
+            , _context(_consumer,
+                       trust_promoted_index(sst->has_correct_promoted_index_entries()),
+                       create_file_input_stream(sst, pc, begin, end), begin, end - begin)
        { }
    };

@@ -514,6 +526,7 @@ public:
        }
        if (_current_index_idx + 1 < _current_list->size()) {
            ++_current_index_idx;
+            _current_pi_idx = 0;
            _data_file_position = (*_current_list)[_current_index_idx].position();
            _element = indexable_element::partition;
            return make_ready_future<>();
--- a/sstables/partition.cc
+++ b/sstables/partition.cc
@@ -97,6 +97,9 @@ private:
    // _range_tombstones holds only tombstones which are relevant for current ranges.
    range_tombstone_stream _range_tombstones;
    bool _first_row_encountered = false;
+
+    // See #2986
+    bool _treat_non_compound_rt_as_compound;
 public:
    void set_streamed_mutation(sstable_streamed_mutation* sm) {
        _sm = sm;
@@ -306,18 +309,21 @@ public:
    mp_row_consumer(const schema_ptr schema,
                    const query::partition_slice& slice,
                    const io_priority_class& pc,
-                    streamed_mutation::forwarding fwd)
+                    streamed_mutation::forwarding fwd,
+                    const shared_sstable& sst)
            : _schema(schema)
            , _pc(pc)
            , _slice(slice)
            , _fwd(fwd)
            , _range_tombstones(*_schema)
+            , _treat_non_compound_rt_as_compound(!sst->has_correct_non_compound_range_tombstones())
    { }

    mp_row_consumer(const schema_ptr schema,
                    const io_priority_class& pc,
-                    streamed_mutation::forwarding fwd)
-            : mp_row_consumer(schema, query::full_slice, pc, fwd) { }
+                    streamed_mutation::forwarding fwd,
+                    const shared_sstable& sst)
+            : mp_row_consumer(schema, query::full_slice, pc, fwd, sst) { }

    virtual proceed consume_row_start(sstables::key_view key, sstables::deletion_time deltime) override {
        if (!_is_mutation_end) {
@@ -621,7 +627,8 @@ public:
            return proceed::yes;
        }

-        auto start = composite_view(column::fix_static_name(*_schema, start_col)).explode();
+        auto compound = _schema->is_compound() || _treat_non_compound_rt_as_compound;
+        auto start = composite_view(column::fix_static_name(*_schema, start_col), compound).explode();

        // Note how this is slightly different from the check in is_collection. Collection tombstones
        // do not have extra data.
@@ -630,9 +637,9 @@ public:
        // won't have a full clustering prefix (otherwise it isn't a range)
        if (start.size() <= _schema->clustering_key_size()) {
            auto start_ck = clustering_key_prefix::from_exploded_view(start);
-            auto start_kind = start_marker_to_bound_kind(start_col);
-            auto end = clustering_key_prefix::from_exploded_view(composite_view(column::fix_static_name(*_schema, end_col)).explode());
-            auto end_kind = end_marker_to_bound_kind(end_col);
+            auto start_kind = compound ? start_marker_to_bound_kind(start_col) : bound_kind::incl_start;
+            auto end = clustering_key_prefix::from_exploded_view(composite_view(column::fix_static_name(*_schema, end_col), compound).explode());
+            auto end_kind = compound ? end_marker_to_bound_kind(end_col) : bound_kind::incl_end;
            if (range_tombstone::is_single_clustering_row_tombstone(*_schema, start_ck, start_kind, end, end_kind)) {
                auto ret = flush_if_needed(std::move(start_ck));
                if (!_skip_in_progress) {
@@ -1050,7 +1057,7 @@ public:
         const io_priority_class &pc,
         streamed_mutation::forwarding fwd)
        : _get_data_source([this, sst = std::move(sst), s = std::move(schema), toread, last_end, &pc, fwd] {
-            auto consumer = mp_row_consumer(s, query::full_slice, pc, fwd);
+            auto consumer = mp_row_consumer(s, query::full_slice, pc, fwd, sst);
            auto ds = make_lw_shared<sstable_data_source>(std::move(s), std::move(sst), std::move(consumer), std::move(toread), last_end);
            return make_ready_future<lw_shared_ptr<sstable_data_source>>(std::move(ds));
        }) { }
@@ -1058,7 +1065,7 @@ public:
         const io_priority_class &pc,
         streamed_mutation::forwarding fwd)
        : _get_data_source([this, sst = std::move(sst), s = std::move(schema), &pc, fwd] {
-            auto consumer = mp_row_consumer(s, query::full_slice, pc, fwd);
+            auto consumer = mp_row_consumer(s, query::full_slice, pc, fwd, sst);
            auto ds = make_lw_shared<sstable_data_source>(std::move(s), std::move(sst), std::move(consumer));
            return make_ready_future<lw_shared_ptr<sstable_data_source>>(std::move(ds));
        }) { }
@@ -1076,7 +1083,7 @@ public:
            return f.then([this, lh_index = std::move(lh_index), rh_index = std::move(rh_index), sst = std::move(sst), s = std::move(s), &pc, &slice, fwd, fwd_mr] () mutable {
                sstable::disk_read_range drr{lh_index->data_file_position(),
                                             rh_index->data_file_position()};
-                auto consumer = mp_row_consumer(s, slice, pc, fwd);
+                auto consumer = mp_row_consumer(s, slice, pc, fwd, sst);
                auto ds = make_lw_shared<sstable_data_source>(std::move(s), std::move(sst), std::move(consumer), drr, (fwd_mr ? sst->data_size() : drr.end), std::move(lh_index), std::move(rh_index));
                ds->_index_in_current_partition = true;
                ds->_will_likely_slice = sstable_data_source::will_likely_slice(slice);
@@ -1271,7 +1278,7 @@ sstables::sstable::read_row(schema_ptr schema,
        auto rh_index = std::make_unique<index_reader>(*lh_index);
        auto f = advance_to_upper_bound(*rh_index, *_schema, slice, key);
        return f.then([this, &slice, &pc, fwd, lh_index = std::move(lh_index), rh_index = std::move(rh_index), s = std::move(s)] () mutable {
-            auto consumer = mp_row_consumer(s, slice, pc, fwd);
+            auto consumer = mp_row_consumer(s, slice, pc, fwd, shared_from_this());
            auto ds = make_lw_shared<sstable_data_source>(sstable_data_source::single_partition_tag(), std::move(s),
                shared_from_this(), std::move(consumer), std::move(lh_index), std::move(rh_index));
            ds->_will_likely_slice = sstable_data_source::will_likely_slice(slice);
--- a/sstables/shared_index_lists.hh
+++ b/sstables/shared_index_lists.hh
@@ -21,10 +21,9 @@

 #pragma once

-#include <unordered_map>
 #include <vector>
-#include <seastar/core/shared_future.hh>
 #include <seastar/core/future.hh>
+#include "utils/loading_shared_values.hh"

 namespace sstables {

@@ -36,50 +35,26 @@ using index_list = std::vector<index_entry>;
 class shared_index_lists {
 public:
    using key_type = uint64_t;
-    struct stats {
+    static thread_local struct stats {
        uint64_t hits = 0; // Number of times entry was found ready
        uint64_t misses = 0; // Number of times entry was not found
        uint64_t blocks = 0; // Number of times entry was not ready (>= misses)
-    };
-private:
-    class entry : public enable_lw_shared_from_this<entry> {
-    public:
-        key_type key;
-        index_list list;
-        shared_promise<> loaded;
-        shared_index_lists& parent;
+    } _shard_stats;

-        entry(shared_index_lists& parent, key_type key)
-            : key(key), parent(parent)
-        { }
-        ~entry() {
-            parent._lists.erase(key);
-        }
-        bool operator==(const entry& e) const { return key == e.key; }
-        bool operator!=(const entry& e) const { return key != e.key; }
+    struct stats_updater {  // Static counter hooks; passed as the last template argument of loading_shared_lists_type.
+        static void inc_hits() noexcept { ++_shard_stats.hits; }  // Each hook bumps the matching field of the thread-local _shard_stats.
+        static void inc_misses() noexcept { ++_shard_stats.misses; }
+        static void inc_blocks() noexcept { ++_shard_stats.blocks; }
+        static void inc_evictions() noexcept {}  // Intentionally a no-op: struct stats has no evictions counter.
    };
-    std::unordered_map<key_type, entry*> _lists;
-    static thread_local stats _shard_stats;
-public:
+
+    using loading_shared_lists_type = utils::loading_shared_values<key_type, index_list, std::hash<key_type>, std::equal_to<key_type>, stats_updater>;
    // Pointer to index_list
-    class list_ptr {
-        lw_shared_ptr<entry> _e;
-    public:
-        using element_type = index_list;
-        list_ptr() = default;
-        explicit list_ptr(lw_shared_ptr<entry> e) : _e(std::move(e)) {}
-        explicit operator bool() const { return static_cast<bool>(_e); }
-        index_list& operator*() { return _e->list; }
-        const index_list& operator*() const { return _e->list; }
-        index_list* operator->() { return &_e->list; }
-        const index_list* operator->() const { return &_e->list; }
+    using list_ptr = loading_shared_lists_type::entry_ptr;
+private:

-        index_list release() {
-            auto res = _e.owned() ? index_list(std::move(_e->list)) : index_list(_e->list);
-            _e = {};
-            return std::move(res);
-        }
-    };
+    loading_shared_lists_type _lists;
+public:

    shared_index_lists() = default;
    shared_index_lists(shared_index_lists&&) = delete;
@@ -93,41 +68,8 @@ public:
    //
    // The loader object does not survive deferring, so the caller must deal with its liveness.
    template<typename Loader>
-    future<list_ptr> get_or_load(key_type key, Loader&& loader) {
-        auto i = _lists.find(key);
-        lw_shared_ptr<entry> e;
-        auto f = [&] {
-            if (i != _lists.end()) {
-                e = i->second->shared_from_this();
-                return e->loaded.get_shared_future();
-            } else {
-                ++_shard_stats.misses;
-                e = make_lw_shared<entry>(*this, key);
-                auto f = e->loaded.get_shared_future();
-                auto res = _lists.emplace(key, e.get());
-                assert(res.second);
-                futurize_apply(loader, key).then_wrapped([e](future<index_list>&& f) mutable {
-                    if (f.failed()) {
-                        e->loaded.set_exception(f.get_exception());
-                    } else {
-                        e->list = f.get0();
-                        e->loaded.set_value();
-                    }
-                });
-                return f;
-            }
-        }();
-        if (!f.available()) {
-            ++_shard_stats.blocks;
-            return f.then([e]() mutable {
-                return list_ptr(std::move(e));
-            });
-        } else if (f.failed()) {
-            return make_exception_future<list_ptr>(std::move(f).get_exception());
-        } else {
-            ++_shard_stats.hits;
-            return make_ready_future<list_ptr>(list_ptr(std::move(e)));
-        }
+    future<list_ptr> get_or_load(const key_type& key, Loader&& loader) {
+        return _lists.get_or_load(key, std::forward<Loader>(loader));
    }

    static const stats& shard_stats() { return _shard_stats; }
--- a/sstables/sstables.cc
+++ b/sstables/sstables.cc
@@ -156,6 +156,12 @@ public:
    }
 };

+shared_sstable  // Factory: constructs an sstable via make_lw_shared<sstable> and returns the resulting pointer.
+make_sstable(schema_ptr schema, sstring dir, int64_t generation, sstable::version_types v, sstable::format_types f, gc_clock::time_point now,
+            io_error_handler_gen error_handler_gen, size_t buffer_size) {
+    return make_lw_shared<sstable>(std::move(schema), std::move(dir), generation, v, f, now, std::move(error_handler_gen), buffer_size);  // schema, dir and error_handler_gen are moved in; the other arguments are passed through unchanged
+}
+
 std::unordered_map<sstable::version_types, sstring, enum_hash<sstable::version_types>> sstable::_version_string = {
    { sstable::version_types::ka , "ka" },
    { sstable::version_types::la , "la" }
@@ -1279,6 +1285,110 @@ static composite::eoc bound_kind_to_end_marker(bound_kind end_kind) {
         : composite::eoc::end;
 }

+class bytes_writer_for_column_name {  // Column-name writer that accumulates the serialized name in an in-memory `bytes` buffer.
+    bytes _buf;  // Destination buffer; (re)allocated by prepare().
+    bytes::iterator _pos;  // Next write position inside _buf; only meaningful after prepare().
+public:
+    void prepare(size_t size) {  // Allocates an uninitialized buffer of `size` bytes and rewinds the write position.
+        _buf = bytes(bytes::initialized_later(), size);
+        _pos = _buf.begin();
+    }
+
+    template<typename... Args>
+    void write(Args&&... args) {  // Appends each argument, viewed as a bytes_view, at the current position; no bounds check is done here.
+        auto write_one = [this] (bytes_view data) {
+            _pos = std::copy(data.begin(), data.end(), _pos);
+        };
+        auto ignore = { (write_one(bytes_view(args)), 0)... };  // Braced-init-list expansion evaluates the pack left to right.
+        (void)ignore;  // Silences the unused-variable warning.
+    }
+
+    bytes&& release() && {  // Rvalue-only; returns a reference to _buf, so the writer must outlive any reference bound to the result.
+        return std::move(_buf);
+    }
+};
+
+class file_writer_for_column_name {  // Column-name writer that emits straight to a file_writer via sstables::write.
+    file_writer& _fw;  // Non-owning reference; the file_writer must outlive this adapter.
+public:
+    file_writer_for_column_name(file_writer& fw) : _fw(fw) { }
+
+    void prepare(uint16_t size) {  // Writes `size` itself to the file; callers invoke this before write(), so it precedes the name bytes.
+        sstables::write(_fw, size);
+    }
+
+    template<typename... Args>
+    void write(Args&&... args) {  // Forwards all pieces to sstables::write in the order given.
+        sstables::write(_fw, std::forward<Args>(args)...);
+    }
+};
+
+template<typename Writer>  // Writes the clustering key followed by the column-name composite through `out`: prepare(total size), then the bytes.
+static void write_compound_non_dense_column_name(Writer& out, const composite& clustering_key, const std::vector<bytes_view>& column_names, composite::eoc marker = composite::eoc::none) {
+    // Serialize the column-name components plus marker as a composite (the `true` flag presumably selects compound encoding - confirm).
+    auto c = composite::from_exploded(column_names, true, marker);
+    auto ck_bview = bytes_view(clustering_key);
+
+    // The marker is not a component, so if the last component is empty (IOW,
+    // it only serializes to the marker), then we just replace the key's last
+    // byte with the marker. If, however, the component is not empty, then the
+    // marker belongs at its end, and we just join them together as we do for
+    // any normal component.
+    if (c.size() == 1) {
+        ck_bview.remove_suffix(1);
+    }
+    size_t sz = ck_bview.size() + c.size();
+    if (sz > std::numeric_limits<uint16_t>::max()) {  // The size is handed to prepare() as a uint16_t, so larger names are rejected.
+        throw std::runtime_error(sprint("Column name too large (%d > %d)", sz, std::numeric_limits<uint16_t>::max()));
+    }
+    out.prepare(uint16_t(sz));
+    out.write(ck_bview, c);
+}
+
+static void write_compound_non_dense_column_name(file_writer& out, const composite& clustering_key, const std::vector<bytes_view>& column_names, composite::eoc marker = composite::eoc::none) {  // file_writer convenience overload
+    auto w = file_writer_for_column_name(out);  // Adapter exposing the prepare()/write() interface the template expects.
+    write_compound_non_dense_column_name(w, clustering_key, column_names, marker);
+}
+
+template<typename Writer>  // Writes an already-serialized column name: prepare(size) followed by the raw bytes.
+static void write_column_name(Writer& out, bytes_view column_names) {
+    size_t sz = column_names.size();
+    if (sz > std::numeric_limits<uint16_t>::max()) {  // The size is handed to prepare() as a uint16_t, so larger names are rejected.
+        throw std::runtime_error(sprint("Column name too large (%d > %d)", sz, std::numeric_limits<uint16_t>::max()));
+    }
+    out.prepare(uint16_t(sz));
+    out.write(column_names);
+}
+
+static void write_column_name(file_writer& out, bytes_view column_names) {  // file_writer convenience overload
+    auto w = file_writer_for_column_name(out);  // Adapter exposing the prepare()/write() interface the template expects.
+    write_column_name(w, column_names);
+}
+
+template<typename Writer>  // Chooses the column-name encoding from the schema's dense/compound properties.
+static void write_column_name(Writer& out, const schema& s, const composite& clustering_element, const std::vector<bytes_view>& column_names, composite::eoc marker = composite::eoc::none) {
+    if (s.is_dense()) {  // Dense: the name is the clustering element's bytes; column_names and marker are not used.
+        write_column_name(out, bytes_view(clustering_element));
+    } else if (s.is_compound()) {  // Compound, non-dense: clustering element followed by the column-name composite.
+        write_compound_non_dense_column_name(out, clustering_element, column_names, marker);
+    } else {  // Non-compound, non-dense: only the first column name is written, so column_names must be non-empty.
+        write_column_name(out, column_names[0]);
+    }
+}
+
+void sstable::write_range_tombstone_bound(file_writer& out,  // Writes a single range tombstone bound as a column name.
+        const schema& s,
+        const composite& clustering_element,
+        const std::vector<bytes_view>& column_names,
+        composite::eoc marker) {
+    if (!_correctly_serialize_non_compound_range_tombstones && !clustering_element.is_compound()) {  // Flag off and bound is non-compound: take the compound writer path instead.
+        auto vals = clustering_element.values();
+        write_compound_non_dense_column_name(out, composite::serialize_value(vals, true), column_names, marker);  // Re-serialize the values (the `true` flag presumably selects compound form - confirm), then write.
+    } else {
+        write_column_name(out, s, clustering_element, column_names, marker);  // Otherwise use the schema-driven encoding.
+    }
+}
+
 static void output_promoted_index_entry(bytes_ostream& promoted_index,
        const bytes& first_col,
        const bytes& last_col,
@@ -1297,29 +1407,6 @@ static void output_promoted_index_entry(bytes_ostream& promoted_index,
    promoted_index.write(q, 8);
 }

-// FIXME: use this in write_column_name() instead of repeating the code
-static bytes serialize_colname(const composite& clustering_key,
-        const std::vector<bytes_view>& column_names, composite::eoc marker) {
-    auto c = composite::from_exploded(column_names, marker);
-    auto ck_bview = bytes_view(clustering_key);
-    // The marker is not a component, so if the last component is empty (IOW,
-    // only serializes to the marker), then we just replace the key's last byte
-    // with the marker. If the component however it is not empty, then the
-    // marker should be in the end of it, and we just join them together as we
-    // do for any normal component
-    if (c.size() == 1) {
-        ck_bview.remove_suffix(1);
-    }
-    size_t sz = ck_bview.size() + c.size();
-    if (sz > std::numeric_limits<uint16_t>::max()) {
-        throw std::runtime_error(sprint("Column name too large (%d > %d)", sz, std::numeric_limits<uint16_t>::max()));
-    }
-    bytes colname(bytes::initialized_later(), sz);
-    std::copy(ck_bview.begin(), ck_bview.end(), colname.begin());
-    std::copy(c.get_bytes().begin(), c.get_bytes().end(), colname.begin() + ck_bview.size());
-    return colname;
-}
-
 // Call maybe_flush_pi_block() before writing the given sstable atom to the
 // output. This may start a new promoted-index block depending on how much
 // data we've already written since the start of the current block. Starting
@@ -1337,7 +1424,18 @@ void sstable::maybe_flush_pi_block(file_writer& out,
        const composite& clustering_key,
        const std::vector<bytes_view>& column_names,
        composite::eoc marker) {
-    bytes colname = serialize_colname(clustering_key, column_names, marker);
+    if (!_schema->clustering_key_size()) {
+        return;
+    }
+    bytes_writer_for_column_name w;
+    write_column_name(w, *_schema, clustering_key, column_names, marker);
+    maybe_flush_pi_block(out, clustering_key, std::move(w).release());
+}
+
+// Overload can only be called if the schema has clustering keys.
+void sstable::maybe_flush_pi_block(file_writer& out,
+        const composite& clustering_key,
+        bytes colname) {
    if (_pi_write.block_first_colname.empty()) {
        // This is the first column in the partition, or first column since we
        // closed a promoted-index block. Remember its name and position -
@@ -1362,17 +1460,15 @@ void sstable::maybe_flush_pi_block(file_writer& out,
        // block includes them), but we set block_next_start_offset after - so
        // even if we wrote a lot of open tombstones, we still get a full
        // block size of new data.
-        if (!clustering_key.empty()) {
-            auto& rts = _pi_write.tombstone_accumulator->range_tombstones_for_row(
-                    clustering_key_prefix::from_range(clustering_key.values()));
-            for (const auto& rt : rts) {
-                auto start = composite::from_clustering_element(*_pi_write.schemap, rt.start);
-                auto end = composite::from_clustering_element(*_pi_write.schemap, rt.end);
-                write_range_tombstone(out,
-                        start, bound_kind_to_start_marker(rt.start_kind),
-                        end, bound_kind_to_end_marker(rt.end_kind),
-                        {}, rt.tomb);
-            }
+        auto& rts = _pi_write.tombstone_accumulator->range_tombstones_for_row(
+                clustering_key_prefix::from_range(clustering_key.values()));
+        for (const auto& rt : rts) {
+            auto start = composite::from_clustering_element(*_pi_write.schemap, rt.start);
+            auto end = composite::from_clustering_element(*_pi_write.schemap, rt.end);
+            write_range_tombstone(out,
+                    start, bound_kind_to_start_marker(rt.start_kind),
+                    end, bound_kind_to_end_marker(rt.end_kind),
+                    {}, rt.tomb);
        }
        _pi_write.block_next_start_offset = out.offset() + _pi_write.desired_block_size;
        _pi_write.block_first_colname = colname;
@@ -1384,37 +1480,6 @@ void sstable::maybe_flush_pi_block(file_writer& out,
    }
 }

-void sstable::write_column_name(file_writer& out, const composite& clustering_key, const std::vector<bytes_view>& column_names, composite::eoc marker) {
-    // was defined in the schema, for example.
-    auto c = composite::from_exploded(column_names, marker);
-    auto ck_bview = bytes_view(clustering_key);
-
-    // The marker is not a component, so if the last component is empty (IOW,
-    // only serializes to the marker), then we just replace the key's last byte
-    // with the marker. If the component however it is not empty, then the
-    // marker should be in the end of it, and we just join them together as we
-    // do for any normal component
-    if (c.size() == 1) {
-        ck_bview.remove_suffix(1);
-    }
-    size_t sz = ck_bview.size() + c.size();
-    if (sz > std::numeric_limits<uint16_t>::max()) {
-        throw std::runtime_error(sprint("Column name too large (%d > %d)", sz, std::numeric_limits<uint16_t>::max()));
-    }
-    uint16_t sz16 = sz;
-    write(out, sz16, ck_bview, c);
-}
-
-void sstable::write_column_name(file_writer& out, bytes_view column_names) {
-    size_t sz = column_names.size();
-    if (sz > std::numeric_limits<uint16_t>::max()) {
-        throw std::runtime_error(sprint("Column name too large (%d > %d)", sz, std::numeric_limits<uint16_t>::max()));
-    }
-    uint16_t sz16 = sz;
-    write(out, sz16, column_names);
-}
-
-
 static inline void update_cell_stats(column_stats& c_stats, uint64_t timestamp) {
    c_stats.update_min_timestamp(timestamp);
    c_stats.update_max_timestamp(timestamp);
@@ -1496,13 +1561,12 @@ void sstable::write_cell(file_writer& out, atomic_cell_view cell, const column_d
    }
 }

-void sstable::write_row_marker(file_writer& out, const row_marker& marker, const composite& clustering_key) {
-    if (marker.is_missing()) {
+void sstable::maybe_write_row_marker(file_writer& out, const schema& schema, const row_marker& marker, const composite& clustering_key) {
+    if (!schema.is_compound() || schema.is_dense() || marker.is_missing()) {
        return;
    }
-
    // Write row mark cell to the beginning of clustered row.
-    write_column_name(out, clustering_key, { bytes_view() });
+    index_and_write_column_name(out, clustering_key, { bytes_view() });
    uint64_t timestamp = marker.timestamp();
    uint32_t value_length = 0;

@@ -1538,21 +1602,25 @@ void sstable::write_deletion_time(file_writer& out, const tombstone t) {
    write(out, deletion_time, timestamp);
 }

-void sstable::write_row_tombstone(file_writer& out, const composite& key, const row_tombstone t) {
+void sstable::index_tombstone(file_writer& out, const composite& key, range_tombstone&& rt, composite::eoc marker) {  // Runs the promoted-index check for `key`, then records `rt` in the accumulator.
+    maybe_flush_pi_block(out, key, {}, marker);  // Done before apply(), so `rt` is not yet in the accumulator during this call.
+    // Remember the range tombstone so when we need to open a new promoted
+    // index block, we can figure out which ranges are still open and need
+    // to be repeated in the data file. Note that apply() also drops ranges
+    // already closed by rt.start, so the accumulator doesn't grow without bound.
+    _pi_write.tombstone_accumulator->apply(std::move(rt));
+}
+
+void sstable::maybe_write_row_tombstone(file_writer& out, const composite& key, const clustering_row& clustered_row) {  // Writes the row's tombstone, if set, as a range tombstone with both bounds at `key`.
+    auto t = clustered_row.tomb();  // Nothing is written when the row has no tombstone (checked just below).
    if (!t) {
        return;
    }
-
-    auto write_tombstone = [&] (tombstone t, column_mask mask) {
-        write_column_name(out, key, {}, composite::eoc::start);
-        write(out, mask);
-        write_column_name(out, key, {}, composite::eoc::end);
-        write_deletion_time(out, t);
-    };
-
-    write_tombstone(t.regular(), column_mask::range_tombstone);
+    auto rt = range_tombstone(clustered_row.key(), bound_kind::incl_start, clustered_row.key(), bound_kind::incl_end, t.tomb());  // Inclusive range from the row key to itself.
+    index_tombstone(out, key, std::move(rt), composite::eoc::none);  // Promoted-index check plus accumulator update, ahead of the data write.
+    write_range_tombstone(out, key, composite::eoc::start, key, composite::eoc::end, {}, t.regular());  // Regular part; no mask argument is passed, so the parameter's default applies.
    if (t.is_shadowable()) {
-        write_tombstone(t.shadowable().tomb(), column_mask::shadowable);
+        write_range_tombstone(out, key, composite::eoc::start, key, composite::eoc::end, {}, t.shadowable().tomb(), column_mask::shadowable);  // Shadowable part is written as a second entry with its own mask.
    }
 }

@@ -1562,27 +1630,26 @@ void sstable::write_range_tombstone(file_writer& out,
        const composite& end,
        composite::eoc end_marker,
        std::vector<bytes_view> suffix,
-        const tombstone t) {
-    if (!t) {
-        return;
+        const tombstone t,
+        column_mask mask) {
+    if (!_schema->is_compound() && (start_marker == composite::eoc::end || end_marker == composite::eoc::start)) {
+        throw std::logic_error(sprint("Cannot represent marker type in range tombstone for non-compound schemas"));
    }
-
-    write_column_name(out, start, suffix, start_marker);
-    column_mask mask = column_mask::range_tombstone;
+    write_range_tombstone_bound(out, *_schema, start, suffix, start_marker);
    write(out, mask);
-    write_column_name(out, end, suffix, end_marker);
+    write_range_tombstone_bound(out, *_schema, end, suffix, end_marker);
    write_deletion_time(out, t);
 }

 void sstable::write_collection(file_writer& out, const composite& clustering_key, const column_definition& cdef, collection_mutation_view collection) {
-
    auto t = static_pointer_cast<const collection_type_impl>(cdef.type);
    auto mview = t->deserialize_mutation_form(collection);
    const bytes& column_name = cdef.name();
-    write_range_tombstone(out, clustering_key, clustering_key, { bytes_view(column_name) }, mview.tomb);
+    if (mview.tomb) {
+        write_range_tombstone(out, clustering_key, composite::eoc::start, clustering_key, composite::eoc::end, { column_name }, mview.tomb);
+    }
    for (auto& cp: mview.cells) {
-        maybe_flush_pi_block(out, clustering_key, { column_name, cp.first });
-        write_column_name(out, clustering_key, { column_name, cp.first });
+        index_and_write_column_name(out, clustering_key, { column_name, cp.first });
        write_cell(out, cp.second, cdef);
    }
 }
@@ -1592,24 +1659,8 @@ void sstable::write_collection(file_writer& out, const composite& clustering_key
 void sstable::write_clustered_row(file_writer& out, const schema& schema, const clustering_row& clustered_row) {
    auto clustering_key = composite::from_clustering_element(schema, clustered_row.key());

-    if (schema.is_compound() && !schema.is_dense()) {
-        maybe_flush_pi_block(out, clustering_key, { bytes_view() });
-        write_row_marker(out, clustered_row.marker(), clustering_key);
-    }
-    // Before writing cells, range tombstone must be written if the row has any (deletable_row::t).
-    if (clustered_row.tomb()) {
-        maybe_flush_pi_block(out, clustering_key, {});
-        write_row_tombstone(out, clustering_key, clustered_row.tomb());
-        // Because we currently may break a partition to promoted-index blocks
-        // in the middle of a clustered row, we also need to track the current
-        // row's tombstone - not just range tombstones - which may effect the
-        // beginning of a new block.
-        // TODO: consider starting a new block only between rows, so the
-        // following code can be dropped:
-        _pi_write.tombstone_accumulator->apply(range_tombstone(
-                clustered_row.key(), bound_kind::incl_start,
-                clustered_row.key(), bound_kind::incl_end, clustered_row.tomb().tomb()));
-    }
+    maybe_write_row_marker(out, schema, clustered_row.marker(), clustering_key);
+    maybe_write_row_tombstone(out, clustering_key, clustered_row);

    if (schema.clustering_key_size()) {
        column_name_helper::min_max_components(schema, _collector.min_column_names(), _collector.max_column_names(),
@@ -1627,30 +1678,14 @@ void sstable::write_clustered_row(file_writer& out, const schema& schema, const
        }
        assert(column_definition.is_regular());
        atomic_cell_view cell = c.as_atomic_cell();
-        const bytes& column_name = column_definition.name();
-
-        if (schema.is_compound()) {
-            if (schema.is_dense()) {
-                maybe_flush_pi_block(out, composite(), { bytes_view(clustering_key) });
-                write_column_name(out, bytes_view(clustering_key));
-            } else {
-                maybe_flush_pi_block(out, clustering_key, { bytes_view(column_name) });
-                write_column_name(out, clustering_key, { bytes_view(column_name) });
-            }
-        } else {
-            if (schema.is_dense()) {
-                maybe_flush_pi_block(out, composite(), { bytes_view(clustered_row.key().get_component(schema, 0)) });
-                write_column_name(out, bytes_view(clustered_row.key().get_component(schema, 0)));
-            } else {
-                maybe_flush_pi_block(out, composite(), { bytes_view(column_name) });
-                write_column_name(out, bytes_view(column_name));
-            }
-        }
+        std::vector<bytes_view> column_name = { column_definition.name() };
+        index_and_write_column_name(out, clustering_key, column_name);
        write_cell(out, cell, column_definition);
    });
 }

 void sstable::write_static_row(file_writer& out, const schema& schema, const row& static_row) {
+    assert(schema.is_compound());
    static_row.for_each_cell([&] (column_id id, const atomic_cell_or_collection& c) {
        auto&& column_definition = schema.static_column_at(id);
        if (!column_definition.is_atomic()) {
@@ -1660,20 +1695,28 @@ void sstable::write_static_row(file_writer& out, const schema& schema, const row
        }
        assert(column_definition.is_static());
        const auto& column_name = column_definition.name();
-        if (schema.is_compound()) {
-            auto sp = composite::static_prefix(schema);
-            maybe_flush_pi_block(out, sp, { bytes_view(column_name) });
-            write_column_name(out, sp, { bytes_view(column_name) });
-        } else {
-            assert(!schema.is_dense());
-            maybe_flush_pi_block(out, composite(), { bytes_view(column_name) });
-            write_column_name(out, bytes_view(column_name));
-        }
+        auto sp = composite::static_prefix(schema);
+        index_and_write_column_name(out, sp, { bytes_view(column_name) });
        atomic_cell_view cell = c.as_atomic_cell();
        write_cell(out, cell, column_definition);
    });
 }

+void sstable::index_and_write_column_name(file_writer& out,  // Serializes a column name once, runs the promoted-index check with it, then writes it to `out`.
+         const composite& clustering_element,
+         const std::vector<bytes_view>& column_names,
+         composite::eoc marker) {
+    if (_schema->clustering_key_size()) {
+        bytes_writer_for_column_name w;  // Serialize into memory first so the same bytes serve both the index and the data file.
+        write_column_name(w, *_schema, clustering_element, column_names, marker);
+        auto&& colname = std::move(w).release();  // Reference into w's buffer; valid while w is in scope.
+        maybe_flush_pi_block(out, clustering_element, colname);
+        write_column_name(out, colname);  // Emits the pre-serialized bytes via the bytes_view overload.
+    } else {  // No clustering key: skip the promoted-index step and write directly.
+        write_column_name(out, *_schema, clustering_element, column_names, marker);
+    }
+}
+
 static void write_index_header(file_writer& out, disk_string_view<uint16_t>& key, uint64_t pos) {
    write(out, key, pos);
 }
@@ -1855,6 +1898,7 @@ components_writer::components_writer(sstable& sst, const schema& s, file_writer&
 {
    _sst._components->filter = utils::i_filter::get_filter(estimated_partitions, _schema.bloom_filter_fp_chance());
    _sst._pi_write.desired_block_size = cfg.promoted_index_block_size.value_or(get_config().column_index_size_in_kb() * 1024);
+    _sst._correctly_serialize_non_compound_range_tombstones = cfg.correctly_serialize_non_compound_range_tombstones;

    prepare_summary(_sst._components->summary, estimated_partitions, _schema.min_index_interval());

@@ -1929,17 +1973,13 @@ stop_iteration components_writer::consume(clustering_row&& cr) {

 stop_iteration components_writer::consume(range_tombstone&& rt) {
    ensure_tombstone_is_written();
-    // Remember the range tombstone so when we need to open a new promoted
-    // index block, we can figure out which ranges are still open and need
-    // to be repeated in the data file. Note that apply() also drops ranges
-    // already closed by rt.start, so the accumulator doesn't grow boundless.
-    _sst._pi_write.tombstone_accumulator->apply(rt);
-    auto start = composite::from_clustering_element(_schema, std::move(rt.start));
+    auto start = composite::from_clustering_element(_schema, rt.start);  // rt.start is not moved from: rt is handed whole to index_tombstone() below.
    auto start_marker = bound_kind_to_start_marker(rt.start_kind);
-    auto end = composite::from_clustering_element(_schema, std::move(rt.end));
+    auto end = composite::from_clustering_element(_schema, rt.end);  // Same for rt.end.
    auto end_marker = bound_kind_to_end_marker(rt.end_kind);
-    _sst.maybe_flush_pi_block(_out, start, {}, start_marker);
-    _sst.write_range_tombstone(_out, std::move(start), start_marker, std::move(end), end_marker, {}, rt.tomb);
+    auto tomb = rt.tomb;  // Saved now because rt is moved away on the next line.
+    _sst.index_tombstone(_out, start, std::move(rt), start_marker);  // Promoted-index check and accumulator update happen before the data write.
+    _sst.write_range_tombstone(_out, std::move(start), start_marker, std::move(end), end_marker, {}, tomb);
    return stop_iteration::no;
 }

@@ -2018,12 +2058,13 @@ sstable::read_scylla_metadata(const io_priority_class& pc) {
 }

 void
-sstable::write_scylla_metadata(const io_priority_class& pc, shard_id shard) {
+sstable::write_scylla_metadata(const io_priority_class& pc, shard_id shard, sstable_enabled_features features) {
    auto&& first_key = get_first_decorated_key();
    auto&& last_key = get_last_decorated_key();
    auto sm = create_sharding_metadata(_schema, first_key, last_key, shard);
    _components->scylla_metadata.emplace();
    _components->scylla_metadata->data.set<scylla_metadata_type::Sharding>(std::move(sm));
+    _components->scylla_metadata->data.set<scylla_metadata_type::Features>(std::move(features));

    write_simple<component_type::Scylla>(*_components->scylla_metadata, pc);
 }
@@ -2075,6 +2116,7 @@ sstable_writer::sstable_writer(sstable& sst, const schema& s, uint64_t estimated
    , _backup(cfg.backup)
    , _leave_unsealed(cfg.leave_unsealed)
    , _shard(shard)
+    , _correctly_serialize_non_compound_range_tombstones(cfg.correctly_serialize_non_compound_range_tombstones)
 {
    _sst.generate_toc(_schema.get_compressor_params().get_compressor(), _schema.bloom_filter_fp_chance());
    _sst.write_toc(_pc);
@@ -2084,6 +2126,10 @@ sstable_writer::sstable_writer(sstable& sst, const schema& s, uint64_t estimated
    _components_writer.emplace(_sst, _schema, *_writer, estimated_partitions, cfg, _pc);
 }

+static sstable_enabled_features all_features() { // Returns a mask with bits [0, sstable_feature::End) set, i.e. every listed feature enabled.
+    return sstable_enabled_features{(uint64_t(1) << sstable_feature::End) - 1}; // Shift in 64 bits to match the uint64_t enabled_features field.
+}
+
 void sstable_writer::consume_end_of_stream()
 {
    _components_writer->consume_end_of_stream();
@@ -2093,7 +2139,11 @@ void sstable_writer::consume_end_of_stream()
    _sst.write_filter(_pc);
    _sst.write_statistics(_pc);
    _sst.write_compression(_pc);
-    _sst.write_scylla_metadata(_pc, _shard);
+    auto features = all_features();
+    if (!_correctly_serialize_non_compound_range_tombstones) {
+        features.disable(sstable_feature::NonCompoundRangeTombstones);
+    }
+    _sst.write_scylla_metadata(_pc, _shard, std::move(features));

    if (!_leave_unsealed) {
        _sst.seal_sstable(_backup).get();
@@ -2169,7 +2219,8 @@ future<> sstable::generate_summary(const io_priority_class& pc) {
                options.io_priority_class = pc;
                auto stream = make_file_input_stream(index_file, 0, size, std::move(options));
                return do_with(summary_generator(_components->summary), [this, &pc, stream = std::move(stream), size] (summary_generator& s) mutable {
-                    auto ctx = make_lw_shared<index_consume_entry_context<summary_generator>>(s, std::move(stream), 0, size);
+                    auto ctx = make_lw_shared<index_consume_entry_context<summary_generator>>(
+                            s, trust_promoted_index::yes, std::move(stream), 0, size);
                    return ctx->consume_input(*ctx).finally([ctx] {
                        return ctx->close();
                    }).then([this, ctx, &s] {
@@ -2872,5 +2923,8 @@ mutation_source sstable::as_mutation_source() {
    });
 }

+bool supports_correct_non_compound_range_tombstones() { // Forwards the local storage service's answer; used as the default for sstable_writer_config::correctly_serialize_non_compound_range_tombstones.
+    return service::get_local_storage_service().cluster_supports_reading_correctly_serialized_range_tombstones();
+}

 }
--- a/sstables/sstables.hh
+++ b/sstables/sstables.hh
@@ -130,6 +130,8 @@ struct sstable_open_info;

 class index_reader;

+bool supports_correct_non_compound_range_tombstones();
+
 struct sstable_writer_config {
    std::experimental::optional<size_t> promoted_index_block_size;
    uint64_t max_sstable_size = std::numeric_limits<uint64_t>::max();
@@ -137,6 +139,7 @@ struct sstable_writer_config {
    bool leave_unsealed = false;
    stdx::optional<db::replay_position> replay_position;
    seastar::thread_scheduling_group* thread_scheduling_group = nullptr;
+    bool correctly_serialize_non_compound_range_tombstones = supports_correct_non_compound_range_tombstones();
 };

 class sstable : public enable_lw_shared_from_this<sstable> {
@@ -479,6 +482,10 @@ private:
    lw_shared_ptr<file_input_stream_history> _single_partition_history = make_lw_shared<file_input_stream_history>();
    lw_shared_ptr<file_input_stream_history> _partition_range_history = make_lw_shared<file_input_stream_history>();

+    //FIXME: Set by sstable_writer to influence sstable writing behavior.
+    //       Remove when doing #3012
+    bool _correctly_serialize_non_compound_range_tombstones;
+
    // _pi_write is used temporarily for building the promoted
    // index (column sample) of one partition when writing a new sstable.
    struct {
@@ -501,6 +508,10 @@ private:
            const std::vector<bytes_view>& column_names,
            composite::eoc marker = composite::eoc::none);

+    void maybe_flush_pi_block(file_writer& out,
+            const composite& clustering_key,
+            bytes colname);
+
    schema_ptr _schema;
    sstring _dir;
    unsigned long _generation = 0;
@@ -534,7 +545,7 @@ private:
    void write_compression(const io_priority_class& pc);

    future<> read_scylla_metadata(const io_priority_class& pc);
-    void write_scylla_metadata(const io_priority_class& pc, shard_id shard = engine().cpu_id());
+    void write_scylla_metadata(const io_priority_class& pc, shard_id shard, sstable_enabled_features features);

    future<> read_filter(const io_priority_class& pc);

@@ -598,20 +609,23 @@ private:
    bool filter_has_key(const schema& s, const dht::decorated_key& dk) { return filter_has_key(key::from_partition_key(s, dk._key)); }

    // NOTE: functions used to generate sstable components.
-    void write_row_marker(file_writer& out, const row_marker& marker, const composite& clustering_key);
+    void maybe_write_row_marker(file_writer& out, const schema& schema, const row_marker& marker, const composite& clustering_key);
    void write_clustered_row(file_writer& out, const schema& schema, const clustering_row& clustered_row);
    void write_static_row(file_writer& out, const schema& schema, const row& static_row);
    void write_cell(file_writer& out, atomic_cell_view cell, const column_definition& cdef);
-    void write_column_name(file_writer& out, const composite& clustering_key, const std::vector<bytes_view>& column_names, composite::eoc marker = composite::eoc::none);
-    void write_column_name(file_writer& out, bytes_view column_names);
-    void write_range_tombstone(file_writer& out, const composite& start, composite::eoc start_marker, const composite& end, composite::eoc end_marker, std::vector<bytes_view> suffix, const tombstone t);
-    void write_range_tombstone(file_writer& out, const composite& start, const composite& end, std::vector<bytes_view> suffix, const tombstone t) {
-        write_range_tombstone(out, start, composite::eoc::start, end, composite::eoc::end, std::move(suffix), std::move(t));
-    }
+    void write_range_tombstone(file_writer& out, const composite& start, composite::eoc start_marker, const composite& end, composite::eoc end_marker,
+                               std::vector<bytes_view> suffix, const tombstone t, const column_mask = column_mask::range_tombstone);
+    void write_range_tombstone_bound(file_writer& out, const schema& s, const composite& clustering_element, const std::vector<bytes_view>& column_names, composite::eoc marker = composite::eoc::none);
+    void index_tombstone(file_writer& out, const composite& key, range_tombstone&& rt, composite::eoc marker);
    void write_collection(file_writer& out, const composite& clustering_key, const column_definition& cdef, collection_mutation_view collection);
-    void write_row_tombstone(file_writer& out, const composite& key, const row_tombstone t);
+    void maybe_write_row_tombstone(file_writer& out, const composite& key, const clustering_row& clustered_row);
    void write_deletion_time(file_writer& out, const tombstone t);

+    void index_and_write_column_name(file_writer& out,
+            const composite& clustering,
+            const std::vector<bytes_view>& column_names,
+            composite::eoc marker = composite::eoc::none);
+
    stdx::optional<std::pair<uint64_t, uint64_t>> get_sample_indexes_for_range(const dht::token_range& range);
 public:
    std::unique_ptr<index_reader> get_index_reader(const io_priority_class& pc);
@@ -622,6 +636,14 @@ public:
        return has_component(component_type::Scylla);
    }

+    bool has_correct_promoted_index_entries() const { // False only for a non-compound schema whose Scylla component lacks the NonCompoundPIEntries feature.
+        return _schema->is_compound() || !has_scylla_component() || _components->scylla_metadata->has_feature(sstable_feature::NonCompoundPIEntries);
+    }
+
+    bool has_correct_non_compound_range_tombstones() const { // False only for a non-compound schema whose Scylla component lacks the NonCompoundRangeTombstones feature.
+        return _schema->is_compound() || !has_scylla_component() || _components->scylla_metadata->has_feature(sstable_feature::NonCompoundRangeTombstones);
+    }
+
    bool filter_has_key(const key& key) {
        return _components->filter->is_present(bytes_view(key));
    }
@@ -724,6 +746,11 @@ public:
 using shared_sstable = lw_shared_ptr<sstable>;
 using sstable_list = std::unordered_set<shared_sstable>;

+shared_sstable make_sstable(schema_ptr schema, sstring dir, int64_t generation, sstable::version_types v, sstable::format_types f, gc_clock::time_point now = gc_clock::now(),
+            io_error_handler_gen error_handler_gen = default_io_error_handler_gen(), size_t buffer_size = 128*1024);
+
+
+
 struct entry_descriptor {
    sstring ks;
    sstring cf;
@@ -819,6 +846,7 @@ class sstable_writer {
    std::unique_ptr<file_writer> _writer;
    stdx::optional<components_writer> _components_writer;
    shard_id _shard; // Specifies which shard new sstable will belong to.
+    bool _correctly_serialize_non_compound_range_tombstones;
 private:
    void prepare_file_writer();
    void finish_file_writer();
@@ -828,7 +856,8 @@ public:
    ~sstable_writer();
    sstable_writer(sstable_writer&& o) : _sst(o._sst), _schema(o._schema), _pc(o._pc), _backup(o._backup),
            _leave_unsealed(o._leave_unsealed), _compression_enabled(o._compression_enabled), _writer(std::move(o._writer)),
-            _components_writer(std::move(o._components_writer)), _shard(o._shard) {}
+            _components_writer(std::move(o._components_writer)), _shard(o._shard),
+            _correctly_serialize_non_compound_range_tombstones(o._correctly_serialize_non_compound_range_tombstones) { }
    void consume_new_partition(const dht::decorated_key& dk) { return _components_writer->consume_new_partition(dk); }
    void consume(tombstone t) { _components_writer->consume(t); }
    stop_iteration consume(static_row&& sr) { return _components_writer->consume(std::move(sr)); }
--- a/sstables/types.hh
+++ b/sstables/types.hh
@@ -358,6 +358,28 @@ struct sharding_metadata {
    auto describe_type(Describer f) { return f(token_ranges); }
 };

+// Scylla-specific list of features an sstable supports.
+enum sstable_feature : uint8_t {
+    NonCompoundPIEntries = 0,       // See #2993
+    NonCompoundRangeTombstones = 1, // See #2986
+    End = 2                         // One past the last feature; used to build the "all features" mask.
+};
+
+// Scylla-specific features enabled for a particular sstable.
+struct sstable_enabled_features {
+    uint64_t enabled_features; // Bitmask: bit N is set when the sstable_feature with value N is enabled.
+
+    bool is_enabled(sstable_feature f) const { // True iff the bit for `f` is set.
+        return enabled_features & (uint64_t(1) << f); // 64-bit shift so the probe matches the width of the mask.
+    }
+
+    void disable(sstable_feature f) { // Clears only the bit for `f`; all other bits are preserved.
+        enabled_features &= ~(uint64_t(1) << f);
+    }
+
+    template <typename Describer>
+    auto describe_type(Describer f) { return f(enabled_features); } // Passes the raw mask to the describer `f`.
+};

 // Numbers are found on disk, so they do matter. Also, setting their sizes of
 // that of an uint32_t is a bit wasteful, but it simplifies the code a lot
@@ -369,16 +391,22 @@ enum class metadata_type : uint32_t {
    Stats = 2,
 };

-
 enum class scylla_metadata_type : uint32_t {
    Sharding = 1,
+    Features = 2,
 };

 struct scylla_metadata {
    disk_set_of_tagged_union<scylla_metadata_type,
-            disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::Sharding, sharding_metadata>
+            disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::Sharding, sharding_metadata>,
+            disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::Features, sstable_enabled_features>
            > data;

+    bool has_feature(sstable_feature f) const { // True iff `data` yields a Features entry and that entry has `f` enabled.
+        auto features = data.get<scylla_metadata_type::Features, sstable_enabled_features>(); // NOTE(review): presumably null when no Features entry is stored - confirm against disk_set_of_tagged_union::get.
+        return features && features->is_enabled(f);
+    }
+
    template <typename Describer>
    auto describe_type(Describer f) { return f(data); }
 };
--- a/streaming/stream_coordinator.cc
+++ b/streaming/stream_coordinator.cc
@@ -100,7 +100,7 @@ void stream_coordinator::connect_all_stream_sessions() {
    for (auto& x : _peer_sessions) {
        auto& session = x.second;
        session->start();
-        sslog.info("[Stream #{}] Beginning stream session with {}", session->plan_id(), session->peer);
+        sslog.debug("[Stream #{}] Beginning stream session with {}", session->plan_id(), session->peer);
    }
 }

--- a/streaming/stream_manager.cc
+++ b/streaming/stream_manager.cc
@@ -291,4 +291,15 @@ void stream_manager::on_restart(inet_address endpoint, endpoint_state ep_state)
    }
 }

+void stream_manager::on_dead(inet_address endpoint, endpoint_state ep_state) { // Listener override: fails the stream sessions held with `endpoint` via invoke_on_all.
+    if (has_peer(endpoint) && ep_state.is_shutdown()) { // Act only for a known streaming peer whose endpoint state reports shutdown.
+        sslog.info("stream_manager: Close all stream_session with peer = {} in on_dead", endpoint);
+        get_stream_manager().invoke_on_all([endpoint] (auto& sm) {
+            sm.fail_sessions(endpoint);
+        }).handle_exception([endpoint] (auto ep) {
+            sslog.warn("stream_manager: Fail to close sessions peer = {} in on_dead: {}", endpoint, ep); // Log the cause instead of dropping it.
+        });
+    }
+}
+
 } // namespace streaming
--- a/streaming/stream_manager.hh
+++ b/streaming/stream_manager.hh
@@ -156,7 +156,7 @@ public:
    virtual void before_change(inet_address endpoint, endpoint_state current_state, application_state new_state_key, const versioned_value& new_value) override {}
    virtual void on_change(inet_address endpoint, application_state state, const versioned_value& value) override {}
    virtual void on_alive(inet_address endpoint, endpoint_state state) override {}
-    virtual void on_dead(inet_address endpoint, endpoint_state state) override {}
+    virtual void on_dead(inet_address endpoint, endpoint_state state) override;
    virtual void on_remove(inet_address endpoint) override;
    virtual void on_restart(inet_address endpoint, endpoint_state ep_state) override;

--- a/streaming/stream_result_future.cc
+++ b/streaming/stream_result_future.cc
@@ -54,7 +54,7 @@ future<stream_state> stream_result_future::init_sending_side(UUID plan_id_, sstr
        sr->add_event_listener(listener);
    }

-    sslog.info("[Stream #{}] Executing streaming plan for {}", plan_id_,  description_);
+    sslog.info("[Stream #{}] Executing streaming plan for {} with peers={}, master", plan_id_,  description_, coordinator_->get_peers());

    // Initialize and start all sessions
    for (auto& session : coordinator_->get_all_stream_sessions()) {
@@ -74,7 +74,7 @@ shared_ptr<stream_result_future> stream_result_future::init_receiving_side(UUID
        sslog.warn(err.c_str());
        throw std::runtime_error(err);
    }
-    sslog.info("[Stream #{}] Creating new streaming plan for {}, with {}", plan_id, description, from);
+    sslog.info("[Stream #{}] Executing streaming plan for {} with peers={}, slave", plan_id, description, from);
    bool is_receiving = true;
    sr = make_shared<stream_result_future>(plan_id, description, is_receiving);
    sm.register_receiving(sr);
@@ -83,7 +83,7 @@ shared_ptr<stream_result_future> stream_result_future::init_receiving_side(UUID

 void stream_result_future::handle_session_prepared(shared_ptr<stream_session> session) {
    auto si = session->make_session_info();
-    sslog.info("[Stream #{}] Prepare completed with {}. Receiving {}, sending {}",
+    sslog.debug("[Stream #{}] Prepare completed with {}. Receiving {}, sending {}",
               session->plan_id(),
               session->peer,
               si.get_total_files_to_receive(),
@@ -94,7 +94,7 @@ void stream_result_future::handle_session_prepared(shared_ptr<stream_session> se
 }

 void stream_result_future::handle_session_complete(shared_ptr<stream_session> session) {
-    sslog.info("[Stream #{}] Session with {} is complete, state={}", session->plan_id(), session->peer, session->get_state());
+    sslog.debug("[Stream #{}] Session with {} is complete, state={}", session->plan_id(), session->peer, session->get_state());
    auto event = session_complete_event(session);
    fire_stream_event(std::move(event));
    auto si = session->make_session_info();
@@ -120,25 +120,25 @@ void stream_result_future::maybe_complete() {
            sm.show_streams();
        }
        auto duration = std::chrono::duration_cast<std::chrono::duration<float>>(lowres_clock::now() - _start_time).count();
-        sm.get_progress_on_all_shards(plan_id).then([plan_id, duration] (auto sbytes) {
-            auto tx_bw = sstring("+inf");
-            auto rx_bw = sstring("+inf");
+        auto stats = make_lw_shared<sstring>("");
+        sm.get_progress_on_all_shards(plan_id).then([plan_id, duration, stats] (auto sbytes) {
+            auto tx_bw = sstring("0");
+            auto rx_bw = sstring("0");
            if (std::fabs(duration) > FLT_EPSILON) {
-                tx_bw = sprint("%.3f", sbytes.bytes_sent / duration / (1024 * 1024));
-                rx_bw = sprint("%.3f", sbytes.bytes_received  / duration / (1024 * 1024));
+                tx_bw = sprint("%.2f", sbytes.bytes_sent / duration / 1024);
+                rx_bw = sprint("%.2f", sbytes.bytes_received  / duration / 1024);
            }
-            sslog.info("[Stream #{}] bytes_sent = {}, bytes_received = {}, tx_bandwidth = {} MiB/s, rx_bandwidth = {} MiB/s",
-                    plan_id, sbytes.bytes_sent, sbytes.bytes_received, tx_bw, rx_bw);
+            *stats = sprint("tx=%ld KiB, %s KiB/s, rx=%ld KiB, %s KiB/s", sbytes.bytes_sent / 1024, tx_bw, sbytes.bytes_received / 1024, rx_bw);
        }).handle_exception([plan_id] (auto ep) {
            sslog.warn("[Stream #{}] Fail to get progess on all shards: {}", plan_id, ep);
-        }).finally([this, plan_id, &sm] {
+        }).finally([this, plan_id, stats, &sm] () {
            sm.remove_stream(plan_id);
            auto final_state = get_current_state();
            if (final_state.has_failed_session()) {
-                sslog.warn("[Stream #{}] Stream failed for streaming plan {}, peers={}", plan_id, description, _coordinator->get_peers());
+                sslog.warn("[Stream #{}] Streaming plan for {} failed, peers={}, {}", plan_id, description, _coordinator->get_peers(), *stats);
                _done.set_exception(stream_exception(final_state, "Stream failed"));
            } else {
-                sslog.info("[Stream #{}] All sessions completed for streaming plan {}, peers={}", plan_id, description, _coordinator->get_peers());
+                sslog.info("[Stream #{}] Streaming plan for {} succeeded, peers={}, {}", plan_id, description, _coordinator->get_peers(), *stats);
                _done.set_value(final_state);
            }
        });
--- a/streaming/stream_session.cc
+++ b/streaming/stream_session.cc
@@ -176,11 +176,20 @@ void stream_session::init_messaging_service_handler() {
            });
        });
    });
-    ms().register_complete_message([] (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id) {
+    ms().register_complete_message([] (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id, rpc::optional<bool> failed) {
        const auto& from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
-        // Be compatible with old version. Do nothing but return a ready future.
-        sslog.debug("[Stream #{}] COMPLETE_MESSAGE from {} dst_cpu_id={}", plan_id, from, dst_cpu_id);
-        return make_ready_future<>();
+        if (failed && *failed) {
+            return smp::submit_to(dst_cpu_id, [plan_id, from, dst_cpu_id] () {
+                auto session = get_session(plan_id, from, "COMPLETE_MESSAGE");
+                sslog.debug("[Stream #{}] COMPLETE_MESSAGE with error flag from {} dst_cpu_id={}", plan_id, from, dst_cpu_id);
+                session->received_failed_complete_message();
+                return make_ready_future<>();
+            });
+        } else {
+            // Be compatible with old version. Do nothing but return a ready future.
+            sslog.debug("[Stream #{}] COMPLETE_MESSAGE from {} dst_cpu_id={}", plan_id, from, dst_cpu_id);
+            return make_ready_future<>();
+        }
    });
 }

@@ -227,7 +236,9 @@ future<> stream_session::on_initialization_complete() {
            for (auto& summary : msg.summaries) {
                this->prepare_receiving(summary);
            }
-            _stream_result->handle_session_prepared(this->shared_from_this());
+            if (_stream_result) {
+                _stream_result->handle_session_prepared(this->shared_from_this());
+            }
        } catch (...) {
            sslog.warn("[Stream #{}] Fail to send PREPARE_MESSAGE to {}, {}", this->plan_id(), id, std::current_exception());
            throw;
@@ -248,9 +259,19 @@ future<> stream_session::on_initialization_complete() {
    });
 }

+void stream_session::received_failed_complete_message() { // Called by the COMPLETE_MESSAGE handler when the peer sets the failed flag.
+    sslog.info("[Stream #{}] Received failed complete message, peer={}", plan_id(), peer);
+    _received_failed_complete_message = true; // Read by send_failed_complete_message() so the failure is not sent back to the peer.
+    close_session(stream_session_state::FAILED);
+}
+
+void stream_session::abort() { // Logs the abort (including whether the session was initialized) and closes the session as FAILED.
+    sslog.info("[Stream #{}] Aborted stream session={}, peer={}, is_initialized={}", plan_id(), this, peer, is_initialized());
+    close_session(stream_session_state::FAILED);
+}
+
 void stream_session::on_error() {
-    sslog.warn("[Stream #{}] Streaming error occurred", plan_id());
-    // fail session
+    sslog.warn("[Stream #{}] Streaming error occurred, peer={}", plan_id(), peer);
    close_session(stream_session_state::FAILED);
 }

@@ -300,7 +321,9 @@ future<prepare_message> stream_session::prepare(std::vector<stream_request> requ
        }
    }
    prepare.dst_cpu_id = engine().cpu_id();;
-    _stream_result->handle_session_prepared(shared_from_this());
+    if (_stream_result) {
+        _stream_result->handle_session_prepared(shared_from_this());
+    }
    return make_ready_future<prepare_message>(std::move(prepare));
 }

@@ -309,10 +332,6 @@ void stream_session::follower_start_sent() {
    this->start_streaming_files();
 }

-void stream_session::session_failed() {
-    close_session(stream_session_state::FAILED);
-}
-
 session_info stream_session::make_session_info() {
    std::vector<stream_summary> receiving_summaries;
    for (auto& receiver : _receivers) {
@@ -339,28 +358,41 @@ void stream_session::transfer_task_completed(UUID cf_id) {
    maybe_completed();
 }

-void stream_session::send_complete_message() {
+void stream_session::transfer_task_completed_all() { // Run by start_streaming_files() after all transfer tasks have executed successfully.
+    _transfers.clear(); // Empty the transfer set so maybe_completed() sees no pending transfers.
+    sslog.debug("[Stream #{}] transfer task_completed: all done, stream_receive_task.size={} stream_transfer_task.size={}",
+        plan_id(), _receivers.size(), _transfers.size());
+    maybe_completed();
+}
+
+void stream_session::send_failed_complete_message() {
+    if (!is_initialized()) {
+        return;
+    }
+    auto plan_id = this->plan_id();
+    if (_received_failed_complete_message) {
+        sslog.debug("[Stream #{}] Skip sending failed message back to peer", plan_id);
+        return;
+    }
    if (!_complete_sent) {
        _complete_sent = true;
    } else {
        return;
    }
    auto id = msg_addr{this->peer, this->dst_cpu_id};
-    auto plan_id = this->plan_id();
    sslog.debug("[Stream #{}] SEND COMPLETE_MESSAGE to {}", plan_id, id);
    auto session = shared_from_this();
-    this->ms().send_complete_message(id, plan_id, this->dst_cpu_id).then([session, id, plan_id] {
+    bool failed = true;
+    this->ms().send_complete_message(id, plan_id, this->dst_cpu_id, failed).then([session, id, plan_id] {
        sslog.debug("[Stream #{}] GOT COMPLETE_MESSAGE Reply from {}", plan_id, id.addr);
    }).handle_exception([session, id, plan_id] (auto ep) {
-        sslog.warn("[Stream #{}] COMPLETE_MESSAGE for {} has failed: {}", plan_id, id.addr, ep);
-        session->on_error();
+        sslog.debug("[Stream #{}] COMPLETE_MESSAGE for {} has failed: {}", plan_id, id.addr, ep);
    });
 }

 bool stream_session::maybe_completed() {
    bool completed = _receivers.empty() && _transfers.empty();
    if (completed) {
-        send_complete_message();
        sslog.debug("[Stream #{}] maybe_completed: {} -> COMPLETE: session={}, peer={}", plan_id(), _state, this, peer);
        close_session(stream_session_state::COMPLETE);
    }
@@ -379,11 +411,15 @@ void stream_session::start_streaming_files() {
    if (!_transfers.empty()) {
        set_state(stream_session_state::STREAMING);
    }
-    for (auto it = _transfers.begin(); it != _transfers.end();) {
-        stream_transfer_task& task = it->second;
-        it++;
-        task.start();
-    }
+    do_for_each(_transfers.begin(), _transfers.end(), [this] (auto& item) {
+        sslog.debug("[Stream #{}] Start to send cf_id={}", this->plan_id(), item.first);
+        return item.second.execute();
+    }).then([this] {
+        this->transfer_task_completed_all();
+    }).handle_exception([this] (auto ep) {
+        sslog.warn("[Stream #{}] Failed to send: {}", this->plan_id(), ep);
+        this->on_error();
+    });
 }

 std::vector<column_family*> stream_session::get_column_family_stores(const sstring& keyspace, const std::vector<sstring>& column_families) {
@@ -460,12 +496,15 @@ void stream_session::close_session(stream_session_state final_state) {
                receiving_failed(x.first);
                task.abort();
            }
+            send_failed_complete_message();
        }

        // Note that we shouldn't block on this close because this method is called on the handler
        // incoming thread (so we would deadlock).
        //handler.close();
-        _stream_result->handle_session_complete(shared_from_this());
+        if (_stream_result) {
+            _stream_result->handle_session_complete(shared_from_this());
+        }

        sslog.debug("[Stream #{}] close_session session={}, state={}, cancel keep_alive timer", plan_id(), this, final_state);
        _keep_alive.cancel();
@@ -480,15 +519,19 @@ void stream_session::start() {
    }
    auto connecting = netw::get_local_messaging_service().get_preferred_ip(peer);
    if (peer == connecting) {
-        sslog.info("[Stream #{}] Starting streaming to {}", plan_id(), peer);
+        sslog.debug("[Stream #{}] Starting streaming to {}", plan_id(), peer);
    } else {
-        sslog.info("[Stream #{}] Starting streaming to {} through {}", plan_id(), peer, connecting);
+        sslog.debug("[Stream #{}] Starting streaming to {} through {}", plan_id(), peer, connecting);
    }
    on_initialization_complete().handle_exception([this] (auto ep) {
        this->on_error();
    });
 }

+bool stream_session::is_initialized() const { // True when _stream_result is non-null; init() is what assigns it.
+    return bool(_stream_result);
+}
+
 void stream_session::init(shared_ptr<stream_result_future> stream_result_) {
    _stream_result = stream_result_;
    _keep_alive.set_callback([this] {
--- a/streaming/stream_session.hh
+++ b/streaming/stream_session.hh
@@ -151,7 +151,7 @@ public:
     * Each {@code StreamSession} is identified by this InetAddress which is broadcast address of the node streaming.
     */
    inet_address peer;
-    unsigned dst_cpu_id;
+    unsigned dst_cpu_id = 0;
 private:
    // should not be null when session is started
    shared_ptr<stream_result_future> _stream_result;
@@ -174,11 +174,12 @@ private:

    stream_session_state _state = stream_session_state::INITIALIZED;
    bool _complete_sent = false;
+    bool _received_failed_complete_message = false;

-    // If the session is idle for 300 minutes, close the session
-    std::chrono::seconds _keep_alive_timeout{60 * 300};
-    // Check every 10 minutes
-    std::chrono::seconds _keep_alive_interval{60 * 10};
+    // If the session is idle for 10 minutes, close the session
+    std::chrono::seconds _keep_alive_timeout{60 * 10};
+    // Check every minute
+    std::chrono::seconds _keep_alive_interval{60};
    timer<lowres_clock> _keep_alive;
    stream_bytes _last_stream_bytes;
    lowres_clock::time_point _last_stream_progress;
@@ -231,6 +232,8 @@ public:

    void start();

+    bool is_initialized() const;
+
    /**
     * Request data fetch task to this session.
     *
@@ -299,6 +302,10 @@ public:
     */
    void on_error();

+    void abort();
+
+    void received_failed_complete_message();
+
    /**
     * Prepare this session for sending/receiving files.
     */
@@ -311,11 +318,6 @@ public:
     */
    void complete();

-    /**
-     * Call back on receiving {@code StreamMessage.Type.SESSION_FAILED} message.
-     */
-    void session_failed();
-
    /**
     * @return Current snapshot of this session info.
     */
@@ -333,8 +335,9 @@ public:

    void receive_task_completed(UUID cf_id);
    void transfer_task_completed(UUID cf_id);
+    void transfer_task_completed_all();
 private:
-    void send_complete_message();
+    void send_failed_complete_message();
    bool maybe_completed();
    void prepare_receiving(stream_summary& summary);
    void start_streaming_files();
--- a/streaming/stream_transfer_task.cc
+++ b/streaming/stream_transfer_task.cc
@@ -134,7 +134,7 @@ future<> send_mutations(lw_shared_ptr<send_info> si) {
    });
 }

-void stream_transfer_task::start() {
+future<> stream_transfer_task::execute() {
    auto plan_id = session->plan_id();
    auto cf_id = this->cf_id;
    auto dst_cpu_id = session->dst_cpu_id;
@@ -143,7 +143,7 @@ void stream_transfer_task::start() {
    sslog.debug("[Stream #{}] stream_transfer_task: cf_id={}", plan_id, cf_id);
    sort_and_merge_ranges();
    _shard_ranges = dht::split_ranges_to_shards(_ranges, *schema);
-    parallel_for_each(_shard_ranges, [this, dst_cpu_id, plan_id, cf_id, id] (auto& item) {
+    return parallel_for_each(_shard_ranges, [this, dst_cpu_id, plan_id, cf_id, id] (auto& item) {
        auto& shard = item.first;
        auto& prs = item.second;
        return session->get_db().invoke_on(shard, [plan_id, cf_id, id, dst_cpu_id, prs = std::move(prs)] (database& db) mutable {
@@ -160,10 +160,9 @@ void stream_transfer_task::start() {
    }).then([this, id, plan_id, cf_id] {
        sslog.debug("[Stream #{}] GOT STREAM_MUTATION_DONE Reply from {}", plan_id, id.addr);
        session->start_keep_alive_timer();
-        session->transfer_task_completed(cf_id);
    }).handle_exception([this, plan_id, id] (auto ep){
        sslog.warn("[Stream #{}] stream_transfer_task: Fail to send to {}: {}", plan_id, id, ep);
-        this->session->on_error();
+        std::rethrow_exception(ep);
    });
 }

--- a/streaming/stream_transfer_task.hh
+++ b/streaming/stream_transfer_task.hh
@@ -78,7 +78,7 @@ public:
        return _total_size;
    }

-    void start();
+    future<> execute();

    void append_ranges(const dht::token_range_vector& ranges);
    void sort_and_merge_ranges();
--- a/test.py
+++ b/test.py
@@ -81,7 +81,7 @@ boost_tests = [
    'virtual_reader_test',
    'counter_test',
    'cell_locker_test',
-    'clustering_ranges_walker_test',
+    'view_schema_test',
 ]

 other_tests = [
@@ -128,7 +128,6 @@ if __name__ == "__main__":
                        help='Verbose reporting')
    args = parser.parse_args()

-    black_hole = open('/dev/null', 'w')
    print_status = print_status_verbose if args.verbose else print_status_short

    test_to_run = []
@@ -136,9 +135,9 @@ if __name__ == "__main__":
    for mode in modes_to_run:
        prefix = os.path.join('build', mode, 'tests')
        for test in other_tests:
-            test_to_run.append((os.path.join(prefix, test), 'other'))
+            test_to_run.append((os.path.join(prefix, test), 'other', '-c2 -m4G'.split()))
        for test in boost_tests:
-            test_to_run.append((os.path.join(prefix, test), 'boost'))
+            test_to_run.append((os.path.join(prefix, test), 'boost', '-c2 -m4G'.split()))

    if 'release' in modes_to_run:
        test_to_run.append(('build/release/tests/lsa_async_eviction_test', 'other',
@@ -152,11 +151,9 @@ if __name__ == "__main__":
        test_to_run.append(('build/release/tests/row_cache_alloc_stress', 'other',
                            '-c1 -m1G'.split()))
        test_to_run.append(('build/release/tests/sstable_test', 'boost', ['-c1']))
-        test_to_run.append(('build/release/tests/view_schema_test', 'boost', ['-c1']))
        test_to_run.append(('build/release/tests/row_cache_stress_test', 'other', '-c1 -m1G --seconds 10'.split()))
    if 'debug' in modes_to_run:
        test_to_run.append(('build/debug/tests/sstable_test', 'boost', ['-c1']))
-        test_to_run.append(('build/debug/tests/view_schema_test', 'boost', ['-c1']))

    if args.name:
        test_to_run = [t for t in test_to_run if args.name in t[0]]
@@ -168,6 +165,7 @@ if __name__ == "__main__":
    # disable false positive due to new (with_alignment(...)) ...
    env['ASAN_OPTIONS'] = 'alloc_dealloc_mismatch=0'
    env['UBSAN_OPTIONS'] = 'print_stacktrace=1'
+    env['BOOST_TEST_CATCH_SYSTEM_ERRORS'] = 'no'
    for n, test in enumerate(test_to_run):
        path = test[0]
        exec_args = test[2] if len(test) >= 3 else []
--- a/tests/compound_test.cc
+++ b/tests/compound_test.cc
@@ -291,7 +291,7 @@ BOOST_AUTO_TEST_CASE(test_composite_serialize_value) {

 BOOST_AUTO_TEST_CASE(test_composite_from_exploded) {
    using components = std::vector<composite::component>;
-    BOOST_REQUIRE_EQUAL(composite::from_exploded({bytes_view(bytes({'e', 'l', '1'}))}, composite::eoc::start).components(),
+    BOOST_REQUIRE_EQUAL(composite::from_exploded({bytes_view(bytes({'e', 'l', '1'}))}, true, composite::eoc::start).components(),
                        components({std::make_pair(bytes("el1"), composite::eoc::start)}));
 }

--- a/tests/cql_query_test.cc
+++ b/tests/cql_query_test.cc
@@ -753,6 +753,40 @@ SEASTAR_TEST_CASE(test_range_deletion_scenarios) {
    });
 }

+SEASTAR_TEST_CASE(test_range_deletion_scenarios_with_compact_storage) {
+    // Every range deletion issued against the compact storage table below
+    // is expected to throw.
+    return do_with_cql_env_thread([] (auto& env) {
+        env.execute_cql("create table cf (p int, c int, v text, primary key (p, c)) with compact storage;").get();
+
+        // Fill partition p = 1 with rows c = 0..9.
+        int row = 0;
+        while (row < 10) {
+            env.execute_cql(sprint("insert into cf (p, c, v) values (1, %d, 'abc');", row)).get();
+            ++row;
+        }
+
+        // Clustering restrictions with one or two bounds, inclusive and exclusive.
+        const char* const range_deletes[] = {
+            "delete from cf where p = 1 and c <= 3",
+            "delete from cf where p = 1 and c >= 0",
+            "delete from cf where p = 1 and c > 0 and c <= 3",
+            "delete from cf where p = 1 and c >= 0 and c < 3",
+            "delete from cf where p = 1 and c > 0 and c < 3",
+            "delete from cf where p = 1 and c >= 0 and c <= 3",
+        };
+
+        for (const char* statement : range_deletes) {
+            try {
+                env.execute_cql(statement).get();
+                BOOST_FAIL("should've thrown");
+            } catch (...) {
+                // an exception is the expected outcome
+            }
+        }
+    });
+}
+
 SEASTAR_TEST_CASE(test_map_insert_update) {
    return do_with_cql_env([] (auto& e) {
        auto make_my_map_type = [] { return map_type_impl::get_instance(int32_type, int32_type, true); };
--- a/tests/cql_test_env.cc
+++ b/tests/cql_test_env.cc
@@ -120,7 +120,7 @@ public:
        });
    }

-    virtual future<bytes> prepare(sstring query) override {
+    virtual future<cql3::prepared_cache_key_type> prepare(sstring query) override {
        return qp().invoke_on_all([query, this] (auto& local_qp) {
            auto qs = this->make_query_state();
            return local_qp.prepare(query, *qs).finally([qs] {}).discard_result();
@@ -130,7 +130,7 @@ public:
    }

    virtual future<::shared_ptr<cql_transport::messages::result_message>> execute_prepared(
-        bytes id,
+        cql3::prepared_cache_key_type id,
        std::vector<cql3::raw_value> values) override
    {
        auto prepared = local_qp().get_prepared(id);
--- a/tests/cql_test_env.hh
+++ b/tests/cql_test_env.hh
@@ -32,6 +32,7 @@
 #include "transport/messages/result_message_base.hh"
 #include "cql3/query_options_fwd.hh"
 #include "cql3/values.hh"
+#include "cql3/prepared_statements_cache.hh"
 #include "bytes.hh"
 #include "schema.hh"

@@ -43,7 +44,7 @@ namespace cql3 {

 class not_prepared_exception : public std::runtime_error {
 public:
-    not_prepared_exception(const bytes& id) : std::runtime_error(sprint("Not prepared: %s", id)) {}
+    not_prepared_exception(const cql3::prepared_cache_key_type& id) : std::runtime_error(sprint("Not prepared: %s", id)) {}
 };

 namespace db {
@@ -59,10 +60,10 @@ public:
    virtual future<::shared_ptr<cql_transport::messages::result_message>> execute_cql(
        const sstring& text, std::unique_ptr<cql3::query_options> qo) = 0;

-    virtual future<bytes> prepare(sstring query) = 0;
+    virtual future<cql3::prepared_cache_key_type> prepare(sstring query) = 0;

    virtual future<::shared_ptr<cql_transport::messages::result_message>> execute_prepared(
-        bytes id, std::vector<cql3::raw_value> values) = 0;
+        cql3::prepared_cache_key_type id, std::vector<cql3::raw_value> values) = 0;

    virtual future<> create_table(std::function<schema(const sstring&)> schema_maker) = 0;

--- a/tests/loading_cache_test.cc
+++ b/tests/loading_cache_test.cc
@@ -0,0 +1,321 @@
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <boost/test/unit_test.hpp>
+#include "utils/loading_shared_values.hh"
+#include "utils/loading_cache.hh"
+#include <seastar/core/file.hh>
+#include <seastar/core/thread.hh>
+#include <seastar/core/sstring.hh>
+#include <seastar/core/reactor.hh>
+#include <seastar/core/sleep.hh>
+
+
+#include "seastarx.hh"
+
+#include "tests/test-utils.hh"
+#include "tmpdir.hh"
+#include "log.hh"
+
+#include <vector>
+#include <numeric>
+#include <random>
+
+/// Get a random integer in the [0, max) range.
+/// \param max exclusive upper bound of the random value range
+/// \return The uniformly distributed random integer from the [0, \p max) range.
+static int rand_int(int max) {
+    std::random_device rd;     // seeds the engine below; a new device and engine are created on every call
+    std::mt19937 rng(rd());    // random-number engine used (Mersenne-Twister in this case)
+    std::uniform_int_distribution<int> uni(0, max - 1); // closed range [0, max - 1], i.e. [0, max)
+    return uni(rng);
+}
+
+
+#include "disk-error-handler.hh"
+
+thread_local disk_error_signal_type general_disk_error; // NOTE(review): presumably the definition expected by disk-error-handler.hh — confirm
+thread_local disk_error_signal_type commit_error; // NOTE(review): same as above — confirm
+
+static const sstring test_file_name = "loading_cache_test.txt"; // file that prepare() creates inside the temporary directory
+static const sstring test_string = "1"; // contents written by prepare() and expected back by loader()
+static bool file_prepared = false; // set by prepare() once the file has been written
+static constexpr int num_loaders = 1000; // number of keys / requests the tests below work with
+
+static logging::logger test_logger("loading_cache_test"); // logger handed to the loading_cache instances
+
+static thread_local int load_count; // incremented by loader() on each completed load; tests reset it to 0
+static const tmpdir& get_tmpdir() {
+    static thread_local tmpdir per_thread_dir; // constructed by each thread on its first call
+    return per_thread_dir;
+}
+
+static future<> prepare() {
+    if (file_prepared) { // the test file has already been written
+        return make_ready_future<>();
+    }
+    const auto file_path = boost::filesystem::path(get_tmpdir().path) / test_file_name.c_str(); // target inside the temporary directory
+    return open_file_dma(file_path.c_str(), open_flags::create | open_flags::wo).then([] (file opened) {
+        return do_with(std::move(opened), [] (file& out) {
+            return out.dma_write(0, test_string.c_str(), test_string.size() + 1).then([] (size_t s) {
+                BOOST_REQUIRE_EQUAL(s, test_string.size() + 1);
+                file_prepared = true;
+            });
+        });
+    });
+}
+
+static future<sstring> loader(const int& k) { // the key is not used: every key reads the same file
+    return open_file_dma((boost::filesystem::path(get_tmpdir().path) / test_file_name.c_str()).c_str(), open_flags::ro).then([] (file opened) -> future<sstring> {
+        return do_with(std::move(opened), [] (file& in) -> future<sstring> {
+            return in.dma_read_exactly<char>(0, test_string.size() + 1).then([] (auto buf) {
+                sstring str(buf.get()); // buffer is read as a NUL-terminated string
+                BOOST_REQUIRE_EQUAL(str, test_string);
+                ++load_count; // lets the tests count how many loads took place
+                return make_ready_future<sstring>(std::move(str));
+            });
+        });
+    });
+}
+
+SEASTAR_TEST_CASE(test_loading_shared_values_parallel_loading_same_key) {
+    return seastar::async([] {
+        using shared_values_type = utils::loading_shared_values<int, sstring>;
+        std::vector<int> same_key(num_loaders, 0); // num_loaders requests, all for key 0
+        shared_values_type shared_values;
+        std::list<shared_values_type::entry_ptr> anchors; // holds the returned entry pointers
+
+        load_count = 0;
+        prepare().get();
+
+        // Issue all the loads concurrently and keep every returned entry pointer.
+        parallel_for_each(same_key, [&] (int& k) {
+            return shared_values.get_or_load(k, loader).then([&] (auto entry_ptr) {
+                anchors.emplace_back(std::move(entry_ptr));
+            });
+        }).get();
+
+        // All requests targeted one key, so a single "loader" call must have served them.
+        BOOST_REQUIRE_EQUAL(load_count, 1);
+        BOOST_REQUIRE_EQUAL(shared_values.size(), 1);
+        anchors.clear();
+    });
+}
+
+SEASTAR_TEST_CASE(test_loading_shared_values_parallel_loading_different_keys) {
+    return seastar::async([] {
+        using shared_values_type = utils::loading_shared_values<int, sstring>;
+        std::vector<int> keys(num_loaders);
+        shared_values_type shared_values;
+        std::list<shared_values_type::entry_ptr> anchors; // holds the returned entry pointers
+
+        load_count = 0;
+        prepare().get();
+        std::iota(keys.begin(), keys.end(), 0); // distinct keys 0 .. num_loaders - 1
+
+        parallel_for_each(keys, [&] (int& k) {
+            return shared_values.get_or_load(k, loader).then([&] (auto entry_ptr) {
+                anchors.emplace_back(std::move(entry_ptr));
+            });
+        }).get();
+
+        // Distinct keys cannot share a load: expect one "loader" call and one entry per key.
+        BOOST_REQUIRE_EQUAL(load_count, num_loaders);
+        BOOST_REQUIRE_EQUAL(shared_values.size(), num_loaders);
+        anchors.clear();
+    });
+}
+
+SEASTAR_TEST_CASE(test_loading_shared_values_rehash) {
+    return seastar::async([] {
+        std::vector<int> ivec(num_loaders); // NOTE(review): filled below but never read in this test
+        load_count = 0;
+        utils::loading_shared_values<int, sstring> shared_values;
+        std::list<typename utils::loading_shared_values<int, sstring>::entry_ptr> anchors_list; // holds every returned entry pointer
+
+        prepare().get();
+
+        std::iota(ivec.begin(), ivec.end(), 0);
+
+        // verify that load factor is always in the (0.25, 0.75) range; the upper bound is checked after every insertion
+        for (int k = 0; k < num_loaders; ++k) {
+            shared_values.get_or_load(k, loader).then([&] (auto entry_ptr) {
+                anchors_list.emplace_back(std::move(entry_ptr));
+            }).get(); // wait for this load before issuing the next one
+            BOOST_REQUIRE_LE(shared_values.size(), 3 * shared_values.buckets_count() / 4);
+        }
+
+        BOOST_REQUIRE_GE(shared_values.size(), shared_values.buckets_count() / 4); // lower bound once all the keys are loaded
+
+        // minimum buckets count (by default) is 16, so don't check for less than 4 elements
+        for (int k = 0; k < num_loaders - 4; ++k) {
+            anchors_list.pop_back(); // drop the most recently stored entry pointer
+            shared_values.rehash(); // explicit rehash after each drop, then re-check the lower bound
+            BOOST_REQUIRE_GE(shared_values.size(), shared_values.buckets_count() / 4);
+        }
+
+        anchors_list.clear();
+    });
+}
+
+SEASTAR_TEST_CASE(test_loading_shared_values_parallel_loading_explicit_eviction) {
+    return seastar::async([] {
+        using shared_values_type = utils::loading_shared_values<int, sstring>;
+        std::vector<int> keys(num_loaders);
+        shared_values_type shared_values;
+        std::vector<shared_values_type::entry_ptr> anchors(num_loaders); // slot k stores the entry pointer of key k
+
+        load_count = 0;
+        prepare().get();
+        std::iota(keys.begin(), keys.end(), 0); // distinct keys 0 .. num_loaders - 1
+        parallel_for_each(keys, [&] (int& k) {
+            return shared_values.get_or_load(k, loader).then([&] (auto entry_ptr) {
+                anchors[k] = std::move(entry_ptr);
+            });
+        }).get();
+
+        // Resetting the stored pointer of a randomly picked key must make that key disappear.
+        const int rand_key = rand_int(num_loaders);
+        BOOST_REQUIRE(shared_values.find(rand_key) != shared_values.end());
+        anchors[rand_key] = nullptr;
+        BOOST_REQUIRE_MESSAGE(shared_values.find(rand_key) == shared_values.end(), format("explicit removal for key {} failed", rand_key));
+        anchors.clear();
+    });
+}
+
+SEASTAR_TEST_CASE(test_loading_cache_loading_same_key) {
+    return seastar::async([] {
+        using namespace std::chrono;
+        std::vector<int> same_key(num_loaders, 0); // num_loaders requests, all for key 0
+        utils::loading_cache<int, sstring> loading_cache(num_loaders, 1s, test_logger);
+
+        load_count = 0;
+        prepare().get();
+
+        // Request the key concurrently; the returned pointers themselves are not needed.
+        parallel_for_each(same_key, [&] (int& k) {
+            return loading_cache.get_ptr(k, loader).discard_result();
+        }).get();
+
+        // A single "loader" call must have served every request, leaving one cached entry.
+        BOOST_REQUIRE_EQUAL(load_count, 1);
+        BOOST_REQUIRE_EQUAL(loading_cache.size(), 1);
+
+        loading_cache.stop().get();
+    });
+}
+
+SEASTAR_TEST_CASE(test_loading_cache_loading_different_keys) {
+    return seastar::async([] {
+        using namespace std::chrono;
+        std::vector<int> keys(num_loaders);
+        utils::loading_cache<int, sstring> loading_cache(num_loaders, 1s, test_logger);
+
+        load_count = 0;
+        prepare().get();
+        std::iota(keys.begin(), keys.end(), 0); // distinct keys 0 .. num_loaders - 1
+
+        parallel_for_each(keys, [&] (int& k) {
+            return loading_cache.get_ptr(k, loader).discard_result();
+        }).get();
+
+        // Every key needs a load of its own, and all of them are expected to stay cached.
+        BOOST_REQUIRE_EQUAL(load_count, num_loaders);
+        BOOST_REQUIRE_EQUAL(loading_cache.size(), num_loaders);
+        loading_cache.stop().get();
+    });
+}
+
+SEASTAR_TEST_CASE(test_loading_cache_loading_expiry_eviction) {
+    return seastar::async([] {
+        using namespace std::chrono;
+        utils::loading_cache<int, sstring> loading_cache(num_loaders, 20ms, test_logger); // NOTE(review): 20ms is presumably the entry expiry period — confirm
+
+        prepare().get();
+
+        loading_cache.get_ptr(0, loader).discard_result().get(); // load key 0 and wait for it
+
+        BOOST_REQUIRE(loading_cache.find(0) != loading_cache.end());
+
+        // timers get delayed sometimes (especially in a debug mode), so poll in 40ms steps instead of sleeping once
+        constexpr int max_retry = 10;
+        int i = 0;
+        do_until(
+            [&] { return i++ > max_retry || loading_cache.find(0) == loading_cache.end(); }, // give up after the retries, or stop once key 0 is gone
+            [] { return sleep(40ms); }
+        ).get();
+        BOOST_REQUIRE(loading_cache.find(0) == loading_cache.end()); // key 0 must no longer be found
+        loading_cache.stop().get();
+    });
+}
+
+SEASTAR_TEST_CASE(test_loading_cache_loading_reloading) {
+    return seastar::async([] {
+        using namespace std::chrono;
+        load_count = 0;
+        utils::loading_cache<int, sstring, utils::loading_cache_reload_enabled::yes> loading_cache(num_loaders, 100ms, 20ms, test_logger, loader); // NOTE(review): presumably 100ms expiry and 20ms refresh periods — confirm
+        prepare().get();
+        loading_cache.get_ptr(0, loader).discard_result().get(); // first load of key 0
+        sleep(60ms).get(); // leave time for the cache to call "loader" again on its own
+        BOOST_REQUIRE_MESSAGE(load_count >= 2, format("load_count is {}", load_count)); // the first load plus at least one more
+        loading_cache.stop().get();
+    });
+}
+
+SEASTAR_TEST_CASE(test_loading_cache_max_size_eviction) {
+    return seastar::async([] {
+        using namespace std::chrono;
+        utils::loading_cache<int, sstring> loading_cache(1, 1s, test_logger);
+
+        load_count = 0;
+        prepare().get();
+        // Alternate between keys 0 and 1; the assertions expect a load per request and one cached entry.
+        for (int request = 0; request != num_loaders; ++request) {
+            loading_cache.get_ptr(request % 2, loader).discard_result().get();
+        }
+
+        BOOST_REQUIRE_EQUAL(load_count, num_loaders);
+        BOOST_REQUIRE_EQUAL(loading_cache.size(), 1);
+        loading_cache.stop().get();
+    });
+}
+
+SEASTAR_TEST_CASE(test_loading_cache_reload_during_eviction) {
+    return seastar::async([] {
+        using namespace std::chrono;
+        load_count = 0;
+        utils::loading_cache<int, sstring, utils::loading_cache_reload_enabled::yes> loading_cache(1, 100ms, 10ms, test_logger, loader); // NOTE(review): presumably max size 1, 100ms expiry, 10ms refresh — confirm
+
+        prepare().get();
+
+        auto curr_time = lowres_clock::now(); // start of the stress period
+        int i = 0;
+
+        // this will cause reloading when values are being actively evicted due to the limited cache size
+        do_until(
+            [&] { return lowres_clock::now() - curr_time > 1s; }, // keep going for a bit more than one second
+            [&] { return loading_cache.get_ptr(i++ % 2).discard_result(); } // alternate between keys 0 and 1; no loader is passed here
+        ).get();
+
+        BOOST_REQUIRE_EQUAL(loading_cache.size(), 1);
+        loading_cache.stop().get();
+    });
+}
--- a/tests/logalloc_test.cc
+++ b/tests/logalloc_test.cc
@@ -1194,3 +1194,39 @@ SEASTAR_TEST_CASE(test_reclaiming_runs_as_long_as_there_is_soft_pressure) {
        });
    });
 }
+
+SEASTAR_TEST_CASE(test_zone_reclaiming_preserves_free_size) {
+    return seastar::async([] {
+        region r;
+        with_allocator(r.allocator(), [&] {
+            chunked_fifo<managed_bytes> objs; // objects allocated through the region's allocator
+
+            auto zone_size = max_zone_segments * segment_size;
+
+            // We need to generate 3 zones, so that at least one zone (not last) can be released fully. The first
+            // zone would not due to emergency reserve.
+            while (logalloc::shard_tracker().region_occupancy().used_space() < zone_size * 2 + zone_size / 4) { // fill up to 2.25 zone sizes
+                objs.emplace_back(managed_bytes(managed_bytes::initialized_later(), 1024));
+            }
+
+            BOOST_TEST_MESSAGE(logalloc::shard_tracker().non_lsa_used_space());
+            BOOST_TEST_MESSAGE(logalloc::shard_tracker().region_occupancy());
+
+            while (logalloc::shard_tracker().region_occupancy().used_space() >= logalloc::segment_size * 2) { // free from the front until under two segments are used
+                objs.pop_front();
+            }
+
+            BOOST_TEST_MESSAGE(logalloc::shard_tracker().non_lsa_used_space());
+            BOOST_TEST_MESSAGE(logalloc::shard_tracker().region_occupancy());
+
+            auto before = logalloc::shard_tracker().non_lsa_used_space();
+            logalloc::shard_tracker().reclaim(logalloc::segment_size); // ask the tracker to reclaim one segment's worth
+            auto after = logalloc::shard_tracker().non_lsa_used_space();
+
+            BOOST_TEST_MESSAGE(logalloc::shard_tracker().non_lsa_used_space());
+            BOOST_TEST_MESSAGE(logalloc::shard_tracker().region_occupancy());
+
+            BOOST_REQUIRE(after <= before); // reclaiming must not increase the non-LSA used space
+        });
+    });
+}
--- a/tests/mutation_reader_test.cc
+++ b/tests/mutation_reader_test.cc
@@ -26,6 +26,7 @@
 #include "tests/test-utils.hh"
 #include "tests/mutation_assertions.hh"
 #include "tests/mutation_reader_assertions.hh"
+#include "tests/test_services.hh"

 #include "mutation_reader.hh"
 #include "core/do_with.hh"
--- a/tests/mutation_source_test.cc
+++ b/tests/mutation_source_test.cc
@@ -259,8 +259,9 @@ static void test_fast_forwarding_across_partitions_to_empty_range(populate_fn po

    mutation_source ms = populate(s, partitions);

+    auto pr = dht::partition_range::make({keys[0]}, {keys[1]});
    mutation_reader rd = ms(s,
-        dht::partition_range::make({keys[0]}, {keys[1]}),
+        pr,
        query::full_slice,
        default_priority_class(),
        nullptr,
@@ -280,14 +281,16 @@ static void test_fast_forwarding_across_partitions_to_empty_range(populate_fn po
            // ...don't finish consumption to leave the reader in the middle of partition
    }

-    rd.fast_forward_to(dht::partition_range::make({missing_key}, {missing_key})).get();
+    pr = dht::partition_range::make({missing_key}, {missing_key});
+    rd.fast_forward_to(pr).get();

    {
        streamed_mutation_opt smo = rd().get0();
        BOOST_REQUIRE(!smo);
    }

-    rd.fast_forward_to(dht::partition_range::make({keys[3]}, {keys[3]})).get();
+    pr = dht::partition_range::make({keys[3]}, {keys[3]});
+    rd.fast_forward_to(pr).get();

    {
        streamed_mutation_opt smo = rd().get0();
@@ -303,7 +306,8 @@ static void test_fast_forwarding_across_partitions_to_empty_range(populate_fn po
        BOOST_REQUIRE(!smo);
    }

-    rd.fast_forward_to(dht::partition_range::make_starting_with({keys[keys.size() - 1]})).get();
+    pr = dht::partition_range::make_starting_with({keys[keys.size() - 1]});
+    rd.fast_forward_to(pr).get();

    {
        streamed_mutation_opt smo = rd().get0();
@@ -314,7 +318,8 @@ static void test_fast_forwarding_across_partitions_to_empty_range(populate_fn po
        // ...don't finish consumption to leave the reader in the middle of partition
    }

-    rd.fast_forward_to(dht::partition_range::make({key_after_all}, {key_after_all})).get();
+    pr = dht::partition_range::make({key_after_all}, {key_after_all});
+    rd.fast_forward_to(pr).get();

    {
        streamed_mutation_opt smo = rd().get0();
@@ -1274,7 +1279,7 @@ public:
                set_random_cells(row.cells(), column_kind::regular_column);
                row.marker() = random_row_marker();
            } else {
-                m.partition().clustered_row(*_schema, ckey, is_dummy::yes, continuous);
+                m.partition().clustered_row(*_schema, position_in_partition::after_all_clustered_rows(), is_dummy::yes, continuous);
            }
        }

--- a/tests/mutation_test.cc
+++ b/tests/mutation_test.cc
@@ -44,6 +44,7 @@
 #include "tests/mutation_assertions.hh"
 #include "tests/mutation_reader_assertions.hh"
 #include "tests/result_set_assertions.hh"
+#include "tests/test_services.hh"
 #include "mutation_source_test.hh"
 #include "cell_locking.hh"

@@ -279,6 +280,7 @@ SEASTAR_TEST_CASE(test_list_mutations) {

 SEASTAR_TEST_CASE(test_multiple_memtables_one_partition) {
    return seastar::async([] {
+    storage_service_for_tests ssft;
    auto s = make_lw_shared(schema({}, some_keyspace, some_column_family,
        {{"p1", utf8_type}}, {{"c1", int32_type}}, {{"r1", int32_type}}, {}, utf8_type));

@@ -343,6 +345,7 @@ SEASTAR_TEST_CASE(test_flush_in_the_middle_of_a_scan) {

    return with_column_family(s, cfg, [s](column_family& cf) {
        return seastar::async([s, &cf] {
+            storage_service_for_tests ssft;
            // populate
            auto new_key = [&] {
                static thread_local int next = 0;
@@ -406,6 +409,7 @@ SEASTAR_TEST_CASE(test_flush_in_the_middle_of_a_scan) {
 }

 SEASTAR_TEST_CASE(test_multiple_memtables_multiple_partitions) {
+    return seastar::async([] {
    auto s = make_lw_shared(schema({}, some_keyspace, some_column_family,
            {{"p1", int32_type}}, {{"c1", int32_type}}, {{"r1", int32_type}}, {}, utf8_type));

@@ -416,7 +420,7 @@ SEASTAR_TEST_CASE(test_multiple_memtables_multiple_partitions) {
    cfg.enable_disk_writes = false;
    cfg.enable_incremental_backups = false;
    cfg.cf_stats = &*cf_stats;
-    return with_column_family(s, cfg, [s] (auto& cf) mutable {
+    with_column_family(s, cfg, [s] (auto& cf) mutable {
        std::map<int32_t, std::map<int32_t, int32_t>> shadow, result;

        const column_definition& r1_col = *s->get_column_definition("r1");
@@ -456,7 +460,8 @@ SEASTAR_TEST_CASE(test_multiple_memtables_multiple_partitions) {
                BOOST_REQUIRE(shadow == result);
            });
        });
-    }).then([cf_stats] {});
+    }).then([cf_stats] {}).get();
+    });
 }

 SEASTAR_TEST_CASE(test_cell_ordering) {
--- a/tests/range_tombstone_list_test.cc
+++ b/tests/range_tombstone_list_test.cc
@@ -592,6 +592,24 @@ BOOST_AUTO_TEST_CASE(test_add_overlapping_range_to_range_with_empty_end) {
    BOOST_REQUIRE(it == l.end());
 }

+// Reproduces https://github.com/scylladb/scylla/issues/3083
+BOOST_AUTO_TEST_CASE(test_coalescing_with_end_bound_inclusiveness_change_with_prefix_bound) {
+    range_tombstone_list l(*s);
+
+    auto rt1 = rtie(4, 8, 4); // NOTE(review): presumably (start, end, timestamp) with an exclusive end — confirm against the rtie helper
+    auto rt2 = range_tombstone(key({8, 1}), bound_kind::incl_start, key({10}), bound_kind::excl_end, {1, gc_now}); // starts at the two-component key {8, 1}
+
+    l.apply(*s, rt1);
+    l.apply(*s, rt2);
+
+    l.apply(*s, rt(1, 5, 4)); // third tombstone, applied on top of the two above
+
+    auto it = l.begin();
+    assert_rt(rtie(1, 8, 4), *it++); // first entry is expected to equal rtie(1, 8, 4)
+    assert_rt(rt2, *it++); // second entry is expected to be rt2, unchanged
+    BOOST_REQUIRE(it == l.end()); // and nothing else
+}
+
 BOOST_AUTO_TEST_CASE(test_search_with_empty_start) {
    range_tombstone_list l(*s);

--- a/tests/row_cache_test.cc
+++ b/tests/row_cache_test.cc
@@ -1886,3 +1886,47 @@ SEASTAR_TEST_CASE(test_concurrent_population_before_latest_version_iterator) {
        }
    });
 }
+
+SEASTAR_TEST_CASE(test_tombstone_merging_of_overlapping_tombstones_in_many_versions) {
+    return seastar::async([] {
+        simple_schema s;
+        cache_tracker tracker;
+        memtable_snapshot_source underlying(s.schema());
+
+        auto pk = s.make_pkey(0);
+        auto pr = dht::partition_range::make_singular(pk);
+
+        mutation m1(pk, s.schema()); // range tombstone over [2, 107] plus a row at ck=5
+        m1.partition().apply_delete(*s.schema(),
+            s.make_range_tombstone(s.make_ckey_range(2, 107), s.new_tombstone()));
+        s.add_row(m1, s.make_ckey(5), "val");
+
+        // What is important here is that it contains a newer range tombstone
+        // which trims [2, 107] from m1 into (100, 107], which starts after ck=5.
+        mutation m2(pk, s.schema());
+        m2.partition().apply_delete(*s.schema(),
+            s.make_range_tombstone(s.make_ckey_range(1, 100), s.new_tombstone()));
+
+        row_cache cache(s.schema(), snapshot_source([&] { return underlying(); }), tracker);
+
+        auto make_sm = [&] { // opens a cache reader and returns its first streamed_mutation
+            auto rd = cache.make_reader(s.schema());
+            auto smo = rd().get0();
+            BOOST_REQUIRE(smo);
+            streamed_mutation& sm = *smo;
+            sm.set_max_buffer_size(1);
+            return std::move(sm); // sm is a reference into smo, so the move is spelled out
+        };
+
+        apply(cache, underlying, m1);
+        populate_range(cache, pr, s.make_ckey_range(0, 3));
+
+        auto sm1 = make_sm(); // NOTE(review): presumably held open so that m2 ends up in a separate partition version — confirm
+
+        apply(cache, underlying, m2);
+
+        assert_that(cache.make_reader(s.schema())) // a fresh read must see both mutations merged
+            .produces(m1 + m2)
+            .produces_end_of_stream();
+    });
+}
--- a/tests/schema_change_test.cc
+++ b/tests/schema_change_test.cc
@@ -408,7 +408,7 @@ SEASTAR_TEST_CASE(test_prepared_statement_is_invalidated_by_schema_change) {
            logging::logger_registry().set_logger_level("query_processor", logging::log_level::debug);
            e.execute_cql("create keyspace tests with replication = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 };").get();
            e.execute_cql("create table tests.table1 (pk int primary key, c1 int, c2 int);").get();
-            bytes id = e.prepare("select * from tests.table1;").get0();
+            auto id = e.prepare("select * from tests.table1;").get0();

            e.execute_cql("alter table tests.table1 add s1 int;").get();

--- a/tests/simple_schema.hh
+++ b/tests/simple_schema.hh
@@ -43,12 +43,16 @@ public:
    api::timestamp_type new_timestamp() {
        return _timestamp++;
    }
+    tombstone new_tombstone() {
+        return tombstone{new_timestamp(), gc_clock::now()}; // next schema timestamp, paired with the current gc_clock time
+    }
 public:
-    simple_schema()
+    using with_static = bool_class<class static_tag>;
+    simple_schema(with_static ws = with_static::yes)
        : _s(schema_builder("ks", "cf")
            .with_column("pk", utf8_type, column_kind::partition_key)
            .with_column("ck", utf8_type, column_kind::clustering_key)
-            .with_column("s1", utf8_type, column_kind::static_column)
+            .with_column("s1", utf8_type, ws ? column_kind::static_column : column_kind::regular_column)
            .with_column("v", utf8_type)
            .build())
        , _v_def(*_s->get_column_definition(to_bytes("v")))
--- a/tests/sstable_assertions.hh
+++ b/tests/sstable_assertions.hh
@@ -52,7 +52,7 @@ public:
                auto& prev = pi->entries[0];
                for (size_t i = 1; i < pi->entries.size(); ++i) {
                    auto& cur = pi->entries[i];
-                    if (!pos_cmp(prev.end, cur.start)) {
+                    if (pos_cmp(cur.start, prev.end)) {
                        std::cout << "promoted index:\n";
                        for (auto& e : pi->entries) {
                            std::cout << "  " << e.start << "-" << e.end << ": +" << e.offset << " len=" << e.width << std::endl;
@@ -66,6 +66,16 @@ public:
        }
        return *this;
    }
+
+    index_reader_assertions& is_empty(const schema& s) {
+        // Passes only if no partition entry in the index carries a promoted index.
+        for (_r->read_partition_data().get(); !_r->eof(); _r->advance_to_next_partition().get()) {
+            auto* pi = _r->current_partition_entry().get_promoted_index(s);
+            BOOST_REQUIRE(pi == nullptr);
+        }
+
+        return *this;
+    }
 };

 inline
--- a/tests/sstable_datafile_test.cc
+++ b/tests/sstable_datafile_test.cc
@@ -47,6 +47,7 @@
 #include "cell_locking.hh"
 #include "simple_schema.hh"
 #include "memtable-sstable.hh"
+#include "tests/sstable_assertions.hh"

 #include <stdio.h>
 #include <ftw.h>
@@ -1009,6 +1010,8 @@ static ::mutation_reader sstable_reader(shared_sstable sst, schema_ptr s, const
 }

 SEASTAR_TEST_CASE(compaction_manager_test) {
+  return seastar::async([] {
+    storage_service_for_tests ssft;
    BOOST_REQUIRE(smp::count == 1);
    auto s = make_lw_shared(schema({}, some_keyspace, some_column_family,
        {{"p1", utf8_type}}, {{"c1", utf8_type}}, {{"r1", int32_type}}, {}, utf8_type));
@@ -1030,7 +1033,7 @@ SEASTAR_TEST_CASE(compaction_manager_test) {

    auto generations = make_lw_shared<std::vector<unsigned long>>({1, 2, 3, 4});

-    return do_for_each(*generations, [generations, cf, cm, s, tmp] (unsigned long generation) {
+    do_for_each(*generations, [generations, cf, cm, s, tmp] (unsigned long generation) {
        // create 4 sstables of similar size to be compacted later on.

        auto mt = make_lw_shared<memtable>(s);
@@ -1083,7 +1086,8 @@ SEASTAR_TEST_CASE(compaction_manager_test) {
        });
    }).finally([s, cm, tmp, cl_stats] {
        return cm->stop().then([cm] {});
-    });
+    }).get();
+  });
 }

 SEASTAR_TEST_CASE(compact) {
@@ -1650,8 +1654,6 @@ SEASTAR_TEST_CASE(datafile_generation_47) {
 SEASTAR_TEST_CASE(test_counter_write) {
    return test_setup::do_with_test_directory([] {
        return seastar::async([] {
-            storage_service_for_tests ssft;
-
            auto s = schema_builder(some_keyspace, some_column_family)
                    .with_column("p1", utf8_type, column_kind::partition_key)
                    .with_column("c1", utf8_type, column_kind::clustering_key)
@@ -2280,6 +2282,7 @@ static shared_sstable make_sstable_containing(std::function<shared_sstable()> ss
 SEASTAR_TEST_CASE(tombstone_purge_test) {
    BOOST_REQUIRE(smp::count == 1);
    return seastar::async([] {
+        storage_service_for_tests ssft;
        cell_locker_stats cl_stats;

        // In a column family with gc_grace_seconds set to 0, check that a tombstone
@@ -3175,6 +3178,7 @@ static void test_min_max_clustering_key(schema_ptr s, std::vector<bytes> explode

 SEASTAR_TEST_CASE(min_max_clustering_key_test) {
    return seastar::async([] {
+        storage_service_for_tests ssft;
        {
            auto s = schema_builder("ks", "cf")
                .with_column("pk", utf8_type, column_kind::partition_key)
@@ -3222,6 +3226,7 @@ SEASTAR_TEST_CASE(min_max_clustering_key_test) {

 SEASTAR_TEST_CASE(min_max_clustering_key_test_2) {
    return seastar::async([] {
+        storage_service_for_tests ssft;
        auto s = schema_builder("ks", "cf")
            .with_column("pk", utf8_type, column_kind::partition_key)
            .with_column("ck1", utf8_type, column_kind::clustering_key)
@@ -3270,6 +3275,7 @@ SEASTAR_TEST_CASE(min_max_clustering_key_test_2) {

 SEASTAR_TEST_CASE(sstable_tombstone_metadata_check) {
    return seastar::async([] {
+        storage_service_for_tests ssft;
        auto s = schema_builder("ks", "cf")
            .with_column("pk", utf8_type, column_kind::partition_key)
            .with_column("ck1", utf8_type, column_kind::clustering_key)
@@ -3439,6 +3445,7 @@ shared_sstable make_sstable(sstring path, streamed_mutation sm, sstable_writer_c

 SEASTAR_TEST_CASE(test_repeated_tombstone_skipping) {
    return seastar::async([] {
+        storage_service_for_tests ssft;
        simple_schema table;

        std::vector<mutation_fragment> fragments;
@@ -3506,6 +3513,7 @@ uint64_t consume_all(streamed_mutation& sm) {

 SEASTAR_TEST_CASE(test_skipping_using_index) {
    return seastar::async([] {
+        storage_service_for_tests ssft;
        simple_schema table;

        const unsigned rows_per_part = 10;
@@ -3843,3 +3851,63 @@ SEASTAR_TEST_CASE(test_wrong_counter_shard_order) {
            BOOST_REQUIRE(!reader().get0());
        });
 }
+
+SEASTAR_TEST_CASE(test_broken_promoted_index_is_skipped) {
+    // Loads a pre-generated 'ka' sstable and checks its index reader with is_empty().
+    // Table: create table ks.test (pk int, ck int, v int, primary key(pk, ck)) with compact storage;
+    // Populated with:
+    //
+    // insert into ks.test (pk, ck, v) values (1, 1, 1);
+    // insert into ks.test (pk, ck, v) values (1, 2, 1);
+    // insert into ks.test (pk, ck, v) values (1, 3, 1);
+    // delete from ks.test where pk = 1 and ck = 2;
+    return seastar::async([] {
+        auto s = schema_builder("ks", "test")
+                .with_column("pk", int32_type, column_kind::partition_key)
+                .with_column("ck", int32_type, column_kind::clustering_key)
+                .with_column("v", int32_type)
+                .build(schema_builder::compact_storage::yes);
+
+        auto sst = sstables::make_sstable(s, "tests/sstables/broken_non_compound_pi_and_range_tombstone", 1, sstables::sstable::version_types::ka, big);
+        sst->load().get0(); // wait for the load to finish before inspecting the index
+        // NOTE(review): is_empty() presumably asserts that no promoted-index entries are exposed — confirm in tests/sstable_assertions.hh
+        {
+            assert_that(sst->get_index_reader(default_priority_class())).is_empty(*s);
+        }
+    });
+}
+
+SEASTAR_TEST_CASE(test_old_format_non_compound_range_tombstone_is_read) {
+    // Reads the ck = 2 row of a pre-generated 'ka' sstable and compares it with a hand-built mutation.
+    // Table: create table ks.test (pk int, ck int, v int, primary key(pk, ck)) with compact storage;
+    // Populated with:
+    //
+    // insert into ks.test (pk, ck, v) values (1, 1, 1);
+    // insert into ks.test (pk, ck, v) values (1, 2, 1);
+    // insert into ks.test (pk, ck, v) values (1, 3, 1);
+    // delete from ks.test where pk = 1 and ck = 2;
+    return seastar::async([] {
+        auto s = schema_builder("ks", "test")
+                .with_column("pk", int32_type, column_kind::partition_key)
+                .with_column("ck", int32_type, column_kind::clustering_key)
+                .with_column("v", int32_type)
+                .build(schema_builder::compact_storage::yes);
+
+        auto sst = sstables::make_sstable(s, "tests/sstables/broken_non_compound_pi_and_range_tombstone", 1, sstables::sstable::version_types::ka, big);
+        sst->load().get0();
+
+        auto pk = partition_key::from_exploded(*s, { int32_type->decompose(1) });
+        auto dk = dht::global_partitioner().decorate_key(*s, pk);
+        auto ck = clustering_key::from_exploded(*s, {int32_type->decompose(2)});
+        mutation m(dk, s);
+        m.set_clustered_cell(ck, *s->get_column_definition("v"), atomic_cell::make_live(1511270919978349, int32_type->decompose(1), { })); // live cell v = 1 at ck = 2
+        m.partition().apply_delete(*s, ck, {1511270943827278, gc_clock::from_time_t(1511270943)}); // deletion applied to ck = 2
+        // NOTE(review): the timestamps above presumably match the ones stored in the checked-in sstable — confirm
+        {
+            auto slice = partition_slice_builder(*s).with_range(query::clustering_range::make_singular({ck})).build(); // restrict the read to ck
+            assert_that(sst->as_mutation_source()(s, dht::partition_range::make_singular(dk), slice))
+                    .produces(m)
+                    .produces_end_of_stream();
+        }
+    });
+}
--- a/tests/sstable_mutation_test.cc
+++ b/tests/sstable_mutation_test.cc
@@ -32,10 +32,12 @@
 #include "mutation_reader.hh"
 #include "mutation_reader_assertions.hh"
 #include "mutation_source_test.hh"
+#include "partition_slice_builder.hh"
 #include "tmpdir.hh"
 #include "memtable-sstable.hh"
 #include "disk-error-handler.hh"
 #include "tests/sstable_assertions.hh"
+#include "tests/test_services.hh"

 thread_local disk_error_signal_type commit_error;
 thread_local disk_error_signal_type general_disk_error;
@@ -386,6 +388,7 @@ void test_mutation_source(sstable_writer_config cfg, sstables::sstable::version_

 SEASTAR_TEST_CASE(test_sstable_conforms_to_mutation_source) {
    return seastar::async([] {
+        storage_service_for_tests ssft;
        for (auto version : {sstables::sstable::version_types::ka, sstables::sstable::version_types::la}) {
            for (auto index_block_size : {1, 128, 64*1024}) {
                sstable_writer_config cfg;
@@ -398,6 +401,7 @@ SEASTAR_TEST_CASE(test_sstable_conforms_to_mutation_source) {

 SEASTAR_TEST_CASE(test_sstable_can_write_and_read_range_tombstone) {
    return seastar::async([] {
+        storage_service_for_tests ssft;
        auto dir = make_lw_shared<tmpdir>();
        auto s = make_lw_shared(schema({}, "ks", "cf",
            {{"p1", utf8_type}}, {{"c1", int32_type}}, {{"r1", int32_type}}, {}, utf8_type));
@@ -772,6 +776,7 @@ SEASTAR_TEST_CASE(tombstone_in_tombstone2) {

 SEASTAR_TEST_CASE(test_non_compound_table_row_is_not_marked_as_static) {
    return seastar::async([] {
+        storage_service_for_tests ssft;
        auto dir = make_lw_shared<tmpdir>();
        schema_builder builder("ks", "cf");
        builder.with_column("p", utf8_type, column_kind::partition_key);
@@ -805,6 +810,7 @@ SEASTAR_TEST_CASE(test_non_compound_table_row_is_not_marked_as_static) {

 SEASTAR_TEST_CASE(test_promoted_index_blocks_are_monotonic) {
    return seastar::async([] {
+        storage_service_for_tests ssft;
        auto dir = make_lw_shared<tmpdir>();
        schema_builder builder("ks", "cf");
        builder.with_column("p", utf8_type, column_kind::partition_key);
@@ -851,3 +857,283 @@ SEASTAR_TEST_CASE(test_promoted_index_blocks_are_monotonic) {
        assert_that(sst->get_index_reader(default_priority_class())).has_monotonic_positions(*s);
    });
 }
+
+SEASTAR_TEST_CASE(test_promoted_index_blocks_are_monotonic_compound_dense) { // two clustering columns, compact storage
+    return seastar::async([] {
+        storage_service_for_tests ssft;
+        auto dir = make_lw_shared<tmpdir>();
+        schema_builder builder("ks", "cf");
+        builder.with_column("p", utf8_type, column_kind::partition_key);
+        builder.with_column("c1", int32_type, column_kind::clustering_key);
+        builder.with_column("c2", int32_type, column_kind::clustering_key);
+        builder.with_column("v", int32_type);
+        auto s = builder.build(schema_builder::compact_storage::yes);
+
+        auto dk = dht::global_partitioner().decorate_key(*s, partition_key::from_exploded(*s, {to_bytes("key1")}));
+        auto cell = atomic_cell::make_live(1, int32_type->decompose(88), { });
+        mutation m(dk, s);
+
+        auto ck1 = clustering_key::from_exploded(*s, {int32_type->decompose(1), int32_type->decompose(2)});
+        m.set_clustered_cell(ck1, *s->get_column_definition("v"), cell);
+
+        auto ck2 = clustering_key::from_exploded(*s, {int32_type->decompose(1), int32_type->decompose(4)});
+        m.set_clustered_cell(ck2, *s->get_column_definition("v"), cell);
+
+        auto ck3 = clustering_key::from_exploded(*s, {int32_type->decompose(1), int32_type->decompose(6)});
+        m.set_clustered_cell(ck3, *s->get_column_definition("v"), cell);
+
+        auto ck4 = clustering_key::from_exploded(*s, {int32_type->decompose(3), int32_type->decompose(9)});
+        m.set_clustered_cell(ck4, *s->get_column_definition("v"), cell);
+
+        m.partition().apply_row_tombstone(*s, range_tombstone( // range tombstone over c1 prefixes [1, 2], both bounds inclusive
+                clustering_key_prefix::from_exploded(*s, {int32_type->decompose(1)}),
+                bound_kind::incl_start,
+                clustering_key_prefix::from_exploded(*s, {int32_type->decompose(2)}),
+                bound_kind::incl_end,
+                {1, gc_clock::now()}));
+
+        auto mt = make_lw_shared<memtable>(s);
+        mt->apply(m); // do not move: m is compared against the sstable contents below
+
+        auto sst = sstables::make_sstable(s,
+                                          dir->path,
+                                          1 /* generation */,
+                                          sstables::sstable::version_types::ka,
+                                          sstables::sstable::format_types::big);
+        sstable_writer_config cfg;
+        cfg.promoted_index_block_size = 1; // NOTE(review): presumably makes the writer emit many small promoted-index blocks — confirm
+        sst->write_components(mt->make_reader(s), 1, s, cfg).get();
+        sst->load().get();
+
+        {
+            assert_that(sst->get_index_reader(default_priority_class())).has_monotonic_positions(*s);
+        }
+        // Reading back from ck1 onwards is expected to yield the whole mutation m.
+        {
+            auto slice = partition_slice_builder(*s).with_range(query::clustering_range::make_starting_with({ck1})).build();
+            assert_that(sst->as_mutation_source()(s, dht::partition_range::make_singular(dk), slice))
+                    .produces(m)
+                    .produces_end_of_stream();
+        }
+    });
+}
+
+SEASTAR_TEST_CASE(test_promoted_index_blocks_are_monotonic_non_compound_dense) { // one clustering column, compact storage
+    return seastar::async([] {
+        storage_service_for_tests ssft;
+        auto dir = make_lw_shared<tmpdir>();
+        schema_builder builder("ks", "cf");
+        builder.with_column("p", utf8_type, column_kind::partition_key);
+        builder.with_column("c1", int32_type, column_kind::clustering_key);
+        builder.with_column("v", int32_type);
+        auto s = builder.build(schema_builder::compact_storage::yes);
+
+        auto dk = dht::global_partitioner().decorate_key(*s, partition_key::from_exploded(*s, {to_bytes("key1")}));
+        auto cell = atomic_cell::make_live(1, int32_type->decompose(88), { });
+        mutation m(dk, s);
+
+        auto ck1 = clustering_key::from_exploded(*s, {int32_type->decompose(1)});
+        m.set_clustered_cell(ck1, *s->get_column_definition("v"), cell);
+
+        auto ck2 = clustering_key::from_exploded(*s, {int32_type->decompose(2)});
+        m.set_clustered_cell(ck2, *s->get_column_definition("v"), cell);
+
+        auto ck3 = clustering_key::from_exploded(*s, {int32_type->decompose(3)});
+        m.set_clustered_cell(ck3, *s->get_column_definition("v"), cell);
+
+        m.partition().apply_row_tombstone(*s, range_tombstone( // range tombstone over c1 in [1, 2], both bounds inclusive
+                clustering_key_prefix::from_exploded(*s, {int32_type->decompose(1)}),
+                bound_kind::incl_start,
+                clustering_key_prefix::from_exploded(*s, {int32_type->decompose(2)}),
+                bound_kind::incl_end,
+                {1, gc_clock::now()}));
+
+        auto mt = make_lw_shared<memtable>(s);
+        mt->apply(m); // do not move: m is compared against the sstable contents below
+
+        auto sst = sstables::make_sstable(s,
+                                          dir->path,
+                                          1 /* generation */,
+                                          sstables::sstable::version_types::ka,
+                                          sstables::sstable::format_types::big);
+        sstable_writer_config cfg;
+        cfg.promoted_index_block_size = 1; // NOTE(review): presumably makes the writer emit many small promoted-index blocks — confirm
+        sst->write_components(mt->make_reader(s), 1, s, cfg).get();
+        sst->load().get();
+
+        {
+            assert_that(sst->get_index_reader(default_priority_class())).has_monotonic_positions(*s);
+        }
+        // Reading back from ck1 onwards is expected to yield the whole mutation m.
+        {
+            auto slice = partition_slice_builder(*s).with_range(query::clustering_range::make_starting_with({ck1})).build();
+            assert_that(sst->as_mutation_source()(s, dht::partition_range::make_singular(dk), slice))
+                    .produces(m)
+                    .produces_end_of_stream();
+        }
+    });
+}
+
+SEASTAR_TEST_CASE(test_promoted_index_repeats_open_tombstones) { // runs once without and once with compact storage
+    return seastar::async([] {
+        storage_service_for_tests ssft;
+        auto dir = make_lw_shared<tmpdir>();
+        int id = 0;
+        for (auto& compact : { schema_builder::compact_storage::no, schema_builder::compact_storage::yes }) {
+            schema_builder builder("ks", sprint("cf%d", id++)); // distinct table name per iteration: cf0, cf1
+            builder.with_column("p", utf8_type, column_kind::partition_key);
+            builder.with_column("c1", bytes_type, column_kind::clustering_key);
+            builder.with_column("v", int32_type);
+            auto s = builder.build(compact);
+
+            auto dk = dht::global_partitioner().decorate_key(*s, partition_key::from_exploded(*s, {to_bytes("key1")}));
+            auto cell = atomic_cell::make_live(1, int32_type->decompose(88), { });
+            mutation m(dk, s);
+
+            m.partition().apply_row_tombstone(*s, range_tombstone( // range tombstone ["ck1", "ck5"], both bounds inclusive
+                    clustering_key_prefix::from_exploded(*s, {bytes_type->decompose(data_value(to_bytes("ck1")))}),
+                    bound_kind::incl_start,
+                    clustering_key_prefix::from_exploded(*s, {bytes_type->decompose(data_value(to_bytes("ck5")))}),
+                    bound_kind::incl_end,
+                    {1, gc_clock::now()}));
+
+            auto ck = clustering_key::from_exploded(*s, {bytes_type->decompose(data_value(to_bytes("ck3")))}); // a live row at "ck3"
+            m.set_clustered_cell(ck, *s->get_column_definition("v"), cell);
+
+            auto mt = make_lw_shared<memtable>(s);
+            mt->apply(m);
+
+            auto sst = sstables::make_sstable(s,
+                                              dir->path,
+                                              1 /* generation */,
+                                              sstables::sstable::version_types::ka,
+                                              sstables::sstable::format_types::big);
+            sstable_writer_config cfg;
+            cfg.promoted_index_block_size = 1; // NOTE(review): presumably makes the writer emit many small promoted-index blocks — confirm
+            sst->write_components(mt->make_reader(s), 1, s, cfg).get();
+            sst->load().get();
+            // The read starts at ck, yet the expectation is the whole mutation m, range tombstone included.
+            {
+                auto slice = partition_slice_builder(*s).with_range(query::clustering_range::make_starting_with({ck})).build();
+                assert_that(sst->as_mutation_source()(s, dht::partition_range::make_singular(dk), slice))
+                        .produces(m)
+                        .produces_end_of_stream();
+            }
+        }
+    });
+}
+
+SEASTAR_TEST_CASE(test_range_tombstones_are_correctly_seralized_for_non_compound_dense_schemas) { // one clustering column, compact storage
+    return seastar::async([] {
+        storage_service_for_tests ssft;
+        auto dir = make_lw_shared<tmpdir>();
+        schema_builder builder("ks", "cf");
+        builder.with_column("p", utf8_type, column_kind::partition_key);
+        builder.with_column("c", int32_type, column_kind::clustering_key);
+        builder.with_column("v", int32_type);
+        auto s = builder.build(schema_builder::compact_storage::yes);
+
+        auto dk = dht::global_partitioner().decorate_key(*s, partition_key::from_exploded(*s, {to_bytes("key1")}));
+        mutation m(dk, s);
+
+        m.partition().apply_row_tombstone(*s, range_tombstone( // the mutation holds only this range tombstone over c in [1, 2]
+                clustering_key_prefix::from_exploded(*s, {int32_type->decompose(1)}),
+                bound_kind::incl_start,
+                clustering_key_prefix::from_exploded(*s, {int32_type->decompose(2)}),
+                bound_kind::incl_end,
+                {1, gc_clock::now()}));
+
+        auto mt = make_lw_shared<memtable>(s);
+        mt->apply(m);
+
+        auto sst = sstables::make_sstable(s,
+                                          dir->path,
+                                          1 /* generation */,
+                                          sstables::sstable::version_types::ka,
+                                          sstables::sstable::format_types::big);
+        sstable_writer_config cfg; // default writer configuration
+        sst->write_components(mt->make_reader(s), 1, s, cfg).get();
+        sst->load().get();
+        // A read with an unrestricted slice is expected to return m unchanged.
+        {
+            auto slice = partition_slice_builder(*s).build();
+            assert_that(sst->as_mutation_source()(s, dht::partition_range::make_singular(dk), slice))
+                    .produces(m)
+                    .produces_end_of_stream();
+        }
+    });
+}
+
+SEASTAR_TEST_CASE(test_promoted_index_is_absent_for_schemas_without_clustering_key) { // partition key plus one regular column only
+    return seastar::async([] {
+        storage_service_for_tests ssft;
+        auto dir = make_lw_shared<tmpdir>();
+        schema_builder builder("ks", "cf");
+        builder.with_column("p", utf8_type, column_kind::partition_key);
+        builder.with_column("v", int32_type);
+        auto s = builder.build(schema_builder::compact_storage::yes);
+
+        auto dk = dht::global_partitioner().decorate_key(*s, partition_key::from_exploded(*s, {to_bytes("key1")}));
+        mutation m(dk, s);
+        for (auto&& v : { 1, 2, 3, 4 }) { // sets the same cell (empty clustering key) four times, each with timestamp 1
+            auto cell = atomic_cell::make_live(1, int32_type->decompose(v), { });
+            m.set_clustered_cell(clustering_key_prefix::make_empty(), *s->get_column_definition("v"), cell);
+        }
+        auto mt = make_lw_shared<memtable>(s);
+        mt->apply(m);
+
+        auto sst = sstables::make_sstable(s,
+                                          dir->path,
+                                          1 /* generation */,
+                                          sstables::sstable::version_types::ka,
+                                          sstables::sstable::format_types::big);
+        sstable_writer_config cfg;
+        cfg.promoted_index_block_size = 1; // NOTE(review): presumably the smallest block size, so an index would be produced if one were possible — confirm
+        sst->write_components(mt->make_reader(s), 1, s, cfg).get();
+        sst->load().get();
+        // NOTE(review): is_empty() presumably asserts that no promoted-index entries are exposed — confirm in tests/sstable_assertions.hh
+        assert_that(sst->get_index_reader(default_priority_class())).is_empty(*s);
+    });
+}
+
+SEASTAR_TEST_CASE(test_can_write_and_read_non_compound_range_tombstone_as_compound) { // one clustering column, compact storage
+    return seastar::async([] {
+        storage_service_for_tests ssft;
+        auto dir = make_lw_shared<tmpdir>();
+        schema_builder builder("ks", "cf");
+        builder.with_column("p", utf8_type, column_kind::partition_key);
+        builder.with_column("c", int32_type, column_kind::clustering_key);
+        builder.with_column("v", int32_type);
+        auto s = builder.build(schema_builder::compact_storage::yes);
+
+        auto dk = dht::global_partitioner().decorate_key(*s, partition_key::from_exploded(*s, {to_bytes("key1")}));
+        mutation m(dk, s);
+
+        m.partition().apply_row_tombstone(*s, range_tombstone( // the mutation holds only this range tombstone over c in [1, 2]
+                clustering_key_prefix::from_exploded(*s, {int32_type->decompose(1)}),
+                bound_kind::incl_start,
+                clustering_key_prefix::from_exploded(*s, {int32_type->decompose(2)}),
+                bound_kind::incl_end,
+                {1, gc_clock::now()}));
+
+        auto mt = make_lw_shared<memtable>(s);
+        mt->apply(m);
+
+        auto sst = sstables::make_sstable(s,
+                                          dir->path,
+                                          1 /* generation */,
+                                          sstables::sstable::version_types::ka,
+                                          sstables::sstable::format_types::big);
+        sstable_writer_config cfg;
+        cfg.correctly_serialize_non_compound_range_tombstones = false; // write with the flag off; the read below must still produce m
+        sst->write_components(mt->make_reader(s), 1, s, cfg).get();
+        sst->load().get();
+        // A read with an unrestricted slice is expected to return m unchanged.
+        {
+            auto slice = partition_slice_builder(*s).build();
+            assert_that(sst->as_mutation_source()(s, dht::partition_range::make_singular(dk), slice))
+                    .produces(m)
+                    .produces_end_of_stream();
+        }
+    });
+}
--- a/tests/sstable_test.hh
+++ b/tests/sstable_test.hh
@@ -29,6 +29,7 @@
 #include "schema.hh"
 #include "schema_builder.hh"
 #include "core/thread.hh"
+#include "tests/test_services.hh"

 static auto la = sstables::sstable::version_types::la;
 static auto big = sstables::sstable::format_types::big;
@@ -597,12 +598,23 @@ public:
    }

    static future<> do_with_test_directory(std::function<future<> ()>&& fut, sstring p = path()) {
-        return test_setup::create_empty_test_dir(p).then([fut = std::move(fut), p] () mutable {
-            return fut();
-        }).finally([p] {
-            return test_setup::empty_test_dir(p).then([p] {
-                return engine().remove_file(p);
-            });
+        return seastar::async([p, fut = std::move(fut)] {
+            storage_service_for_tests ssft;
+            test_setup::create_empty_test_dir(p).get();
+            // Remember a failure of the test body instead of propagating it right
+            // away, so the directory is cleaned up on failure too (as the
+            // .finally() of the previous implementation did).
+            std::exception_ptr failure;
+            try {
+                fut().get();
+            } catch (...) {
+                failure = std::current_exception();
+            }
+            test_setup::empty_test_dir(p).get();
+            engine().remove_file(p).get();
+            if (failure) {
+                std::rethrow_exception(failure);
+            }
        });
    }
 };
--- a/tests/sstables/broken_non_compound_pi_and_range_tombstone/ks-test-ka-1-CompressionInfo.db
+++ b/tests/sstables/broken_non_compound_pi_and_range_tombstone/ks-test-ka-1-CompressionInfo.db
--- a/tests/sstables/broken_non_compound_pi_and_range_tombstone/ks-test-ka-1-Data.db
+++ b/tests/sstables/broken_non_compound_pi_and_range_tombstone/ks-test-ka-1-Data.db
--- a/tests/sstables/broken_non_compound_pi_and_range_tombstone/ks-test-ka-1-Digest.sha1
+++ b/tests/sstables/broken_non_compound_pi_and_range_tombstone/ks-test-ka-1-Digest.sha1
@@ -0,0 +1 @@
+2104758772
--- a/tests/sstables/broken_non_compound_pi_and_range_tombstone/ks-test-ka-1-Filter.db
+++ b/tests/sstables/broken_non_compound_pi_and_range_tombstone/ks-test-ka-1-Filter.db
--- a/tests/sstables/broken_non_compound_pi_and_range_tombstone/ks-test-ka-1-Index.db
+++ b/tests/sstables/broken_non_compound_pi_and_range_tombstone/ks-test-ka-1-Index.db
--- a/tests/sstables/broken_non_compound_pi_and_range_tombstone/ks-test-ka-1-Scylla.db
+++ b/tests/sstables/broken_non_compound_pi_and_range_tombstone/ks-test-ka-1-Scylla.db
--- a/tests/sstables/broken_non_compound_pi_and_range_tombstone/ks-test-ka-1-Statistics.db
+++ b/tests/sstables/broken_non_compound_pi_and_range_tombstone/ks-test-ka-1-Statistics.db
--- a/tests/sstables/broken_non_compound_pi_and_range_tombstone/ks-test-ka-1-Summary.db
+++ b/tests/sstables/broken_non_compound_pi_and_range_tombstone/ks-test-ka-1-Summary.db
--- a/tests/sstables/broken_non_compound_pi_and_range_tombstone/ks-test-ka-1-TOC.txt
+++ b/tests/sstables/broken_non_compound_pi_and_range_tombstone/ks-test-ka-1-TOC.txt
@@ -0,0 +1,9 @@
+Scylla.db
+CompressionInfo.db
+Filter.db
+Statistics.db
+TOC.txt
+Digest.sha1
+Index.db
+Summary.db
+Data.db
--- a/tests/view_schema_test.cc
+++ b/tests/view_schema_test.cc
--- a/thrift/handler.cc
+++ b/thrift/handler.cc
@@ -1002,7 +1002,7 @@ public:

    void execute_prepared_cql3_query(tcxx::function<void(CqlResult const& _return)> cob, tcxx::function<void(::apache::thrift::TDelayedException* _throw)> exn_cob, const int32_t itemId, const std::vector<std::string> & values, const ConsistencyLevel::type consistency) {
        with_exn_cob(std::move(exn_cob), [&] {
-            auto prepared = _query_processor.local().get_prepared_for_thrift(itemId);
+            auto prepared = _query_processor.local().get_prepared(cql3::prepared_cache_key_type(itemId));
            if (!prepared) {
                throw make_exception<InvalidRequestException>("Prepared query with id %d not found", itemId);
            }
--- a/thrift/server.cc
+++ b/thrift/server.cc
@@ -50,6 +50,8 @@ using namespace apache::thrift::protocol;
 using namespace apache::thrift::async;
 using namespace ::cassandra;

+using namespace std::chrono_literals;
+
 class thrift_stats {
    seastar::metrics::metric_groups _metrics;
 public:
@@ -68,8 +70,10 @@ thrift_server::~thrift_server() {
 }

 future<> thrift_server::stop() {
+    auto f = _stop_gate.close(); // close the gate first; f is handed back to the caller below
+    std::for_each(_listeners.begin(), _listeners.end(), std::mem_fn(&server_socket::abort_accept)); // abort_accept() on every listener
    std::for_each(_connections_list.begin(), _connections_list.end(), std::mem_fn(&connection::shutdown));
-    return make_ready_future<>();
+    return f; // NOTE(review): presumably resolves once everything running under with_gate(_stop_gate, ...) has left the gate — confirm against seastar::gate
 }

 struct handler_deleter {
@@ -101,8 +105,27 @@ thrift_server::connection::connection(thrift_server& server, connected_socket&&
 }

 thrift_server::connection::~connection() {
-    --_server._current_connections;
-    _server._connections_list.erase(_server._connections_list.iterator_to(*this));
+    if (is_linked()) { // only unregister while this object is still on the server's connection list
+        --_server._current_connections;
+        _server._connections_list.erase(_server._connections_list.iterator_to(*this));
+    }
+}
+
+thrift_server::connection::connection(connection&& other) // move constructor: takes over other's members and, if linked, its list position
+        : _server(other._server)
+        , _fd(std::move(other._fd))
+        , _read_buf(std::move(other._read_buf))
+        , _write_buf(std::move(other._write_buf))
+        , _transport(std::move(other._transport))
+        , _input(std::move(other._input))
+        , _output(std::move(other._output))
+        , _in_proto(std::move(other._in_proto))
+        , _out_proto(std::move(other._out_proto))
+        , _processor(std::move(other._processor)) {
+    if (other.is_linked()) { // other sits on an intrusive list: put *this in its place
+        boost::intrusive::list<connection>::node_algorithms::init(this_ptr()); // initialise this object's list node before swapping
+        boost::intrusive::list<connection>::node_algorithms::swap_nodes(other.this_ptr(), this_ptr()); // NOTE(review): presumably leaves other unlinked, so its destructor skips the list bookkeeping — confirm
+    }
 }

 future<>
@@ -190,29 +213,71 @@ thrift_server::listen(ipv4_addr addr, bool keepalive) {

 void
 thrift_server::do_accepts(int which, bool keepalive) {
-    _listeners[which].accept().then([this, which, keepalive] (connected_socket fd, socket_address addr) mutable {
-        fd.set_nodelay(true);
-        fd.set_keepalive(keepalive);
-        auto conn = new connection(*this, std::move(fd), addr);
-        conn->process().then_wrapped([this, conn] (future<> f) {
-            conn->shutdown();
-            delete conn;
-            try {
-                f.get();
-            } catch (std::exception& ex) {
-                tlogger.debug("request error {}", ex.what());
-            }
+    if (_stop_gate.is_closed()) {
+        return;
+    }
+    with_gate(_stop_gate, [&, this] {
+        return _listeners[which].accept().then([this, which, keepalive] (connected_socket fd, socket_address addr) {
+            fd.set_nodelay(true);
+            fd.set_keepalive(keepalive);
+            with_gate(_stop_gate, [&, this] {
+                return do_with(connection(*this, std::move(fd), addr), [this] (auto& conn) {
+                    return conn.process().then_wrapped([this, &conn] (future<> f) {
+                        conn.shutdown();
+                        try {
+                            f.get();
+                        } catch (std::exception& ex) {
+                            tlogger.debug("request error {}", ex.what());
+                        }
+                    });
+                });
+            });
+            do_accepts(which, keepalive);
+        }).handle_exception([this, which, keepalive] (auto ex) {
+            tlogger.debug("accept failed {}", ex);
+            this->maybe_retry_accept(which, keepalive, std::move(ex));
        });
-        do_accepts(which, keepalive);
-    }).then_wrapped([] (future<> f) {
-        try {
-            f.get();
-        } catch (std::exception& ex) {
-            std::cout << "accept failed: " << ex.what() << "\n";
-        }
    });
 }

+// Decides how the accept loop of listener `which` continues after the accept
+// continuation chain in do_accepts() failed with `ex`:
+//  - EBADF and seastar::gate_closed_exception: give up, the loop ends;
+//  - ENFILE, EMFILE, ENOMEM and std::bad_alloc: retry after a 1ms sleep;
+//  - anything else: retry immediately.
+void thrift_server::maybe_retry_accept(int which, bool keepalive, std::exception_ptr ex) {
+    auto retry = [this, which, keepalive] {
+        tlogger.debug("retrying accept after failure");
+        do_accepts(which, keepalive);
+    };
+    auto retry_with_backoff = [&] {
+        // FIXME: Consider using exponential backoff
+        sleep(1ms).then([retry = std::move(retry)] { retry(); });
+    };
+    try {
+        std::rethrow_exception(std::move(ex));
+    } catch (const std::system_error& e) {
+        switch (e.code().value()) {
+            // FIXME: Don't retry for other fatal errors
+            case EBADF:
+                break;
+            case ENFILE:
+            case EMFILE:
+            case ENOMEM:
+                retry_with_backoff();
+                break; // must not fall through: default would call retry() as well and start a second accept loop
+            default:
+                retry();
+        }
+    } catch (const std::bad_alloc&) {
+        retry_with_backoff();
+    } catch (const seastar::gate_closed_exception&) {
+        return;
+    } catch (...) {
+        retry();
+    }
+}
 uint64_t
 thrift_server::total_connections() const {
    return _total_connections;
--- a/thrift/server.hh
+++ b/thrift/server.hh
@@ -25,6 +25,7 @@
 #include "core/reactor.hh"
 #include "core/distributed.hh"
 #include "cql3/query_processor.hh"
+#include <seastar/core/gate.hh>
 #include <memory>
 #include <cstdint>
 #include <boost/intrusive/list.hpp>
@@ -79,6 +80,7 @@ class thrift_server {
    public:
        connection(thrift_server& server, connected_socket&& fd, socket_address addr);
        ~connection();
+        connection(connection&&);
        future<> process();
        future<> read();
        future<> write();
@@ -96,6 +98,7 @@ private:
    uint64_t _current_connections = 0;
    uint64_t _requests_served = 0;
    boost::intrusive::list<connection> _connections_list;
+    seastar::gate _stop_gate;
 public:
    thrift_server(distributed<database>& db, distributed<cql3::query_processor>& qp);
    ~thrift_server();
@@ -105,6 +108,9 @@ public:
    uint64_t total_connections() const;
    uint64_t current_connections() const;
    uint64_t requests_served() const;
+
+private:
+    void maybe_retry_accept(int which, bool keepalive, std::exception_ptr ex);
 };

 #endif /* APPS_SEASTAR_THRIFT_SERVER_HH_ */
--- a/transport/event_notifier.cc
+++ b/transport/event_notifier.cc
@@ -66,12 +66,12 @@ void cql_server::event_notifier::on_create_keyspace(const sstring& ks_name)
 {
    for (auto&& conn : _schema_change_listeners) {
        using namespace cql_transport;
-        with_gate(conn->_pending_requests_gate, [&] {
-            return conn->write_response(conn->make_schema_change_event(event::schema_change{
+        if (!conn->_pending_requests_gate.is_closed()) {
+            conn->write_response(conn->make_schema_change_event(event::schema_change{
                event::schema_change::change_type::CREATED,
                ks_name
            }));
-        });
+        };
    }
 }

@@ -79,14 +79,14 @@ void cql_server::event_notifier::on_create_column_family(const sstring& ks_name,
 {
    for (auto&& conn : _schema_change_listeners) {
        using namespace cql_transport;
-        with_gate(conn->_pending_requests_gate, [&] {
-            return conn->write_response(conn->make_schema_change_event(event::schema_change{
+        if (!conn->_pending_requests_gate.is_closed()) {
+            conn->write_response(conn->make_schema_change_event(event::schema_change{
                event::schema_change::change_type::CREATED,
                event::schema_change::target_type::TABLE,
                ks_name,
                cf_name
            }));
-        });
+        };
    }
 }

@@ -94,14 +94,14 @@ void cql_server::event_notifier::on_create_user_type(const sstring& ks_name, con
 {
    for (auto&& conn : _schema_change_listeners) {
        using namespace cql_transport;
-        with_gate(conn->_pending_requests_gate, [&] {
-            return conn->write_response(conn->make_schema_change_event(event::schema_change{
+        if (!conn->_pending_requests_gate.is_closed()) {
+            conn->write_response(conn->make_schema_change_event(event::schema_change{
                event::schema_change::change_type::CREATED,
                event::schema_change::target_type::TYPE,
                ks_name,
                type_name
            }));
-        });
+        };
    }
 }

@@ -124,12 +124,12 @@ void cql_server::event_notifier::on_update_keyspace(const sstring& ks_name)
 {
    for (auto&& conn : _schema_change_listeners) {
        using namespace cql_transport;
-        with_gate(conn->_pending_requests_gate, [&] {
-            return conn->write_response(conn->make_schema_change_event(event::schema_change{
+        if (!conn->_pending_requests_gate.is_closed()) {
+            conn->write_response(conn->make_schema_change_event(event::schema_change{
                event::schema_change::change_type::UPDATED,
                ks_name
            }));
-        });
+        };
    }
 }

@@ -137,14 +137,14 @@ void cql_server::event_notifier::on_update_column_family(const sstring& ks_name,
 {
    for (auto&& conn : _schema_change_listeners) {
        using namespace cql_transport;
-        with_gate(conn->_pending_requests_gate, [&] {
-            return conn->write_response(conn->make_schema_change_event(event::schema_change{
+        if (!conn->_pending_requests_gate.is_closed()) {
+            conn->write_response(conn->make_schema_change_event(event::schema_change{
                event::schema_change::change_type::UPDATED,
                event::schema_change::target_type::TABLE,
                ks_name,
                cf_name
            }));
-        });
+        };
    }
 }

@@ -152,14 +152,14 @@ void cql_server::event_notifier::on_update_user_type(const sstring& ks_name, con
 {
    for (auto&& conn : _schema_change_listeners) {
        using namespace cql_transport;
-        with_gate(conn->_pending_requests_gate, [&] {
-            return conn->write_response(conn->make_schema_change_event(event::schema_change{
+        if (!conn->_pending_requests_gate.is_closed()) {
+            conn->write_response(conn->make_schema_change_event(event::schema_change{
                event::schema_change::change_type::UPDATED,
                event::schema_change::target_type::TYPE,
                ks_name,
                type_name
            }));
-        });
+        };
    }
 }

@@ -182,12 +182,12 @@ void cql_server::event_notifier::on_drop_keyspace(const sstring& ks_name)
 {
    for (auto&& conn : _schema_change_listeners) {
        using namespace cql_transport;
-        with_gate(conn->_pending_requests_gate, [&] {
-            return conn->write_response(conn->make_schema_change_event(event::schema_change{
+        if (!conn->_pending_requests_gate.is_closed()) {
+            conn->write_response(conn->make_schema_change_event(event::schema_change{
                event::schema_change::change_type::DROPPED,
                ks_name
            }));
-        });
+        };
    }
 }

@@ -195,14 +195,14 @@ void cql_server::event_notifier::on_drop_column_family(const sstring& ks_name, c
 {
    for (auto&& conn : _schema_change_listeners) {
        using namespace cql_transport;
-        with_gate(conn->_pending_requests_gate, [&] {
-            return conn->write_response(conn->make_schema_change_event(event::schema_change{
+        if (!conn->_pending_requests_gate.is_closed()) {
+            conn->write_response(conn->make_schema_change_event(event::schema_change{
                event::schema_change::change_type::DROPPED,
                event::schema_change::target_type::TABLE,
                ks_name,
                cf_name
            }));
-        });
+        };
    }
 }

@@ -210,14 +210,14 @@ void cql_server::event_notifier::on_drop_user_type(const sstring& ks_name, const
 {
    for (auto&& conn : _schema_change_listeners) {
        using namespace cql_transport;
-        with_gate(conn->_pending_requests_gate, [&] {
-            return conn->write_response(conn->make_schema_change_event(event::schema_change{
+        if (!conn->_pending_requests_gate.is_closed()) {
+            conn->write_response(conn->make_schema_change_event(event::schema_change{
                event::schema_change::change_type::DROPPED,
                event::schema_change::target_type::TYPE,
                ks_name,
                type_name
            }));
-        });
+        };
    }
 }

@@ -240,9 +240,9 @@ void cql_server::event_notifier::on_join_cluster(const gms::inet_address& endpoi
 {
    for (auto&& conn : _topology_change_listeners) {
        using namespace cql_transport;
-        with_gate(conn->_pending_requests_gate, [&] {
-            return conn->write_response(conn->make_topology_change_event(event::topology_change::new_node(endpoint, conn->_server_addr.port)));
-        });
+        if (!conn->_pending_requests_gate.is_closed()) {
+            conn->write_response(conn->make_topology_change_event(event::topology_change::new_node(endpoint, conn->_server_addr.port)));
+        };
    }
 }

@@ -250,9 +250,9 @@ void cql_server::event_notifier::on_leave_cluster(const gms::inet_address& endpo
 {
    for (auto&& conn : _topology_change_listeners) {
        using namespace cql_transport;
-        with_gate(conn->_pending_requests_gate, [&] {
-            return conn->write_response(conn->make_topology_change_event(event::topology_change::removed_node(endpoint, conn->_server_addr.port)));
-        });
+        if (!conn->_pending_requests_gate.is_closed()) {
+            conn->write_response(conn->make_topology_change_event(event::topology_change::removed_node(endpoint, conn->_server_addr.port)));
+        };
    }
 }

@@ -260,9 +260,9 @@ void cql_server::event_notifier::on_move(const gms::inet_address& endpoint)
 {
    for (auto&& conn : _topology_change_listeners) {
        using namespace cql_transport;
-        with_gate(conn->_pending_requests_gate, [&] {
-            return conn->write_response(conn->make_topology_change_event(event::topology_change::moved_node(endpoint, conn->_server_addr.port)));
-        });
+        if (!conn->_pending_requests_gate.is_closed()) {
+            conn->write_response(conn->make_topology_change_event(event::topology_change::moved_node(endpoint, conn->_server_addr.port)));
+        };
    }
 }

@@ -273,9 +273,9 @@ void cql_server::event_notifier::on_up(const gms::inet_address& endpoint)
    if (!was_up) {
        for (auto&& conn : _status_change_listeners) {
            using namespace cql_transport;
-            with_gate(conn->_pending_requests_gate, [&] {
-                return conn->write_response(conn->make_status_change_event(event::status_change::node_up(endpoint, conn->_server_addr.port)));
-            });
+            if (!conn->_pending_requests_gate.is_closed()) {
+                conn->write_response(conn->make_status_change_event(event::status_change::node_up(endpoint, conn->_server_addr.port)));
+            };
        }
    }
 }
@@ -287,9 +287,9 @@ void cql_server::event_notifier::on_down(const gms::inet_address& endpoint)
    if (!was_down) {
        for (auto&& conn : _status_change_listeners) {
            using namespace cql_transport;
-            with_gate(conn->_pending_requests_gate, [&] {
-                return conn->write_response(conn->make_status_change_event(event::status_change::node_down(endpoint, conn->_server_addr.port)));
-            });
+            if (!conn->_pending_requests_gate.is_closed()) {
+                conn->write_response(conn->make_status_change_event(event::status_change::node_down(endpoint, conn->_server_addr.port)));
+            };
        }
    }
 }
--- a/transport/server.cc
+++ b/transport/server.cc
@@ -590,8 +590,8 @@ future<> cql_server::connection::process()
            return write_response(make_error(0, exceptions::exception_code::SERVER_ERROR, "unknown error", tracing::trace_state_ptr()));
        }
    }).finally([this] {
-        _server._notifier->unregister_connection(this);
        return _pending_requests_gate.close().then([this] {
+            _server._notifier->unregister_connection(this);
            return _ready_to_respond.finally([this] {
                return _write_buf.close();
            });
@@ -826,15 +826,14 @@ future<response_type> cql_server::connection::process_prepare(uint16_t stream, b
    return parallel_for_each(cpus.begin(), cpus.end(), [this, query, cpu_id, &cs] (unsigned int c) mutable {
        if (c != cpu_id) {
            return smp::submit_to(c, [this, query, &cs] () mutable {
-                _server._query_processor.local().prepare(query, cs, false);
-                // FIXME: error handling
+                return _server._query_processor.local().prepare(std::move(query), cs, false).discard_result();
            });
        } else {
            return make_ready_future<>();
        }
-    }).then([this, query, stream, &cs] {
+    }).then([this, query, stream, &cs] () mutable {
        tracing::trace(cs.get_trace_state(), "Done preparing on remote shards");
-        return _server._query_processor.local().prepare(query, cs, false).then([this, stream, &cs] (auto msg) {
+        return _server._query_processor.local().prepare(std::move(query), cs, false).then([this, stream, &cs] (auto msg) {
            tracing::trace(cs.get_trace_state(), "Done preparing on a local shard - preparing a result. ID is [{}]", seastar::value_of([&msg] {
                return messages::result_message::prepared::cql::get_id(msg);
            }));
@@ -848,8 +847,9 @@ future<response_type> cql_server::connection::process_prepare(uint16_t stream, b

 future<response_type> cql_server::connection::process_execute(uint16_t stream, bytes_view buf, service::client_state client_state)
 {
-    auto id = read_short_bytes(buf);
-    auto prepared = _server._query_processor.local().get_prepared(id);
+    cql3::prepared_cache_key_type cache_key(read_short_bytes(buf));
+    auto& id = cql3::prepared_cache_key_type::cql_id(cache_key);
+    auto prepared = _server._query_processor.local().get_prepared(cache_key);
    if (!prepared) {
        throw exceptions::prepared_query_not_found_exception(id);
    }
@@ -925,8 +925,9 @@ cql_server::connection::process_batch(uint16_t stream, bytes_view buf, service::
            break;
        }
        case 1: {
-            auto id = read_short_bytes(buf);
-            ps = _server._query_processor.local().get_prepared(id);
+            cql3::prepared_cache_key_type cache_key(read_short_bytes(buf));
+            auto& id = cql3::prepared_cache_key_type::cql_id(cache_key);
+            ps = _server._query_processor.local().get_prepared(cache_key);
            if (!ps) {
                throw exceptions::prepared_query_not_found_exception(id);
            }
--- a/types.cc
+++ b/types.cc
@@ -1963,8 +1963,7 @@ map_type_impl::to_string(const bytes& b) const {

 size_t
 map_type_impl::hash(bytes_view v) const {
-    // FIXME:
-    abort();
+    return std::hash<bytes_view>()(v);
 }

 bytes
@@ -2448,8 +2447,7 @@ set_type_impl::to_string(const bytes& b) const {

 size_t
 set_type_impl::hash(bytes_view v) const {
-    // FIXME:
-    abort();
+    return std::hash<bytes_view>()(v);
 }

 bytes
@@ -2637,8 +2635,7 @@ list_type_impl::to_string(const bytes& b) const {

 size_t
 list_type_impl::hash(bytes_view v) const {
-    // FIXME:
-    abort();
+    return std::hash<bytes_view>()(v);
 }

 bytes
--- a/utils/loading_cache.hh
+++ b/utils/loading_cache.hh
@@ -29,77 +29,54 @@
 #include <seastar/core/timer.hh>
 #include <seastar/core/gate.hh>

-#include "utils/exceptions.hh"
+#include "exceptions/exceptions.hh"
+#include "utils/loading_shared_values.hh"
+#include "log.hh"

 namespace bi = boost::intrusive;

 namespace utils {
-// Simple variant of the "LoadingCache" used for permissions in origin.

-typedef lowres_clock loading_cache_clock_type;
-typedef bi::list_base_hook<bi::link_mode<bi::auto_unlink>> auto_unlink_list_hook;
+using loading_cache_clock_type = seastar::lowres_clock;
+using auto_unlink_list_hook = bi::list_base_hook<bi::link_mode<bi::auto_unlink>>;

-template<typename Tp, typename Key, typename Hash, typename EqualPred>
-class timestamped_val : public auto_unlink_list_hook, public bi::unordered_set_base_hook<bi::store_hash<true>> {
+template<typename Tp, typename Key, typename EntrySize , typename Hash, typename EqualPred, typename LoadingSharedValuesStats>
+class timestamped_val {
 public:
-    typedef bi::list<timestamped_val, bi::constant_time_size<false>> lru_list_type;
-    typedef Key key_type;
-    typedef Tp value_type;
+    using value_type = Tp;
+    using loading_values_type = typename utils::loading_shared_values<Key, timestamped_val, Hash, EqualPred, LoadingSharedValuesStats, 256>;
+    class lru_entry;
+    class value_ptr;

 private:
-    std::experimental::optional<Tp> _opt_value;
+    value_type _value;
    loading_cache_clock_type::time_point _loaded;
    loading_cache_clock_type::time_point _last_read;
-    lru_list_type& _lru_list; /// MRU item is at the front, LRU - at the back
-    Key _key;
+    lru_entry* _lru_entry_ptr = nullptr; /// MRU item is at the front, LRU - at the back
+    size_t _size = 0;

 public:
-    struct key_eq {
-       bool operator()(const Key& k, const timestamped_val& c) const {
-           return EqualPred()(k, c.key());
-       }
-
-       bool operator()(const timestamped_val& c, const Key& k) const {
-           return EqualPred()(c.key(), k);
-       }
-    };
-
-    timestamped_val(lru_list_type& lru_list, const Key& key)
-        : _loaded(loading_cache_clock_type::now())
+    timestamped_val(value_type val)
+        : _value(std::move(val))
+        , _loaded(loading_cache_clock_type::now())
        , _last_read(_loaded)
-        , _lru_list(lru_list)
-        , _key(key) {}
-
-    timestamped_val(lru_list_type& lru_list, Key&& key)
-        : _loaded(loading_cache_clock_type::now())
-        , _last_read(_loaded)
-        , _lru_list(lru_list)
-        , _key(std::move(key)) {}
-
-    timestamped_val(const timestamped_val&) = default;
+        , _size(EntrySize()(_value))
+    {}
    timestamped_val(timestamped_val&&) = default;

-    // Make sure copy/move-assignments don't go through the template below
-    timestamped_val& operator=(const timestamped_val&) = default;
-    timestamped_val& operator=(timestamped_val&) = default;
-    timestamped_val& operator=(timestamped_val&&) = default;
+    timestamped_val& operator=(value_type new_val) {
+        assert(_lru_entry_ptr);

-    template <typename U>
-    timestamped_val& operator=(U&& new_val) {
-        _opt_value = std::forward<U>(new_val);
+        _value = std::move(new_val);
        _loaded = loading_cache_clock_type::now();
+        _lru_entry_ptr->cache_size() -= _size;
+        _size = EntrySize()(_value);
+        _lru_entry_ptr->cache_size() += _size;
        return *this;
    }

-    const Tp& value() {
-        _last_read = loading_cache_clock_type::now();
-        touch();
-        return _opt_value.value();
-    }
-
-    explicit operator bool() const noexcept {
-        return bool(_opt_value);
-    }
+    value_type& value() noexcept { return _value; }
+    const value_type& value() const noexcept { return _value; }

    loading_cache_clock_type::time_point last_read() const noexcept {
        return _last_read;
@@ -109,163 +86,353 @@ public:
        return _loaded;
    }

-    const Key& key() const {
-        return _key;
+    size_t size() const {
+        return _size;
    }

-    friend bool operator==(const timestamped_val& a, const timestamped_val& b){
-        return EqualPred()(a.key(), b.key());
-    }
-
-    friend std::size_t hash_value(const timestamped_val& v) {
-        return Hash()(v.key());
+    bool ready() const noexcept {
+        return _lru_entry_ptr;
    }

 private:
+    void touch() noexcept {
+        assert(_lru_entry_ptr);
+        _last_read = loading_cache_clock_type::now();
+        _lru_entry_ptr->touch();
+    }
+
+    void set_anchor_back_reference(lru_entry* lru_entry_ptr) noexcept {
+        _lru_entry_ptr = lru_entry_ptr;
+    }
+};
+
+template <typename Tp>
+struct simple_entry_size {
+    size_t operator()(const Tp& val) {
+        return 1;
+    }
+};
+
+template<typename Tp, typename Key, typename EntrySize , typename Hash, typename EqualPred, typename LoadingSharedValuesStats>
+class timestamped_val<Tp, Key, EntrySize, Hash, EqualPred, LoadingSharedValuesStats>::value_ptr {
+private:
+    using ts_value_type = timestamped_val<Tp, Key, EntrySize, Hash, EqualPred, LoadingSharedValuesStats>;
+    using loading_values_type = typename ts_value_type::loading_values_type;
+
+public:
+    using timestamped_val_ptr = typename loading_values_type::entry_ptr;
+    using value_type = Tp;
+
+private:
+    timestamped_val_ptr _ts_val_ptr;
+
+public:
+    value_ptr(timestamped_val_ptr ts_val_ptr) : _ts_val_ptr(std::move(ts_val_ptr)) { _ts_val_ptr->touch(); }
+    explicit operator bool() const noexcept { return bool(_ts_val_ptr); }
+    value_type& operator*() const noexcept { return _ts_val_ptr->value(); }
+    value_type* operator->() const noexcept { return &_ts_val_ptr->value(); }
+};
+
+/// \brief This is an LRU list entry which is also an anchor for a loading_cache value.
+template<typename Tp, typename Key, typename EntrySize , typename Hash, typename EqualPred, typename LoadingSharedValuesStats>
+class timestamped_val<Tp, Key, EntrySize, Hash, EqualPred, LoadingSharedValuesStats>::lru_entry : public auto_unlink_list_hook {
+private:
+    using ts_value_type = timestamped_val<Tp, Key, EntrySize, Hash, EqualPred, LoadingSharedValuesStats>;
+    using loading_values_type = typename ts_value_type::loading_values_type;
+
+public:
+    using lru_list_type = bi::list<lru_entry, bi::constant_time_size<false>>;
+    using timestamped_val_ptr = typename loading_values_type::entry_ptr;
+
+private:
+    timestamped_val_ptr _ts_val_ptr;
+    lru_list_type& _lru_list;
+    size_t& _cache_size;
+
+public:
+    lru_entry(timestamped_val_ptr ts_val, lru_list_type& lru_list, size_t& cache_size)
+        : _ts_val_ptr(std::move(ts_val))
+        , _lru_list(lru_list)
+        , _cache_size(cache_size)
+    {
+        _ts_val_ptr->set_anchor_back_reference(this);
+        _cache_size += _ts_val_ptr->size();
+    }
+
+    ~lru_entry() {
+        _cache_size -= _ts_val_ptr->size();
+        _ts_val_ptr->set_anchor_back_reference(nullptr);
+    }
+
+    size_t& cache_size() noexcept {
+        return _cache_size;
+    }
+
    /// Set this item as the most recently used item.
    /// The MRU item is going to be at the front of the _lru_list, the LRU item - at the back.
    void touch() noexcept {
        auto_unlink_list_hook::unlink();
        _lru_list.push_front(*this);
    }
-};

-class shared_mutex {
-private:
-    lw_shared_ptr<semaphore> _mutex_ptr;
-
-public:
-    shared_mutex() : _mutex_ptr(make_lw_shared<semaphore>(1)) {}
-    semaphore& get() const noexcept {
-        return *_mutex_ptr;
+    const Key& key() const noexcept {
+        return loading_values_type::to_key(_ts_val_ptr);
    }
+
+    timestamped_val& timestamped_value() noexcept { return *_ts_val_ptr; }
+    const timestamped_val& timestamped_value() const noexcept { return *_ts_val_ptr; }
+    timestamped_val_ptr timestamped_value_ptr() noexcept { return _ts_val_ptr; }
 };

+enum class loading_cache_reload_enabled { no, yes };
+
+/// \brief Loading cache is a cache that loads the value into the cache using the given asynchronous callback.
+///
+/// If reloading is enabled (\tparam ReloadEnabled == loading_cache_reload_enabled::yes), each cached value is reloaded once
+/// the "refresh" time period has passed since it was last loaded.
+///
+/// The values are going to be evicted from the cache if they are not accessed during the "expiration" period or haven't
+/// been reloaded even once during the same period.
+///
+/// If "expiration" is set to zero - the caching is going to be disabled and get_XXX(...) is going to call the "loader" callback
+/// every time in order to get the requested value.
+///
+/// \note In order to avoid the eviction of cached entries due to "aging" of the contained value the user has to choose
+/// the "expiration" to be at least ("refresh" + "max load latency"). This way the value is going to stay in the cache and is going to be
+/// read in a non-blocking way as long as it's frequently accessed. Note however that since reloading is an asynchronous
+/// procedure it may get delayed by other running tasks. Therefore, by choosing an "expiration" too close to the ("refresh" + "max load latency")
+/// value one risks having cached values evicted when the system is heavily loaded.
+///
+/// The cache is also limited in size and if adding the next value is going
+/// to exceed the cache size limit the least recently used value(s) is(are) going to be evicted until the size of the cache
+/// becomes such that adding the new value is not going to break the size limit. If the new entry's size is greater than
+/// the cache size then the get_XXX(...) method is going to return a future with the loading_cache::entry_is_too_big exception.
+///
+/// The size of the cache is defined as a sum of sizes of all cached entries.
+/// The size of each entry is defined by the value returned by the \tparam EntrySize predicate applied on it.
+///
+/// The get(key) and get_ptr(key) methods ensure that the "loader" callback is called only once for each cached entry regardless of how many
+/// callers are calling for the get_XXX(key) for the same "key" at the same time. Only after the value is evicted from the cache
+/// it's going to be "loaded" in the context of get_XXX(key). As long as the value is cached get_XXX(key) is going to return the
+/// cached value immediately and reload it in the background every "refresh" time period as described above.
+///
+/// \tparam Key type of the cache key
+/// \tparam Tp type of the cached value
+/// \tparam ReloadEnabled if loading_cache_reload_enabled::yes allow reloading the values otherwise don't reload
+/// \tparam EntrySize predicate to calculate the entry size
+/// \tparam Hash hash function
+/// \tparam EqualPred equality predicate
+/// \tparam LoadingSharedValuesStats statistics incrementing class (see utils::loading_shared_values)
+/// \tparam Alloc elements allocator
 template<typename Key,
         typename Tp,
+         loading_cache_reload_enabled ReloadEnabled = loading_cache_reload_enabled::no,
+         typename EntrySize = simple_entry_size<Tp>,
         typename Hash = std::hash<Key>,
         typename EqualPred = std::equal_to<Key>,
-         typename Alloc = std::allocator<timestamped_val<Tp, Key, Hash, EqualPred>>,
-         typename SharedMutexMapAlloc = std::allocator<std::pair<const Key, shared_mutex>>>
+         typename LoadingSharedValuesStats = utils::do_nothing_loading_shared_values_stats,
+         typename Alloc = std::allocator<typename timestamped_val<Tp, Key, EntrySize, Hash, EqualPred, LoadingSharedValuesStats>::lru_entry>>
 class loading_cache {
 private:
-    typedef timestamped_val<Tp, Key, Hash, EqualPred> ts_value_type;
-    typedef bi::unordered_set<ts_value_type, bi::power_2_buckets<true>, bi::compare_hash<true>> set_type;
-    typedef std::unordered_map<Key, shared_mutex, Hash, EqualPred, SharedMutexMapAlloc> write_mutex_map_type;
-    typedef typename ts_value_type::lru_list_type lru_list_type;
-    typedef typename set_type::bucket_traits bi_set_bucket_traits;
-
-    static constexpr int initial_num_buckets = 256;
-    static constexpr int max_num_buckets = 1024 * 1024;
+    using ts_value_type = timestamped_val<Tp, Key, EntrySize, Hash, EqualPred, LoadingSharedValuesStats>;
+    using loading_values_type = typename ts_value_type::loading_values_type;
+    using timestamped_val_ptr = typename loading_values_type::entry_ptr;
+    using ts_value_lru_entry = typename ts_value_type::lru_entry;
+    using set_iterator = typename loading_values_type::iterator;
+    using lru_list_type = typename ts_value_lru_entry::lru_list_type;
+    struct value_extractor_fn {
+        Tp& operator()(ts_value_type& tv) const {
+            return tv.value();
+        }
+    };

 public:
-    typedef Tp value_type;
-    typedef Key key_type;
-    typedef typename set_type::iterator iterator;
+    using value_type = Tp;
+    using key_type = Key;
+    using value_ptr = typename ts_value_type::value_ptr;

+    class entry_is_too_big : public std::exception {};
+    using iterator = boost::transform_iterator<value_extractor_fn, set_iterator>;
+
+private:
+    loading_cache(size_t max_size, std::chrono::milliseconds expiry, std::chrono::milliseconds refresh, logging::logger& logger)
+        : _max_size(max_size)
+        , _expiry(expiry)
+        , _refresh(refresh)
+        , _logger(logger)
+        , _timer([this] { on_timer(); })
+    {
+        // Sanity check: if expiration period is given then non-zero refresh period and maximal size are required
+        if (caching_enabled() && (_refresh == std::chrono::milliseconds(0) || _max_size == 0)) {
+            throw exceptions::configuration_exception("loading_cache: caching is enabled but refresh period and/or max_size are zero");
+        }
+    }
+
+public:
    template<typename Func>
    loading_cache(size_t max_size, std::chrono::milliseconds expiry, std::chrono::milliseconds refresh, logging::logger& logger, Func&& load)
-                : _buckets(initial_num_buckets)
-                , _set(bi_set_bucket_traits(_buckets.data(), _buckets.size()))
-                , _max_size(max_size)
-                , _expiry(expiry)
-                , _refresh(refresh)
-                , _logger(logger)
-                , _load(std::forward<Func>(load)) {
+        : loading_cache(max_size, expiry, refresh, logger)
+    {
+        static_assert(ReloadEnabled == loading_cache_reload_enabled::yes, "This constructor should only be invoked when ReloadEnabled == loading_cache_reload_enabled::yes");
+        static_assert(std::is_same<future<value_type>, std::result_of_t<Func(const key_type&)>>::value, "Bad Func signature");
+
+        _load = std::forward<Func>(load);

        // If expiration period is zero - caching is disabled
        if (!caching_enabled()) {
            return;
        }

-        // Sanity check: if expiration period is given then non-zero refresh period and maximal size are required
-        if (_refresh == std::chrono::milliseconds(0) || _max_size == 0) {
-            throw exceptions::configuration_exception("loading_cache: caching is enabled but refresh period and/or max_size are zero");
+        _timer_period = std::min(_expiry, _refresh);
+        _timer.arm(_timer_period);
+    }
+
+    loading_cache(size_t max_size, std::chrono::milliseconds expiry, logging::logger& logger)
+        : loading_cache(max_size, expiry, loading_cache_clock_type::time_point::max().time_since_epoch(), logger)
+    {
+        static_assert(ReloadEnabled == loading_cache_reload_enabled::no, "This constructor should only be invoked when ReloadEnabled == loading_cache_reload_enabled::no");
+
+        // If expiration period is zero - caching is disabled
+        if (!caching_enabled()) {
+            return;
        }

-        _timer.set_callback([this] { on_timer(); });
-        _timer.arm(_refresh);
+        _timer_period = _expiry;
+        _timer.arm(_timer_period);
    }

    ~loading_cache() {
-        _set.clear_and_dispose([] (ts_value_type* ptr) { loading_cache::destroy_ts_value(ptr); });
+        _lru_list.erase_and_dispose(_lru_list.begin(), _lru_list.end(), [] (ts_value_lru_entry* ptr) { loading_cache::destroy_ts_value(ptr); });
+    }
+
+    template <typename LoadFunc>
+    future<value_ptr> get_ptr(const Key& k, LoadFunc&& load) {
+        static_assert(std::is_same<future<value_type>, std::result_of_t<LoadFunc(const key_type&)>>::value, "Bad LoadFunc signature");
+        // We shouldn't be here if caching is disabled
+        assert(caching_enabled());
+
+        return _loading_values.get_or_load(k, [this, load = std::forward<LoadFunc>(load)] (const Key& k) mutable {
+            return load(k).then([this] (value_type val) {
+                return ts_value_type(std::move(val));
+            });
+        }).then([this, k] (timestamped_val_ptr ts_val_ptr) {
+            // check again since it could have already been inserted and initialized
+            if (!ts_val_ptr->ready()) {
+                _logger.trace("{}: storing the value for the first time", k);
+
+                if (ts_val_ptr->size() > _max_size) {
+                    return make_exception_future<value_ptr>(entry_is_too_big());
+                }
+
+                ts_value_lru_entry* new_lru_entry = Alloc().allocate(1);
+                new(new_lru_entry) ts_value_lru_entry(std::move(ts_val_ptr), _lru_list, _current_size);
+
+                // This will "touch" the entry and add it to the LRU list - we must do this before the shrink() call.
+                value_ptr vp(new_lru_entry->timestamped_value_ptr());
+
+                // Remove the least recently used items if map is too big.
+                shrink();
+
+                return make_ready_future<value_ptr>(std::move(vp));
+            }
+
+            return make_ready_future<value_ptr>(std::move(ts_val_ptr));
+        });
+    }
+
+    future<value_ptr> get_ptr(const Key& k) {
+        static_assert(ReloadEnabled == loading_cache_reload_enabled::yes, "reload must be enabled");
+        return get_ptr(k, _load);
    }

    future<Tp> get(const Key& k) {
+        static_assert(ReloadEnabled == loading_cache_reload_enabled::yes, "reload must be enabled");
+
        // If caching is disabled - always load in the foreground
        if (!caching_enabled()) {
-            return _load(k);
+            return _load(k).then([] (Tp val) {
+                return make_ready_future<Tp>(std::move(val));
+            });
        }

-        // If the key is not in the cache yet, then find_or_create() is going to
-        // create a new uninitialized value in the map. If the value is already
-        // in the cache (the fast path) simply return the value. Otherwise, take
-        // the mutex and try to load the value (the slow path).
-        iterator ts_value_it = find_or_create(k);
-        if (*ts_value_it) {
-            return make_ready_future<Tp>(ts_value_it->value());
-        } else {
-            return slow_load(k);
-        }
+        return get_ptr(k).then([] (value_ptr v_ptr) {
+            return make_ready_future<Tp>(*v_ptr);
+        });
    }

    future<> stop() {
        return _timer_reads_gate.close().finally([this] { _timer.cancel(); });
    }

+    iterator find(const Key& k) noexcept {
+        return boost::make_transform_iterator(set_find(k), _value_extractor_fn);
+    }
+
+    iterator end() {
+        return boost::make_transform_iterator(_loading_values.end(), _value_extractor_fn);
+    }
+
+    iterator begin() {
+        return boost::make_transform_iterator(_loading_values.begin(), _value_extractor_fn);
+    }
+
+    template <typename Pred>
+    void remove_if(Pred&& pred) {
+        static_assert(std::is_same<bool, std::result_of_t<Pred(const value_type&)>>::value, "Bad Pred signature");
+
+        _lru_list.remove_and_dispose_if([this, &pred] (const ts_value_lru_entry& v) {
+            return pred(v.timestamped_value().value());
+        }, [this] (ts_value_lru_entry* p) {
+            loading_cache::destroy_ts_value(p);
+        });
+    }
+
+    size_t size() const {
+        return _loading_values.size();
+    }
+
+    /// \brief returns the memory size the currently cached entries occupy according to the EntrySize predicate.
+    size_t memory_footprint() const {
+        return _current_size;
+    }
+
 private:
+    set_iterator set_find(const Key& k) noexcept {
+        set_iterator it = _loading_values.find(k);
+        set_iterator end_it = set_end();
+
+        if (it == end_it || !it->ready()) {
+            return end_it;
+        }
+        return it;
+    }
+
+    set_iterator set_end() noexcept {
+        return _loading_values.end();
+    }
+
+    set_iterator set_begin() noexcept {
+        return _loading_values.begin();
+    }
+
    bool caching_enabled() const {
        return _expiry != std::chrono::milliseconds(0);
    }

-    /// Look for the entry with the given key. It it doesn't exist - create a new one and add it to the _set.
-    ///
-    /// \param k The key to look for
-    ///
-    /// \return An iterator to the value with the given key (always dirrerent from _set.end())
-    template <typename KeyType>
-    iterator find_or_create(KeyType&& k) {
-        iterator i = _set.find(k, Hash(), typename ts_value_type::key_eq());
-        if (i == _set.end()) {
-            ts_value_type* new_ts_val = Alloc().allocate(1);
-            new(new_ts_val) ts_value_type(_lru_list, std::forward<KeyType>(k));
-            auto p = _set.insert(*new_ts_val);
-            i = p.first;
-        }
-
-        return i;
-    }
-
-    static void destroy_ts_value(ts_value_type* val) {
-        val->~ts_value_type();
+    static void destroy_ts_value(ts_value_lru_entry* val) {
+        val->~ts_value_lru_entry();
        Alloc().deallocate(val, 1);
    }

-    future<Tp> slow_load(const Key& k) {
-        // If the key is not in the cache yet, then _write_mutex_map[k] is going
-        // to create a new value with the initialized mutex. The mutex is going
-        // to serialize the producers and only the first one is going to
-        // actually issue a load operation and initialize the value with the
-        // received result. The rest are going to see (and read) the initialized
-        // value when they enter the critical section.
-        shared_mutex sm = _write_mutex_map[k];
-        return with_semaphore(sm.get(), 1, [this, k] {
-            iterator ts_value_it = find_or_create(k);
-            if (*ts_value_it) {
-                return make_ready_future<Tp>(ts_value_it->value());
+    future<> reload(ts_value_lru_entry& lru_entry) {
+        return _load(lru_entry.key()).then_wrapped([this, key = lru_entry.key()] (auto&& f) mutable {
+            // if the entry has been evicted by now - simply end here
+            set_iterator it = this->set_find(key);
+            if (it == this->set_end()) {
+                this->_logger.trace("{}: entry was dropped during the reload", key);
+                return make_ready_future<>();
            }
-            _logger.trace("{}: storing the value for the first time", k);
-            return _load(k).then([this, k] (Tp t) {
-                // we have to "re-read" the _set here because the value may have been evicted by now
-                iterator ts_value_it = find_or_create(std::move(k));
-                *ts_value_it = std::move(t);
-                return make_ready_future<Tp>(ts_value_it->value());
-            });
-        }).finally([sm] {});
-    }

-    future<> reload(ts_value_type& ts_val) {
-        return _load(ts_val.key()).then_wrapped([this, &ts_val] (auto&& f) {
            // The exceptions are related to the load operation itself.
            // We should ignore them for the background reads - if
            // they persist the value will age and will be reloaded in
@@ -273,120 +440,97 @@ private:
            // will be propagated up to the user and will fail the
            // corresponding query.
            try {
-                ts_val = f.get0();
+                *it = f.get0();
            } catch (std::exception& e) {
-                _logger.debug("{}: reload failed: {}", ts_val.key(), e.what());
+                this->_logger.debug("{}: reload failed: {}", key, e.what());
            } catch (...) {
-                _logger.debug("{}: reload failed: unknown error", ts_val.key());
+                this->_logger.debug("{}: reload failed: unknown error", key);
            }
-        });
-    }

-    void erase(iterator it) {
-        _set.erase_and_dispose(it, [] (ts_value_type* ptr) { loading_cache::destroy_ts_value(ptr); });
-        // no need to delete the item from _lru_list - it's auto-deleted
+            return make_ready_future<>();
+        });
    }

    void drop_expired() {
        auto now = loading_cache_clock_type::now();
-        _lru_list.remove_and_dispose_if([now, this] (const ts_value_type& v) {
+        _lru_list.remove_and_dispose_if([now, this] (const ts_value_lru_entry& lru_entry) {
            using namespace std::chrono;
            // An entry should be discarded if it hasn't been reloaded for too long or nobody cares about it anymore
+            const ts_value_type& v = lru_entry.timestamped_value();
            auto since_last_read = now - v.last_read();
            auto since_loaded = now - v.loaded();
-            if (_expiry < since_last_read || _expiry < since_loaded) {
-                _logger.trace("drop_expired(): {}: dropping the entry: _expiry {},  ms passed since: loaded {} last_read {}", v.key(), _expiry.count(), duration_cast<milliseconds>(since_loaded).count(), duration_cast<milliseconds>(since_last_read).count());
+            if (_expiry < since_last_read || (ReloadEnabled == loading_cache_reload_enabled::yes && _expiry < since_loaded)) {
+                _logger.trace("drop_expired(): {}: dropping the entry: _expiry {},  ms passed since: loaded {} last_read {}", lru_entry.key(), _expiry.count(), duration_cast<milliseconds>(since_loaded).count(), duration_cast<milliseconds>(since_last_read).count());
                return true;
            }
            return false;
-        }, [this] (ts_value_type* p) {
-            erase(_set.iterator_to(*p));
+        }, [this] (ts_value_lru_entry* p) {
+            loading_cache::destroy_ts_value(p);
        });
    }

    // Shrink the cache to the _max_size discarding the least recently used items
    void shrink() {
-        if (_set.size() > _max_size) {
-            auto num_items_to_erase = _set.size() - _max_size;
-            for (size_t i = 0; i < num_items_to_erase; ++i) {
-                using namespace std::chrono;
-                ts_value_type& ts_val = *_lru_list.rbegin();
-                _logger.trace("shrink(): {}: dropping the entry: ms since last_read {}", ts_val.key(), duration_cast<milliseconds>(loading_cache_clock_type::now() - ts_val.last_read()).count());
-                erase(_set.iterator_to(ts_val));
-            }
+        while (_current_size > _max_size) {
+            using namespace std::chrono;
+            ts_value_lru_entry& lru_entry = *_lru_list.rbegin();
+            _logger.trace("shrink(): {}: dropping the entry: ms since last_read {}", lru_entry.key(), duration_cast<milliseconds>(loading_cache_clock_type::now() - lru_entry.timestamped_value().last_read()).count());
+            loading_cache::destroy_ts_value(&lru_entry);
        }
    }

-    void rehash() {
-        size_t new_buckets_count = 0;
-
-        // Don't grow or shrink too fast even if there is a steep drop/growth in the number of elements in the set.
-        // Exponential growth/backoff should be good enough.
-        //
-        // Try to keep the load factor between 0.25 and 1.0.
-        if (_set.size() < _current_buckets_count / 4) {
-            new_buckets_count = _current_buckets_count / 4;
-        } else if (_set.size() > _current_buckets_count) {
-            new_buckets_count = _current_buckets_count * 2;
+    // Try to bring the load factors of the _loading_values into a known range.
+    void periodic_rehash() noexcept {
+        try {
+            _loading_values.rehash();
+        } catch (...) {
+            // if rehashing fails - continue with the current buckets array
        }
-
-        if (new_buckets_count < initial_num_buckets || new_buckets_count > max_num_buckets) {
-            return;
-        }
-
-        std::vector<typename set_type::bucket_type> new_buckets(new_buckets_count);
-        _set.rehash(bi_set_bucket_traits(new_buckets.data(), new_buckets.size()));
-        _logger.trace("rehash(): buckets count changed: {} -> {}", _current_buckets_count, new_buckets_count);
-
-        _buckets.swap(new_buckets);
-        _current_buckets_count = new_buckets_count;
    }

    void on_timer() {
        _logger.trace("on_timer(): start");

-        auto timer_start_tp = loading_cache_clock_type::now();
-
-        // Clear all cached mutexes
-        _write_mutex_map.clear();
-
        // Clean up items that were not touched for the whole _expiry period.
        drop_expired();

-        // Remove the least recently used items if map is too big.
-        shrink();
-
        // check if rehashing is needed and do it if it is.
-        rehash();
+        periodic_rehash();
+
+        if (ReloadEnabled == loading_cache_reload_enabled::no) {
+            _logger.trace("on_timer(): rearming");
+            _timer.arm(loading_cache_clock_type::now() + _timer_period);
+            return;
+        }

        // Reload all those which vlaue needs to be reloaded.
-        with_gate(_timer_reads_gate, [this, timer_start_tp] {
-            return parallel_for_each(_set.begin(), _set.end(), [this, curr_time = timer_start_tp] (auto& ts_val) {
-                _logger.trace("on_timer(): {}: checking the value age", ts_val.key());
-                if (ts_val && ts_val.loaded() + _refresh < curr_time) {
-                    _logger.trace("on_timer(): {}: reloading the value", ts_val.key());
-                    return this->reload(ts_val);
+        with_gate(_timer_reads_gate, [this] {
+            return parallel_for_each(_lru_list.begin(), _lru_list.end(), [this] (ts_value_lru_entry& lru_entry) {
+                _logger.trace("on_timer(): {}: checking the value age", lru_entry.key());
+                if (lru_entry.timestamped_value().loaded() + _refresh < loading_cache_clock_type::now()) {
+                    _logger.trace("on_timer(): {}: reloading the value", lru_entry.key());
+                    return this->reload(lru_entry);
                }
                return now();
-            }).finally([this, timer_start_tp] {
+            }).finally([this] {
                _logger.trace("on_timer(): rearming");
-                _timer.arm(timer_start_tp + _refresh);
+                _timer.arm(loading_cache_clock_type::now() + _timer_period);
            });
        });
    }

-    std::vector<typename set_type::bucket_type> _buckets;
-    size_t _current_buckets_count = initial_num_buckets;
-    set_type _set;
-    write_mutex_map_type _write_mutex_map;
+    loading_values_type _loading_values;
    lru_list_type _lru_list;
-    size_t _max_size;
+    size_t _current_size = 0;
+    size_t _max_size = 0;
    std::chrono::milliseconds _expiry;
    std::chrono::milliseconds _refresh;
+    loading_cache_clock_type::duration _timer_period;
    logging::logger& _logger;
    std::function<future<Tp>(const Key&)> _load;
    timer<lowres_clock> _timer;
    seastar::gate _timer_reads_gate;
+    value_extractor_fn _value_extractor_fn;
 };

 }
--- a/utils/loading_shared_values.hh
+++ b/utils/loading_shared_values.hh
@@ -137,7 +137,11 @@ private:
    using set_type = bi::unordered_set<entry, bi::power_2_buckets<true>, bi::compare_hash<true>>;
    using bi_set_bucket_traits = typename set_type::bucket_traits;
    using set_iterator = typename set_type::iterator;
-    using value_extractor_fn = std::function<value_type& (entry&)>;
+    struct value_extractor_fn {
+        value_type& operator()(entry& e) const {
+            return e.value();
+        }
+    };
    enum class shrinking_is_allowed { no, yes };

 public:
@@ -186,7 +190,6 @@ public:
    loading_shared_values()
        : _buckets(InitialBucketsCount)
        , _set(bi_set_bucket_traits(_buckets.data(), _buckets.size()))
-        , _value_extractor_fn([] (entry& e) -> value_type& { return e.value(); })
    {
        static_assert(noexcept(Stats::inc_evictions()), "Stats::inc_evictions must be non-throwing");
        static_assert(noexcept(Stats::inc_hits()), "Stats::inc_hits must be non-throwing");
--- a/utils/logalloc.cc
+++ b/utils/logalloc.cc
@@ -117,6 +117,7 @@ public:
    void reclaim_all_free_segments();
    occupancy_stats region_occupancy();
    occupancy_stats occupancy();
+    size_t non_lsa_used_space();
    void set_reclamation_step(size_t step_in_segments) { _reclamation_step = step_in_segments; }
    size_t reclamation_step() const { return _reclamation_step; }
    void enable_abort_on_bad_alloc() { _abort_on_bad_alloc = true; }
@@ -153,6 +154,10 @@ occupancy_stats tracker::occupancy() {
    return _impl->occupancy();
 }

+size_t tracker::non_lsa_used_space() const {
+    return _impl->non_lsa_used_space();
+}
+
 void tracker::full_compaction() {
    return _impl->full_compaction();
 }
@@ -291,7 +296,7 @@ static inline bool can_allocate_more_memory(size_t size)
 class segment_zone : public bi::set_base_hook<>, public bi::slist_base_hook<> {
    struct free_segment : public bi::slist_base_hook<> { };

-    static constexpr size_t maximum_size = 256;
+    static constexpr size_t maximum_size = max_zone_segments;
    static constexpr size_t minimum_size = 16;
    static thread_local size_t next_attempt_size;

@@ -574,10 +579,8 @@ size_t segment_pool::reclaim_segments(size_t target) {
    bi::slist<segment_zone> zones_to_remove;
    for (auto& zone : _all_zones | boost::adaptors::reversed) {
        if (zone.empty()) {
-            if (reclaimed_segments < target || !zone.free_segment_count()) {
-                reclaimed_segments += zone.free_segment_count();
-                zones_to_remove.push_front(zone);
-            }
+            reclaimed_segments += zone.free_segment_count();
+            zones_to_remove.push_front(zone);
        } else if (zone.free_segment_count()) {
            _free_segments_in_zones += zone.free_segment_count();
            zone.rebuild_free_segments_list();
@@ -1681,6 +1684,11 @@ occupancy_stats tracker::impl::occupancy() {
    return occ;
 }

+size_t tracker::impl::non_lsa_used_space() {
+    auto free_space_in_zones = shard_segment_pool.free_segments_in_zones() * segment_size;
+    return memory::stats().allocated_memory() - region_occupancy().total_space() - free_space_in_zones;
+}
+
 void tracker::impl::reclaim_all_free_segments()
 {
    llogger.debug("Reclaiming all free segments");
@@ -2013,11 +2021,8 @@ tracker::impl::impl() {
        sm::make_gauge("large_objects_total_space_bytes", [this] { return shard_segment_pool.non_lsa_memory_in_use(); },
                       sm::description("Holds a current size of allocated non-LSA memory.")),

-        sm::make_gauge("non_lsa_used_space_bytes",
-            [this] {
-                auto free_space_in_zones = shard_segment_pool.free_segments_in_zones() * segment_size;
-                return memory::stats().allocated_memory() - region_occupancy().total_space() - free_space_in_zones;
-            }, sm::description("Holds a current amount of used non-LSA memory.")),
+        sm::make_gauge("non_lsa_used_space_bytes", [this] { return non_lsa_used_space(); },
+                       sm::description("Holds a current amount of used non-LSA memory.")),

        sm::make_gauge("free_space_in_zones", [this] { return shard_segment_pool.free_segments_in_zones() * segment_size; },
                       sm::description("Holds a current amount of free memory in zones.")),
--- a/utils/logalloc.hh
+++ b/utils/logalloc.hh
@@ -43,6 +43,7 @@ class allocating_section;

 constexpr int segment_size_shift = 18; // 256K; see #151, #152
 constexpr size_t segment_size = 1 << segment_size_shift;
+constexpr size_t max_zone_segments = 256;

 //
 // Frees some amount of objects from the region to which it's attached.
@@ -455,6 +456,9 @@ public:
    // Returns statistics for all segments allocated by LSA on this shard.
    occupancy_stats occupancy();

+    // Returns amount of allocated memory not managed by LSA
+    size_t non_lsa_used_space() const;
+
    impl& get_impl() { return *_impl; }

    // Set the minimum number of segments reclaimed during single reclamation cycle.
				`@@ -1 +0,0 @@`
				`options raid0 devices_discard_performance=Y`