Compare commits

...

62 Commits

Author SHA1 Message Date
Gleb Natapov
2d630e068b mutation_query_test: add test for result size calculation
Check that digest-only and digest+data queries calculate the same result
size.

Message-Id: <20180906153800.GK2326@scylladb.com>
(cherry picked from commit 9e438933a2)
2018-09-08 18:55:23 +03:00
Gleb Natapov
5a8e9698d8 mutation_partition: accurately account for result size in digest only queries
When measuring_output_stream is used to calculate a result element's size,
it incorrectly takes into account not only the serialized element size, but
also a placeholder that the ser::qr_partition__rows/qr_partition__static_row__cells
constructors put at the beginning. Fix it by taking the starting point in the
stream before element serialization and subtracting it afterwards.
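The fix can be illustrated with a minimal sketch (the class and function names here are illustrative, not Scylla's actual serialization code): the element's size is the difference between the stream position after and before serializing it, so placeholder bytes written earlier by enclosing constructors are never attributed to the element.

```cpp
#include <cstddef>
#include <string_view>

// Toy byte-counting stream standing in for measuring_output_stream.
struct measuring_output_stream {
    std::size_t _size = 0;
    void write(std::string_view bytes) { _size += bytes.size(); }
    std::size_t position() const { return _size; }
};

// Returns the size of just this element, no matter what was written before.
std::size_t serialize_and_measure(measuring_output_stream& out, std::string_view element) {
    const std::size_t start = out.position(); // starting point before serialization
    out.write(element);
    return out.position() - start;            // subtract afterwards
}
```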

Fixes #3755

Message-Id: <20180906153609.GJ2326@scylladb.com>
(cherry picked from commit d7674288a9)
2018-09-08 18:55:23 +03:00
Gleb Natapov
64f1aa8d99 mutation_partition: correctly measure static row size when doing digest calculation
The code uses the wrong output stream when only a digest is requested,
and thus computes an incorrect data size. Failing to correctly account
for the static row size while calculating a digest may cause a digest
mismatch between the digest and data queries.

Fixes #3753.

Message-Id: <20180905131219.GD2326@scylladb.com>
(cherry picked from commit 98092353df)
2018-09-06 16:51:31 +03:00
Eliran Sinvani
280e6eedb9 cql3: ensure repeated values in IN clauses don't return repeated rows
When the IN list for a single column contains duplicate values,
multiple executors are activated, since the assumption is that each
value in the IN list corresponds to a different partition. This results
in the same row appearing in the result once per duplicate of the
partition value.
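The idea behind the fix can be sketched as follows (a hedged illustration with invented names, not the actual cql3 code): deduplicate the IN-list values before fanning out one per-partition read per value, so a repeated value cannot yield the same row twice.

```cpp
#include <algorithm>
#include <vector>

// Collapse duplicate IN-list values so each surviving value maps to at most
// one per-partition read. Sorting first lets std::unique remove all repeats.
std::vector<int> distinct_in_values(std::vector<int> values) {
    std::sort(values.begin(), values.end());
    values.erase(std::unique(values.begin(), values.end()), values.end());
    return values;
}
```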

Added queries to the IN-restriction unit test and fixed a bad result check.

Fixes #2837
Tests: queries as in the use case from the GitHub issue, in both prepared
and plain form (using the Python driver); unit tests.

Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>
Message-Id: <ad88b7218fa55466be7bc4303dc50326a3d59733.1534322238.git.eliransin@scylladb.com>
(cherry picked from commit d734d316a6)
2018-08-26 15:52:18 +03:00
Tomasz Grabiec
f80f15a6af Merge 'Fix multi-cell static list updates in the presence of ckeys' from Duarte
Fixes a regression introduced in
9e88b60ef5, which broke the lookup for
prefetched values of lists when a clustering key is specified.

This is the code that was removed from some list operations:

 std::experimental::optional<clustering_key> row_key;
 if (!column.is_static()) {
   row_key = clustering_key::from_clustering_prefix(*params._schema, prefix);
 }
 ...
 auto&& existing_list = params.get_prefetched_list(m.key().view(), row_key, column);

Put it back, in the form of common code in the update_parameters class.

Fixes #3703

* https://github.com/duarten/scylla cql-list-fixes/v1:
  tests/cql_query_test: Test multi-cell static list updates with ckeys
  cql3/lists: Fix multi-cell static list updates in the presence of ckeys
  keys: Add factory for an empty clustering_key_prefix_view

(cherry picked from commit 6937cc2d1c)
2018-08-21 17:37:36 +01:00
Duarte Nunes
d0eb0c0b90 cql3/query_options: Use _value_views in prepare()
_value_views is the authoritative data structure for the
client-specified values. Indeed, the ctor that calls
transport::request::read_options() leaves _values completely empty.

In query_options::prepare(), however, we were using _values to
associate values with the client-specified column names, rather than
_value_views. Fix this by using _value_views instead.

As for the reasons we didn't see this bug earlier, I assume it's
because very few drivers set the 0x04 query options flag, which means
column names are omitted. This is the right thing to do since most
drivers have enough information to correctly position the values.

Fixes #3688

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20180814234605.14775-1-duarte@scylladb.com>
(cherry picked from commit a4355fe7e7)
2018-08-21 18:24:06 +03:00
Jesse Haber-Kucharsky
1427c4d428 auth: Don't use unsupported hashing algorithms
In previous versions of Fedora, the `crypt_r` function returned
`nullptr` when a requested hashing algorithm was not supported.

This is consistent with the documentation of the function in its man
page.

As of Fedora 28, the function's behavior changes so that the encrypted
text is not `nullptr` on error, but instead the string "*0".

The info pages for `crypt_r` clarify somewhat (and contradict the man
pages):

    Some implementations return `NULL` on failure, and others return an
    _invalid_ hashed passphrase, which will begin with a `*` and will
    not be the same as SALT.

Because of this change of behavior, users running Scylla on a Fedora 28
machine which was upgraded from a previous release would not be able to
authenticate: an unsupported hashing algorithm would be selected,
producing encrypted text that did not match the entry in the table.

With this change, unsupported algorithms are correctly detected and
users should be able to continue to authenticate themselves.
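A minimal sketch of such a validity check (an illustration, not the patch itself), covering both failure conventions quoted from the info pages above:

```cpp
#include <string_view>

// A crypt()/crypt_r() result is unusable if it is NULL (old-style failure)
// or an invalid hashed passphrase beginning with '*' (e.g. "*0" on Fedora 28).
// Both cases must be treated as "hashing algorithm unsupported".
bool crypt_result_is_valid(const char* encrypted) {
    if (encrypted == nullptr) {
        return false;                          // old-style failure: NULL
    }
    std::string_view v{encrypted};
    return !v.empty() && v.front() != '*';     // new-style failure: "*0", ...
}
```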

Fixes #3637.

Signed-off-by: Jesse Haber-Kucharsky <jhaberku@scylladb.com>
Message-Id: <bcd708f3ec195870fa2b0d147c8910fb63db7e0e.1533322594.git.jhaberku@scylladb.com>
(cherry picked from commit fce10f2c6e)
2018-08-05 10:30:58 +03:00
Gleb Natapov
034f2cb42d cache_hitrate_calculator: fix race when new table is added during calculations
The calculation consists of several parts with preemption points between
them, so a table can be added while a calculation is ongoing. Do not
assume that the table exists in the intermediate data structure.

Fixes #3636

Message-Id: <20180801093147.GD23569@scylladb.com>
(cherry picked from commit 44a6afad8c)
2018-08-01 14:30:58 +03:00
Amos Kong
e043a5c276 scylla_setup: fix conditional statement of silent mode
Commit 300af65555 introduced a problem in a conditional statement:
in silent mode, the script would always abort, regardless of the
return value.

Fixes #3485

Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <1c12ab04651352964a176368f8ee28f19ae43c68.1528077114.git.amos@scylladb.com>
(cherry picked from commit 364c2551c8)
2018-07-25 12:34:11 +03:00
Takuya ASADA
5da9bd3a6e dist/common/scripts/scylla_setup: abort running script when one of setup failed in silent mode
The current script silently continues even when one of the setup steps
fails; it needs to abort instead.

Fixes #3433

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180522180355.1648-1-syuu@scylladb.com>
(cherry picked from commit 300af65555)
2018-07-25 12:34:11 +03:00
Avi Kivity
3578027e2e Merge "row_cache: Fix violation of continuity on concurrent eviction and population" from Tomasz
"
The problem happens under the following circumstances:

  - we have a partially populated partition in cache, with a gap in the middle

  - a read with no clustering restrictions trying to populate that gap

  - eviction of the entry for the lower bound of the gap concurrent with population

The population may incorrectly mark the range before the gap as continuous.
This may result in temporary loss of writes in that clustering range. The
problem heals once the cache is cleared.

Caught by row_cache_test::test_concurrent_reads_and_eviction, which has been
failing sporadically.

The problem is in ensure_population_lower_bound(), which returns true if
current clustering range covers all rows, which means that the populator has a
right to set continuity flag to true on the row it inserts. This is correct
only if the current population range actually starts before all
clustering rows. Otherwise, we're populating since _last_row and should
consult it.

Fixes #3608.
"

* 'tgrabiec/fix-violation-of-continuity-on-concurrent-read-and-eviction' of github.com:tgrabiec/scylla:
  row_cache: Fix violation of continuity on concurrent eviction and population
  position_in_partition: Introduce is_before_all_clustered_rows()

(cherry picked from commit 31151cadd4)
2018-07-25 12:34:11 +03:00
Shlomi Livne
7d2150a057 release: prepare for 2.1.6
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2018-07-01 22:35:26 +03:00
Avi Kivity
afd3c571cc Merge "Backport Disable sstable filtering based on min/max clustering key components" to 2.1" from Tomasz
"
Changes made:
  - switched the test to use do_with_cql_env_thread due to lack of SEASTAR_TEST_CASE_THREAD macro
  - imported make_local_key() from master, needed for the database_test to pass
"

* tag 'tgrabiec/disable-min-max-sstable-filtering-v1-branch-2.1' of github.com:tgrabiec/scylla:
  Merge "Disable sstable filtering based on min/max clustering key components" from Tomasz
  tests: simple_schema: Generate local keys from make_pkeys()
  tests: Import make_local_key() from master
2018-06-28 12:41:00 +03:00
Avi Kivity
093c8512db Merge "Disable sstable filtering based on min/max clustering key components" from Tomasz
"
With DateTiered and TimeWindow, there is a read optimization enabled
which excludes sstables based on overlap with recorded min/max values
of clustering key components. The problem is that it doesn't take into
account partition tombstones and static rows, which should still be
returned by the reader even if there is no overlap in the query's
clustering range. A read which returns no clustering rows can
mispopulate the cache, which will appear as a partition deletion or
writes to the static row being lost, until node restart or eviction of
the partition entry.

There is also a bad interaction between cache population on read and
that optimization. When the clustering range of the query doesn't
overlap with any sstable, the reader will return no partition markers
for the read, which leads the cache populator to assume there is no
partition in the sstables, so it will cache an empty partition. This will
cause later reads of that partition to miss prior writes to it until it
is evicted from the cache or the node is restarted.

Disable until a more elaborate fix is implemented.

Fixes #3552
Fixes #3553
"

* tag 'tgrabiec/disable-min-max-sstable-filtering-v1' of github.com:tgrabiec/scylla:
  tests: Add test for slicing a mutation source with date tiered compaction strategy
  tests: Check that database conforms to mutation source
  database: Disable sstable filtering based on min/max clustering key components

(cherry picked from commit e1efda8b0c)
2018-06-28 11:10:41 +02:00
Tomasz Grabiec
9c0b8ec736 tests: simple_schema: Generate local keys from make_pkeys()
Extracted from commit 2b0b703615
2018-06-28 11:10:41 +02:00
Tomasz Grabiec
1794b732b0 tests: Import make_local_key() from master
Imported from master at 8a25bd467c69df94ea3f3638b42d36beee20adf0
2018-06-28 11:10:41 +02:00
Avi Kivity
c1ac4fb8b0 Update seastar submodule
* seastar 2a2c1d2...c89c8b8 (1):
  > tests/test-utils: Add macro for running tests within a seastar thread

Needed for tests in the following patch.
2018-06-28 10:00:05 +03:00
Asias He
2e7e59fb50 gossip: Fix tokens assignment in assassinate_endpoint
The tokens vector is defined a few lines above and is needed outside the
if block.

Do not redefine it inside the if block; otherwise, the tokens vector will
be empty.
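The bug class can be reproduced in a few lines (a toy illustration with invented names, not the actual gossip code): the redefinition inside the if block shadows the outer vector, so everything pushed there is lost when the block ends.

```cpp
#include <string>
#include <vector>

// BUG version: the inner declaration shadows the outer `tokens`,
// so the outer vector is always returned empty.
std::vector<std::string> collect_tokens_buggy(bool have_tokens) {
    std::vector<std::string> tokens;
    if (have_tokens) {
        std::vector<std::string> tokens; // shadows the outer vector
        tokens.push_back("t1");          // lost when the block ends
    }
    return tokens; // always empty
}

// FIXED version: assign to the vector defined above the if block.
std::vector<std::string> collect_tokens_fixed(bool have_tokens) {
    std::vector<std::string> tokens;
    if (have_tokens) {
        tokens.push_back("t1");
    }
    return tokens;
}
```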

Found by code inspection.

Fixes #3551.

Message-Id: <c7a06375c65c950e94236571127f533e5a60cbfd.1530002177.git.asias@scylladb.com>
(cherry picked from commit c3b5a2ecd5)
2018-06-27 12:00:58 +03:00
Vladimir Krivopalov
af29d4bed3 Fix Scylla compilation with Crypto++ v6.
In Crypto++ v6, the `byte` typedef has been moved from the global
namespace to the `CryptoPP::` namespace.

This fix brings in the CryptoPP namespace so that the `byte` typedef is
seen with both old and new versions of Crypto++.

Fixes #3252.

Signed-off-by: Vladimir Krivopalov <vladimir@scylladb.com>
Message-Id: <799d055be710231884d101a52c0be8ed8b0a9806.1520125889.git.vladimir@scylladb.com>
(cherry picked from commit 99bd5180ba)
2018-06-25 17:49:32 +03:00
Shlomi Livne
72494bbe05 release: prepare for 2.1.5
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2018-06-19 09:05:55 +03:00
Avi Kivity
5784823888 Update scylla-ami submodule
* dist/ami/files/scylla-ami c5d9e96...0df779d (1):
  > scylla_install_ami: Update CentOS to latest version

Fixes #3523.
2018-06-17 12:12:21 +03:00
Takuya ASADA
a7633be1a9 Revert "dist/ami: update CentOS base image to latest version"
This reverts commit 69d226625a.
Since ami-4bf3d731 is a Marketplace AMI, it is not possible to publish a
public AMI based on it.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180523112414.27307-1-syuu@scylladb.com>
(cherry picked from commit 55d6be9254)
2018-06-17 11:33:55 +03:00
Takuya ASADA
e78ded74ce dist/debian: add --jobs <njobs> option just like build_rpm.sh
In some build environments we may want to limit the number of parallel
jobs: ninja-build runs ncpus jobs by default, which may be too many since
g++ consumes a lot of memory.
So support --jobs <njobs> just like the rpm build script.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180425205439.30053-1-syuu@scylladb.com>
(cherry picked from commit 782ebcece4)
2018-06-14 15:05:09 +03:00
Avi Kivity
6615c2a6a9 database: stop using incremental selectors
There is a bug in incremental_selector for partitioned_sstable_set, so
until it is found, stop using it.

This degrades scan performance of Leveled Compaction Strategy tables.

Fixes #3513. (as a workaround)
Introduced: 2.1
Message-Id: <20180613131547.19084-1-avi@scylladb.com>

(cherry picked from commit aeffbb6732)
2018-06-14 10:52:39 +03:00
Vlad Zolotarov
11500ccd3a locator::ec2_multi_region_snitch: don't call for ec2_snitch::gossiper_starting()
ec2_snitch::gossiper_starting() calls the base class (default) method,
which sets _gossip_started to true and thereby prevents the subsequent
reconnectable_snitch_helper registration.

Fixes #3454

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1528208520-28046-1-git-send-email-vladz@scylladb.com>
(cherry picked from commit 2dde372ae6)
2018-06-14 10:52:39 +03:00
Shlomi Livne
955f3eeb56 release: prepare for 2.1.4
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2018-06-06 11:27:01 +03:00
Avi Kivity
08bfd96774 Update seastar submodule
* seastar 675acd5...2a2c1d2 (1):
  > tls: Ensure handshake always drains output before return/throw

Fixes #3461.
2018-05-31 12:06:13 +03:00
Mika Eloranta
f6c4d558eb build: fix rpm build script --jobs N handling
Fixes argument misquoting in the $SRPM_OPTS expansion for the mock
commands and makes the --jobs argument work as intended.

Signed-off-by: Mika Eloranta <mel@aiven.io>
Message-Id: <20180113212904.85907-1-mel@aiven.io>
(cherry picked from commit 7266446227)
2018-05-27 10:25:26 +03:00
Avi Kivity
0040ff6de2 Update seastar submodule
* seastar 0e6dcd5...675acd5 (1):
  > net/tls: Wait for output to be sent when shutting down

Fixes #3459.
2018-05-24 12:03:10 +03:00
Glauber Costa
c238bc7a81 commitlog: don't move pointer to segment
We are currently moving the pointer we acquired to the segment inside
the lambda in which we'll handle the cycle.

The problem is, we also use that same pointer inside the exception
handler. If an exception happens we'll access it and we'll crash.
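A hedged illustration of the bug class (all names here are invented, not the commitlog code): moving the shared pointer into the lambda leaves the local variable null, so the error path that still reads through it would dereference a null pointer. Capturing a copy keeps both references valid.

```cpp
#include <memory>
#include <string>

struct segment { std::string name = "seg-0"; };

// Capture the pointer BY COPY, not with std::move(seg), so `seg` stays
// usable in the error-handling path below the lambda's creation.
std::string cycle_fixed(std::shared_ptr<segment> seg) {
    auto handler = [seg] { return seg->name; };
    if (!seg) {
        // With `[seg = std::move(seg)]` above, we would land here (or crash
        // on a dereference) because the local `seg` would already be null.
        return "lost segment";
    }
    return handler();
}
```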

Probably #3440.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20180518125820.10726-1-glauber@scylladb.com>
(cherry picked from commit 596a525950)
2018-05-19 19:13:58 +03:00
Avi Kivity
3b984a4293 dist: redhat: get rid of raid0.devices_discard_performance
This parameter is not available on recent Red Hat kernels or on
non-Red Hat kernels (it was removed in 3.10.0-772.el7,
RHBZ 1455932). The presence of the parameter on kernels that don't
support it causes the module load to fail, with the result that the
storage is not available.

Fix by removing the parameter. For someone running an older Red Hat
kernel the effect will be that discard is disabled, but they can fix
that by updating the kernel. For someone running a newer kernel, the
effect will be that they can access their data.

Fixes #3437.
Message-Id: <20180516134913.6540-1-avi@scylladb.com>

(cherry picked from commit 3b8118d4e5)
2018-05-19 19:13:58 +03:00
Takuya ASADA
156761d77e dist/ami: update CentOS base image to latest version
Since we require an updated version of systemd, we need to update the
CentOS base image.

Fixes #3184

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1518118694-23770-1-git-send-email-syuu@scylladb.com>

Conflicts:
	dist/ami/build_ami.sh

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180508083521.18661-1-syuu@scylladb.com>
2018-05-19 19:13:58 +03:00
Avi Kivity
8e33e80ad3 release: prepare for 2.1.3 2018-04-25 09:01:30 +03:00
Duarte Nunes
c35dd86c87 db/schema_tables: Only drop UDTs after merging tables
Dropping a user type requires that all tables using that type also be
dropped. However, a type may appear to be dropped at the same time as
a table, for instance due to the order in which a node receives schema
notifications, or when dropping a keyspace.

When dropping a table, if we build a schema in a shard through a
global_schema_pointer, then we'll check for the existence of any user
type the schema employs. We thus need to ensure types are only dropped
after tables, similarly to how it's done for keyspaces.

Fixes #3068

Tests: unit-tests (release)

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20180129114137.85149-1-duarte@scylladb.com>
(cherry picked from commit 1e3fae5bef)
2018-04-25 01:15:25 +03:00
Pekka Enberg
87cb8a1fa4 release: prepare for 2.1.2 2018-04-17 09:45:00 +03:00
Takuya ASADA
26f3340c32 dist/debian: use ~root as HOME to place .pbuilderrc
When 'always_set_home' is specified in /etc/sudoers, pbuilder won't read
.pbuilderrc from the current user's home directory, and we have no way to
change that behavior via a sudo command parameter.

So let's use ~root/.pbuilderrc and switch to HOME=/root when running under
sudo; this works in environments both with and without always_set_home.

Fixes #3366

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1523926024-3937-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit ace44784e8)
2018-04-17 09:39:15 +03:00
Avi Kivity
aaba093371 Update seastar submodule
* seastar af1b789...0e6dcd5 (1):
  > tls: Ensure we always pass through semaphores on shutdown

Fixes #3358.
2018-04-14 20:52:02 +03:00
Gleb Natapov
a64c6e6be9 cql_server: fix a race between closing of a connection and notifier registration
There is a race between cql connection closure and notifier
registration. If a connection is closed before notification registration
completes, a stale pointer to the connection will remain in the
notification list, since the attempt to unregister the connection happens
too early. The fix is to move notifier unregistration to after the
connection's gate is closed, which ensures that there is no outstanding
registration request. But this means that a connection with a closed gate
can now be in the notifier list, so with_gate() may throw and abort the
notifier loop. Fix that by replacing with_gate() with a call to is_closed().

Fixes: #3355
Tests: unit(release)

Message-Id: <20180412134744.GB22593@scylladb.com>
(cherry picked from commit 1a9aaece3e)
2018-04-12 16:57:18 +03:00
Duarte Nunes
c83d2d0d77 db/view: Reject view entries with non-composite, empty partition key
Empty partition keys are not supported on normal tables - they cannot
be inserted or queried (surprisingly, the rules for composite
partition keys are different: all components are then allowed to be
empty). However, the (non-composite) partition key of a view could end
up being empty if that column is a base table regular column, a base
table clustering key column, or a base table partition key column that
is part of a composite key.

Fixes #3262
Refs CASSANDRA-14345

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20180403122244.10626-1-duarte@scylladb.com>
(cherry picked from commit ec8960df45)
2018-04-03 19:08:38 +03:00
Asias He
0aa49d0311 gossip: Relax generation max difference check
start node 1 2 3
shutdown node2
shutdown node1 and node3
start node1 and node3
nodetool removenode node2
clean up all scylla data on node2
bootstrap node2 as a new node

I saw that node2 could not bootstrap; it was stuck forever waiting for schema information to complete:

On node1, node3

    [shard 0] gossip - received an invalid gossip generation for peer 127.0.0.2; local generation = 2, received generation = 1521779704

On node2

    [shard 0] storage_service - JOINING: waiting for schema information to complete

This is because, during the nodetool removenode operation, the generation of node2 was increased from 0 to 2:

   gossiper::advertise_removing() calls eps.get_heart_beat_state().force_newer_generation_unsafe();
   gossiper::advertise_token_removed() calls eps.get_heart_beat_state().force_newer_generation_unsafe();

Each force_newer_generation_unsafe() call increases the generation by 1.

Here is an example,

Before nodetool removenode:
```
curl -X GET --header "Accept: application/json" "http://127.0.0.1:10000/failure_detector/endpoints/" | python -mjson.tool
   {
   "addrs": "127.0.0.2",
   "generation": 0,
   "is_alive": false,
   "update_time": 1521778757334,
   "version": 0
   },
```

After nodetool removenode:
```
curl -X GET --header "Accept: application/json" "http://127.0.0.1:10000/failure_detector/endpoints/" | python -mjson.tool
 {
     "addrs": "127.0.0.2",
     "application_state": [
         {
             "application_state": 0,
             "value": "removed,146b52d5-dc94-4e35-b7d4-4f64be0d2672,1522038476246",
             "version": 214
         },
         {
             "application_state": 6,
             "value": "REMOVER,14ecc9b0-4b88-4ff3-9c96-38505fb4968a",
             "version": 153
            }
     ],
     "generation": 2,
     "is_alive": false,
     "update_time": 1521779276246,
     "version": 0
 },
```

In gossiper::apply_state_locally, we have this check:

```
if (local_generation != 0 && remote_generation > local_generation + MAX_GENERATION_DIFFERENCE) {
    // assume some peer has corrupted memory and is broadcasting an unbelievable generation about another peer (or itself)
  logger.warn("received an invalid gossip generation for peer {}; local generation = {}, received generation = {}",ep, local_generation, remote_generation);

}
```
to skip the gossip update.

To fix, we relax the generation max-difference check to allow the
generation of a removed node.

After this patch, the removed node bootstraps successfully.
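A sketch of the relaxed check (the constant and the exact rule here are assumptions for illustration, not the verbatim patch): since generations are derived from epoch seconds on a healthy node, the remote generation can be validated against the receiving node's wall clock instead of against an artificially small stored local generation such as the 2 produced by removenode.

```cpp
#include <cstdint>

// Assumed one-year window, mirroring the MAX_GENERATION_DIFFERENCE constant
// quoted in the check above.
constexpr int64_t MAX_GENERATION_DIFFERENCE = 86400LL * 365;

// Accept a remote generation if it sits within a bounded window around the
// local wall-clock time (in epoch seconds), rather than comparing it to the
// possibly tiny stored local generation.
bool accept_remote_generation(int64_t remote_generation, int64_t local_time_secs) {
    return remote_generation <= local_time_secs + MAX_GENERATION_DIFFERENCE;
}
```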

Tests: dtest:update_cluster_layout_tests.py
Fixes #3331

Message-Id: <678fb60f6b370d3ca050c768f705a8f2fd4b1287.1522289822.git.asias@scylladb.com>
(cherry picked from commit f539e993d3)
2018-04-03 19:08:38 +03:00
Shlomi Livne
cce455b1f5 release: prepare for 2.1.1
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2018-03-25 09:32:02 +03:00
Avi Kivity
6772f3806b tests: mutation_source_test: fix scattering of partition tombstone
The partition tombstone is not part of a mutation_fragment in the old
streamed_mutation, so it was not scattered correctly by fragment_scatterer.
This causes test failures if the mutations to be scattered have a partition
tombstone.

Fix by calling consume(tombstone) directly. This isn't nice, but the code
is dead anyway.
2018-03-24 15:15:02 +03:00
Avi Kivity
6c9d699835 Merge "Fix abort during counter table read-on-delete" from Tomasz
"
This fixes an abort in an sstable reader when querying a partition with no
clustering ranges (happens on counter table mutation with no live rows) which
also doesn't have any static columns. In such case, the
sstable_mutation_reader will setup the data_consume_context such that it only
covers the static row of the partition, knowing that there is no need to read
any clustered rows. See partition.cc::advance_to_upper_bound(). Later when
the reader is done with the range for the static row, it will try to skip to
the first clustering range (missing in this case). If clustering_ranges_walker
tells us to skip to after_all_clustering_rows(), we will hit an assert inside
continuous_data_consumer::fast_forward_to() due to attempt to skip past the
original data file range. If clustering_ranges_walker returns
before_all_clustering_rows() instead, all is fine because we're still at the
same data file position.

Fixes #3304.
"

* 'tgrabiec/fix-counter-read-no-static-columns' of github.com:scylladb/seastar-dev:
  tests: mutation_source_test: Test reads with no clustering ranges and no static columns
  tests: simple_schema: Allow creating schema with no static column
  clustering_ranges_walker: Stop after static row in case no clustering ranges

(cherry picked from commit 054854839a)
2018-03-23 10:47:23 +03:00
Vlad Zolotarov
a75e1632c8 test.py: limit the tests to run on 2 shards with 4GB of memory
Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
(cherry picked from commit 57a6ed5aaa)
2018-03-22 12:45:25 +02:00
Jesse Haber-Kucharsky
c5718bf620 auth: Fix improper sharing of sharded service
This change is backported from 092f2e659c.

Previously, the sharded permissions cache was only accessible to the
implementation of `auth::service` in `auth/service.cc`. The intention
was that invoking `auth::service::get_permissions` on shard `k` would
query the cache on shard `k`, which would in turn depend on
`auth::service` on shard k to check for superuser status.

The problem is in `auth::service::start`.

`seastar::sharded<auth::permissions_cache>::start` is invoked with
`*this` of shard 0, causing all instances of the cache to reference the
same object.

I wasn't able to locally reproduce errors or crashes due to this bug
when I compiled a release build of Scylla. However, running a debug
build meant that the glorious `seastar::debug_shared_ptr_counter_type`
quickly saved the day with its checks that `seastar::shared_ptr` isn't
being misused.

To eliminate this problem, we move ownership of a single instance of
`auth::permissions_cache` to a single instance of `auth::service`. When
`auth::service` is sharded, so is the permissions cache.

I verified interactively that no assertions failed in debug mode with
this change.

Fixes #3296.

Tests: unit (debug, release)
Signed-off-by: Jesse Haber-Kucharsky <jhaberku@scylladb.com>
Message-Id: <280a889f551180db1c00d8a80eddf85b2ff0ac60.1521696176.git.jhaberku@scylladb.com>
2018-03-22 10:04:50 +02:00
Duarte Nunes
2315fcd6cf gms/gossiper: Synchronize endpoint state destruction
In gossiper::handle_major_state_change() we set the endpoint_state for
a particular endpoint and replicate the changes to other cores.

This is totally unsynchronized with the execution of
gossiper::evict_from_membership(), which can happen concurrently, and
can remove the very same endpoint from the map  (in all cores).

Replicating the changes to other cores in handle_major_state_change()
can interleave with replicating the changes to other cores in
evict_from_membership(), and result in an undefined final state.

Another issue happened in debug mode dtests, where a fiber executes
handle_major_state_change(), calls into the subscribers, of which
storage_service is one, and ultimately lands on
storage_service::update_peer_info(), which iterates over the
endpoint's application state with deferring points in between (to
update a system table). gossiper::evict_from_membership() was executed
concurrently by another fiber, which freed the state the first one is
iterating over.

Fixes #3299.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20180318123211.3366-1-duarte@scylladb.com>
(cherry picked from commit 810db425a5)
2018-03-18 14:55:32 +02:00
Asias He
8c5464d2fd range_streamer: Stream 10% of ranges instead of 10 ranges per time
If there are a lot of ranges, e.g., num_tokens=2048, 10 ranges per
stream plan will cause tons of stream plans to be created, each carrying
very little data. This gives each stream plan low transfer bandwidth,
so the total time to complete the streaming increases.

It makes more sense to send a percentage of the total ranges per stream
plan than a fixed number of ranges.
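The percentage-based batching can be sketched as follows (the exact rounding and the minimum of one range are assumptions, not the verbatim patch):

```cpp
#include <algorithm>
#include <cstddef>

// Stream roughly 10% of the total ranges per stream plan instead of a fixed
// 10, so a high-token-count node builds a handful of larger plans rather
// than dozens of tiny ones.
std::size_t ranges_per_stream_plan(std::size_t total_ranges) {
    return std::max<std::size_t>(1, total_ranges / 10);
}
```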

Here is an example to stream a keyspace with 513 ranges in
total, 10 ranges v.s. 10% ranges:

Before:
[shard 0] range_streamer - Bootstrap with 127.0.0.1 for
keyspace=system_traces, 510 out of 513 ranges: ranges = 51
[shard 0] range_streamer - Bootstrap with ks for keyspace=127.0.0.1
succeeded, took 107 seconds

After:
[shard 0] range_streamer - Bootstrap with 127.0.0.1 for
keyspace=system_traces, 510 out of 513 ranges: ranges = 10
[shard 0] range_streamer - Bootstrap with ks for keyspace=127.0.0.1
succeeded, took 22 seconds

Message-Id: <a890b84fbac0f3c3cc4021e30dbf4cdf135b93ea.1520992228.git.asias@scylladb.com>
(cherry picked from commit 9b5585ebd5)
2018-03-14 10:13:01 +02:00
Asias He
346d2788e3 Revert "streaming: Do not abort session too early in idle detection"
This reverts commit f792c78c96.

With the "Use range_streamer everywhere" (7217b7ab36) series,
all users of streaming now stream relatively small ranges
and can retry streaming at a higher level.

This reduces the time-to-recover from 5 hours to 10 minutes per stream
session.

Even if the 10-minute idle detection might cause more false positives,
that is fine, since we can retry the "small" stream session anyway. In
the long term, we should replace the whole idle-detection logic with one
where the stream slave goes away whenever the stream initiator goes away.

Message-Id: <75f308baf25a520d42d884c7ef36f1aecb8a64b0.1520992219.git.asias@scylladb.com>
(cherry picked from commit ad7b132188)
2018-03-14 10:12:59 +02:00
Avi Kivity
4f68fede6d Merge "Make reader concurrency dual-restricted by count and memory" from Botond
"
Refs #2692
Fixes #3246

The current restricting algorithm [1] restricts the active-reader queue
based on the memory consumption of the existing active readers. When
this memory consumption is above the limit new readers are not admitted.
The inactive reader queue on the other hand has a fixed length.
This caused performance regressions on two workloads:
* read-only: since the inactive-reader queue length is severly limited
  (compared to the previous situation) reads will timeout at loads
  comfortably handled before.
* mixed: since the memory consumption happens only at admission time
  (already created active readers are not limited) memory consumption
  growed significantly causing problems when compactions kicked in.

The solution is to reintroduce the old limit of 100 active concurrent
user-reads while still keeping the memory-based limit as well. For
workloads that don't consume a lot of memory or on large boxes with lots
of memory the count-based limit will be reached which is reverting to the
old well-known behaviour. For memory-hungry workloads or on small boxes
with little memory the memory based-limit will kick in sooner avoiding
memory overconsumption.

[1] introduced by bdbbfe9390
"

* 'restricted-reader-dual-limit/v3-backport-2.1' of https://github.com/denesb/scylla:
  Modify unit tests so that they test the dual-limits
  Use the reader_concurrency_semaphore to limit reader concurrency
  Add reader_concurrency_semaphore
  Add reader_resource_tracker param to mutation_source
  mv reader_resource_tracker.hh -> reader_concurrency_semaphore.hh
2018-03-08 19:10:06 +02:00
Botond Dénes
681f9e4f50 Modify unit tests so that they test the dual-limits 2018-03-08 18:54:16 +02:00
Botond Dénes
c503bc7693 Use the reader_concurrency_semaphore to limit reader concurrency 2018-03-08 18:54:15 +02:00
Botond Dénes
de7024251b Add reader_concurrency_semaphore
This semaphore implements the new dual, count and memory based active
reader limiting. As purely memory-based limiting proved to cause
problems on big boxes admitting a large number of readers (more than any
disk could handle) the previous count-based limit is reintroduced in
addition to the existing memory-based limit.
When creating new readers, the count-based limit is checked first. If
that clears, the memory limit is checked before admitting the reader.
reader_concurrency_semaphore wraps the two semaphores that implement
these limits and enforces the correct order of limit checking.
This class also completely replaces the restricted_reader_config struct;
it encapsulates all data and related functionality of the latter, making
client code simpler.
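The admission order described above (count first, then memory) can be sketched as a minimal dual-limit semaphore. The struct and field names below are illustrative assumptions, not the actual reader_concurrency_semaphore API:

```cpp
#include <cassert>
#include <cstddef>

// Illustrative sketch: admission consults the count-based limit first;
// only if that clears is the memory-based limit checked, so whichever
// limit is exhausted first blocks the new reader.
struct dual_limit_semaphore {
    size_t count_avail;   // remaining reader slots (e.g. 100 for user reads)
    size_t memory_avail;  // remaining memory budget, in bytes

    // Try to admit a reader that will consume `mem` bytes.
    bool try_admit(size_t mem) {
        if (count_avail == 0) {
            return false;          // count limit reached
        }
        if (memory_avail < mem) {
            return false;          // memory limit reached
        }
        --count_avail;
        memory_avail -= mem;
        return true;
    }

    void release(size_t mem) {
        ++count_avail;
        memory_avail += mem;
    }
};
```

On a large box the memory budget is generous, so the count slot pool runs out first (the old behaviour); on a small box the memory budget runs out first, limiting admission sooner.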
2018-03-08 18:54:15 +02:00
Botond Dénes
9a0eb2319c Add reader_resource_tracker param to mutation_source
Soon, reader_resource_tracker will only be constructible after the
reader has been admitted. This means that the resource tracker cannot be
preconstructed and just captured by the lambda stored in the mutation
source; instead it has to be passed in along with the other parameters.
2018-03-08 18:54:12 +02:00
Botond Dénes
9ef462449b mv reader_resource_tracker.hh -> reader_concurrency_semaphore.hh
In preparation to reader_concurrency_semaphore being added to the file.
The reader_resource_tracker is really only a helper class for
reader_concurrency_semaphore so the latter is better suited to provide
the name of the file.
2018-03-08 15:34:48 +02:00
Amnon Heiman
6271f30716 dist/docker: Add support for housekeeping
This patch takes a modified version of the Ubuntu 14.04 housekeeping
service script and uses it in Docker to validate the current version.

To disable the version validation, pass the --disable-version-check flag
when running the container.

Message-Id: <20180220161231.1630-1-amnon@scylladb.com>
(cherry picked from commit edcfab3262)
2018-03-07 16:17:13 +02:00
Takuya ASADA
8b64e80c88 dist/debian: install scylla-housekeeping upstart script correctly on Ubuntu 14.04
Since we split the scylla-housekeeping service into two different services for systemd, we no longer share the same service name between systemd and upstart.
So handle it independently for each distribution, and install
/etc/init/scylla-housekeeping.conf on Ubuntu 14.04.

Fixes #3239

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1519852659-10688-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 101e909483)
2018-03-07 16:16:36 +02:00
Amnon Heiman
c5bffcaa68 scylla-housekeeping: need to support both Debian/Ubuntu variations
Debian and Ubuntu list files come in two variations.
The housekeeping scripts should support both.

This patch changes the regexp that matches the OS in the repository file.
After the introduction of the second list variation, the OS name can appear in the middle of the path, not only at the end.
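A minimal sketch of the class of change being described; the helper and patterns below are illustrative assumptions, not the actual housekeeping script. An end-anchored match misses the second list variation where the OS name sits mid-path, so an unanchored search is needed:

```cpp
#include <cassert>
#include <regex>
#include <string>

// Illustrative: match the OS name anywhere between path separators,
// not only as the final path component.
bool repo_matches_os(const std::string& repo_line, const std::string& os) {
    // An end-anchored pattern like ".*/<os>$" only matches the old layout.
    // An unanchored search for "/<os>/" or "/<os>" at the end covers both.
    std::regex unanchored("(^|/)" + os + "(/|$)");
    return std::regex_search(repo_line, unanchored);
}
```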

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <20180227092543.19538-1-amnon@scylladb.com>
(cherry picked from commit 57d46c6959)
2018-03-07 16:15:54 +02:00
Tomasz Grabiec
8aa0b60e91 tests: cache: Fix invalidate() not being waited for
Probably responsible for occasional failures of the subsequent assertion.
Didn't manage to reproduce.

Message-Id: <1520330967-584-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit d9f0c1f097)
2018-03-06 12:17:16 +02:00
Asias He
dccf762654 storage_service: Add missing return in pieces empty check
If pieces is empty, it is bogus to access pieces[0]:

   sstring move_name = pieces[0];

Fix by adding the missing return.
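The bug class can be sketched as follows; the helper is illustrative, not the storage_service code itself:

```cpp
#include <cassert>
#include <string>
#include <vector>

// Illustrative: reading pieces[0] without first returning when the
// vector is empty is undefined behaviour; the guard must bail out.
std::string first_piece_or_empty(const std::vector<std::string>& pieces) {
    if (pieces.empty()) {
        return "";                       // the missing early return
    }
    std::string move_name = pieces[0];   // now guaranteed in-bounds
    return move_name;
}
```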

Spotted by Vlad Zolotarov <vladz@scylladb.com>

Fixes #3258
Message-Id: <bcb446f34f953bc51c3704d06630b53fda82e8d2.1520297558.git.asias@scylladb.com>

(cherry picked from commit 8900e830a3)
2018-03-06 09:58:21 +02:00
Tomasz Grabiec
e5344079d9 intrusive_set_external_comparator: Fix _header having undefined color on move
swap_tree() doesn't change the color of the header, and because the
header was not initialized, its color is undefined (it can be either red
or black). One problem this causes is that algo::is_header() expects the
header to always be red. It is used by unlink(), which would
infinite-loop for trees with a black header.

The fix is to initialize the header.
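A minimal sketch of why initialization fixes this; the types below are illustrative, not the boost::intrusive internals:

```cpp
#include <cassert>

// Illustrative: algorithms assume the header node is always red.
// Leaving the colour member uninitialized means a freshly constructed
// header may be observed as black, breaking that invariant; a default
// member initializer removes the indeterminate state.
enum class colour { red, black };

struct tree_header {
    colour c = colour::red;  // the fix: initialize instead of leaving indeterminate
};

bool is_header(const tree_header& h) {
    return h.c == colour::red;  // invariant relied on by unlink()
}
```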

Fixes #3242.

Message-Id: <1519815091-13111-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 30635510a2)
2018-02-28 13:57:33 +02:00
Paweł Dziepak
7bc8515c48 tests/cql3: increase TTL to avoid spurious failures
The test inserts some values with a TTL of 1 second and then
reads them back, expecting them not to have expired yet. That may not
always be the case if the machine is slow and we are running in
debug mode. Increasing the TTLs 100x should help avoid these
false positives.

Message-Id: <20180219133816.17452-1-pdziepak@scylladb.com>
(cherry picked from commit d97eebe82d)
2018-02-22 14:14:41 +00:00
Duarte Nunes
1228a41eaa cql3/query_processor: Remove prepared statements upon dropping a view
Fixes #3198

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20180209143652.31852-1-duarte@scylladb.com>
(cherry picked from commit d757c87107)
2018-02-22 14:11:08 +00:00
63 changed files with 1137 additions and 447 deletions

View File

@@ -1,6 +1,6 @@
#!/bin/sh
VERSION=2.1.0
VERSION=2.1.6
if test -f version
then

View File

@@ -141,7 +141,9 @@ static sstring gensalt() {
// blowfish 2011 fix, blowfish, sha512, sha256, md5
for (sstring pfx : { "$2y$", "$2a$", "$6$", "$5$", "$1$" }) {
salt = pfx + input;
if (crypt_r("fisk", salt.c_str(), &tlcrypt)) {
const char* e = crypt_r("fisk", salt.c_str(), &tlcrypt);
if (e && (e[0] != '*')) {
prefix = pfx;
return salt;
}

View File

@@ -89,10 +89,6 @@ class permissions_cache final {
public:
explicit permissions_cache(const permissions_cache_config&, service&, logging::logger&);
future<> start() {
return make_ready_future<>();
}
future <> stop() {
return _cache.stop();
}

View File

@@ -24,7 +24,6 @@
#include <map>
#include <seastar/core/future-util.hh>
#include <seastar/core/sharded.hh>
#include <seastar/core/shared_ptr.hh>
#include "auth/allow_all_authenticator.hh"
@@ -86,8 +85,6 @@ private:
void on_drop_view(const sstring& ks_name, const sstring& view_name) override {}
};
static sharded<permissions_cache> sharded_permissions_cache{};
static db::consistency_level consistency_for_user(const sstring& name) {
if (name == meta::DEFAULT_SUPERUSER_NAME) {
return db::consistency_level::QUORUM;
@@ -130,7 +127,8 @@ service::service(
::service::migration_manager& mm,
std::unique_ptr<authorizer> a,
std::unique_ptr<authenticator> b)
: _cache_config(std::move(c))
: _permissions_cache_config(std::move(c))
, _permissions_cache(nullptr)
, _qp(qp)
, _migration_manager(mm)
, _authorizer(std::move(a))
@@ -240,10 +238,12 @@ future<> service::start() {
return make_ready_future<>();
}).then([this] {
return when_all_succeed(_authorizer->start(), _authenticator->start());
}).then([this] {
_permissions_cache = std::make_unique<permissions_cache>(_permissions_cache_config, *this, log);
}).then([this] {
return once_among_shards([this] {
_migration_manager.register_listener(_migration_listener.get());
return sharded_permissions_cache.start(std::ref(_cache_config), std::ref(*this), std::ref(log));
return make_ready_future<>();
});
});
}
@@ -251,7 +251,9 @@ future<> service::start() {
future<> service::stop() {
return once_among_shards([this] {
_delayed.cancel_all();
return sharded_permissions_cache.stop();
return make_ready_future<>();
}).then([this] {
return _permissions_cache->stop();
}).then([this] {
return when_all_succeed(_authorizer->stop(), _authenticator->stop());
});
@@ -335,7 +337,7 @@ future<> service::delete_user(const sstring& name) {
}
future<permission_set> service::get_permissions(::shared_ptr<authenticated_user> u, data_resource r) const {
return sharded_permissions_cache.local().get(std::move(u), std::move(r));
return _permissions_cache->get(std::move(u), std::move(r));
}
//

View File

@@ -60,7 +60,8 @@ struct service_config final {
};
class service final {
permissions_cache_config _cache_config;
permissions_cache_config _permissions_cache_config;
std::unique_ptr<permissions_cache> _permissions_cache;
cql3::query_processor& _qp;

View File

@@ -60,6 +60,7 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
// - _next_row_in_range = _next.position() < _upper_bound
// - _last_row points at a direct predecessor of the next row which is going to be read.
// Used for populating continuity.
// - _population_range_starts_before_all_rows is set accordingly
reading_from_underlying,
end_of_stream
@@ -96,6 +97,13 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
partition_snapshot_row_cursor _next_row;
bool _next_row_in_range = false;
// True iff current population interval, since the previous clustering row, starts before all clustered rows.
// We cannot just look at _lower_bound, because emission of range tombstones changes _lower_bound and
// because we mark clustering intervals as continuous when consuming a clustering_row, it would prevent
// us from marking the interval as continuous.
// Valid when _state == reading_from_underlying.
bool _population_range_starts_before_all_rows;
future<> do_fill_buffer();
void copy_from_cache_to_buffer();
future<> process_static_row();
@@ -225,6 +233,7 @@ inline
future<> cache_flat_mutation_reader::do_fill_buffer() {
if (_state == state::move_to_underlying) {
_state = state::reading_from_underlying;
_population_range_starts_before_all_rows = _lower_bound.is_before_all_clustered_rows(*_schema);
auto end = _next_row_in_range ? position_in_partition(_next_row.position())
: position_in_partition(_upper_bound);
return _read_context->fast_forward_to(position_range{_lower_bound, std::move(end)}).then([this] {
@@ -348,7 +357,7 @@ future<> cache_flat_mutation_reader::read_from_underlying() {
inline
void cache_flat_mutation_reader::maybe_update_continuity() {
if (can_populate() && (!_ck_ranges_curr->start() || _last_row.refresh(*_snp))) {
if (can_populate() && (_population_range_starts_before_all_rows || _last_row.refresh(*_snp))) {
if (_next_row.is_in_latest_version()) {
clogger.trace("csm {}: mark {} continuous", this, _next_row.get_iterator_in_latest_version()->position());
_next_row.get_iterator_in_latest_version()->set_continuous(true);
@@ -387,6 +396,7 @@ inline
void cache_flat_mutation_reader::maybe_add_to_cache(const clustering_row& cr) {
if (!can_populate()) {
_last_row = nullptr;
_population_range_starts_before_all_rows = false;
_read_context->cache().on_mispopulate();
return;
}
@@ -417,6 +427,7 @@ void cache_flat_mutation_reader::maybe_add_to_cache(const clustering_row& cr) {
with_allocator(standard_allocator(), [&] {
_last_row = partition_snapshot_row_weakref(*_snp, it);
});
_population_range_starts_before_all_rows = false;
});
}

View File

@@ -70,7 +70,7 @@ public:
{
if (!with_static_row) {
if (_current == _end) {
_current_start = _current_end = position_in_partition_view::after_all_clustered_rows();
_current_start = position_in_partition_view::before_all_clustered_rows();
} else {
_current_start = position_in_partition_view::for_range_start(*_current);
_current_end = position_in_partition_view::for_range_end(*_current);

View File

@@ -209,19 +209,18 @@ void query_options::prepare(const std::vector<::shared_ptr<column_specification>
}
auto& names = *_names;
std::vector<cql3::raw_value> ordered_values;
std::vector<cql3::raw_value_view> ordered_values;
ordered_values.reserve(specs.size());
for (auto&& spec : specs) {
auto& spec_name = spec->name->text();
for (size_t j = 0; j < names.size(); j++) {
if (names[j] == spec_name) {
ordered_values.emplace_back(_values[j]);
ordered_values.emplace_back(_value_views[j]);
break;
}
}
}
_values = std::move(ordered_values);
fill_value_views();
_value_views = std::move(ordered_values);
}
void query_options::fill_value_views()

View File

@@ -606,6 +606,7 @@ void query_processor::migration_subscriber::on_drop_aggregate(const sstring& ks_
}
void query_processor::migration_subscriber::on_drop_view(const sstring& ks_name, const sstring& view_name) {
remove_invalid_prepared_statements(ks_name, view_name);
}
void query_processor::migration_subscriber::remove_invalid_prepared_statements(

View File

@@ -202,6 +202,14 @@ public:
const query_options& options,
gc_clock::time_point now) const override;
virtual std::vector<bytes_opt> values_raw(const query_options& options) const = 0;
virtual std::vector<bytes_opt> values(const query_options& options) const override {
std::vector<bytes_opt> ret = values_raw(options);
std::sort(ret.begin(),ret.end());
ret.erase(std::unique(ret.begin(),ret.end()),ret.end());
return ret;
}
#if 0
@Override
protected final boolean isSupportedBy(SecondaryIndex index)
@@ -224,7 +232,7 @@ public:
return abstract_restriction::term_uses_function(_values, ks_name, function_name);
}
virtual std::vector<bytes_opt> values(const query_options& options) const override {
virtual std::vector<bytes_opt> values_raw(const query_options& options) const override {
std::vector<bytes_opt> ret;
for (auto&& v : _values) {
ret.emplace_back(to_bytes_opt(v->bind_and_get(options)));
@@ -249,7 +257,7 @@ public:
return false;
}
virtual std::vector<bytes_opt> values(const query_options& options) const override {
virtual std::vector<bytes_opt> values_raw(const query_options& options) const override {
auto&& lval = dynamic_pointer_cast<multi_item_terminal>(_marker->bind(options));
if (!lval) {
throw exceptions::invalid_request_exception("Invalid null value for IN restriction");

View File

@@ -53,6 +53,9 @@ update_parameters::get_prefetched_list(
return {};
}
if (column.is_static()) {
ckey = clustering_key_view::make_empty();
}
auto i = _prefetched->rows.find(std::make_pair(std::move(pkey), std::move(ckey)));
if (i == _prefetched->rows.end()) {
return {};

View File

@@ -328,9 +328,13 @@ filter_sstable_for_reader(std::vector<sstables::shared_sstable>&& sstables, colu
};
sstables.erase(boost::remove_if(sstables, sstable_has_not_key), sstables.end());
// FIXME: Workaround for https://github.com/scylladb/scylla/issues/3552
// and https://github.com/scylladb/scylla/issues/3553
const bool filtering_broken = true;
// no clustering filtering is applied if schema defines no clustering key or
// compaction strategy thinks it will not benefit from such an optimization.
if (!schema->clustering_key_size() || !cf.get_compaction_strategy().use_clustering_key_filter()) {
if (filtering_broken || !schema->clustering_key_size() || !cf.get_compaction_strategy().use_clustering_key_filter()) {
return sstables;
}
::cf_stats* stats = cf.cf_stats();
@@ -512,9 +516,9 @@ column_family::make_sstable_reader(schema_ptr s,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) const {
auto& config = service::get_local_streaming_read_priority().id() == pc.id()
? _config.streaming_read_concurrency_config
: _config.read_concurrency_config;
auto* semaphore = service::get_local_streaming_read_priority().id() == pc.id()
? _config.streaming_read_concurrency_semaphore
: _config.read_concurrency_semaphore;
// CAVEAT: if make_sstable_reader() is called on a single partition
// we want to optimize and read exactly this partition. As a
@@ -526,37 +530,39 @@ column_family::make_sstable_reader(schema_ptr s,
return make_empty_flat_reader(s); // range doesn't belong to this shard
}
if (config.resources_sem) {
auto ms = mutation_source([&config, sstables=std::move(sstables), this] (
if (semaphore) {
auto ms = mutation_source([semaphore, this, sstables=std::move(sstables)] (
schema_ptr s,
const dht::partition_range& pr,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) {
mutation_reader::forwarding fwd_mr,
reader_resource_tracker tracker) {
return create_single_key_sstable_reader(const_cast<column_family*>(this), std::move(s), std::move(sstables),
_stats.estimated_sstable_per_read, pr, slice, pc, reader_resource_tracker(config.resources_sem), std::move(trace_state), fwd, fwd_mr);
_stats.estimated_sstable_per_read, pr, slice, pc, tracker, std::move(trace_state), fwd, fwd_mr);
});
return make_restricted_flat_reader(config, std::move(ms), std::move(s), pr, slice, pc, std::move(trace_state), fwd, fwd_mr);
return make_restricted_flat_reader(*semaphore, std::move(ms), std::move(s), pr, slice, pc, std::move(trace_state), fwd, fwd_mr);
} else {
return create_single_key_sstable_reader(const_cast<column_family*>(this), std::move(s), std::move(sstables),
_stats.estimated_sstable_per_read, pr, slice, pc, no_resource_tracking(), std::move(trace_state), fwd, fwd_mr);
}
} else {
if (config.resources_sem) {
auto ms = mutation_source([&config, sstables=std::move(sstables)] (
if (semaphore) {
auto ms = mutation_source([semaphore, sstables=std::move(sstables)] (
schema_ptr s,
const dht::partition_range& pr,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) {
mutation_reader::forwarding fwd_mr,
reader_resource_tracker tracker) {
return make_local_shard_sstable_reader(std::move(s), std::move(sstables), pr, slice, pc,
reader_resource_tracker(config.resources_sem), std::move(trace_state), fwd, fwd_mr);
tracker, std::move(trace_state), fwd, fwd_mr);
});
return make_restricted_flat_reader(config, std::move(ms), std::move(s), pr, slice, pc, std::move(trace_state), fwd, fwd_mr);
return make_restricted_flat_reader(*semaphore, std::move(ms), std::move(s), pr, slice, pc, std::move(trace_state), fwd, fwd_mr);
} else {
return make_local_shard_sstable_reader(std::move(s), std::move(sstables), pr, slice, pc,
no_resource_tracking(), std::move(trace_state), fwd, fwd_mr);
@@ -2001,6 +2007,18 @@ database::database(const db::config& cfg)
, _memtable_cpu_controller(make_flush_cpu_controller(*_cfg, &_background_writer_scheduling_group, [this, limit = 2.0f * _dirty_memory_manager.throttle_threshold()] {
return (_dirty_memory_manager.virtual_dirty_memory()) / limit;
}))
, _read_concurrency_sem(max_count_concurrent_reads,
max_memory_concurrent_reads(),
_cfg->read_request_timeout_in_ms() * 1ms,
max_inactive_queue_length(),
[this] {
++_stats->sstable_read_queue_overloaded;
return std::make_exception_ptr(std::runtime_error("sstable inactive read queue overloaded"));
})
// No timeouts or queue length limits - a failure here can kill an entire repair.
// Trust the caller to limit concurrency.
, _streaming_concurrency_sem(max_count_streaming_concurrent_reads, max_memory_streaming_concurrent_reads())
, _system_read_concurrency_sem(max_count_system_concurrent_reads, max_memory_system_concurrent_reads())
, _version(empty_version)
, _compaction_manager(std::make_unique<compaction_manager>())
, _enable_incremental_backups(cfg.incremental_backups())
@@ -2132,11 +2150,11 @@ database::setup_metrics() {
sm::description("Counts the number of times the sstable read queue was overloaded. "
"A non-zero value indicates that we have to drop read requests because they arrive faster than we can serve them.")),
sm::make_gauge("active_reads", [this] { return _stats->active_reads; },
sm::make_gauge("active_reads", [this] { return max_count_concurrent_reads - _read_concurrency_sem.available_resources().count; },
sm::description("Holds the number of currently active read operations. "),
{user_label_instance}),
sm::make_gauge("active_reads_memory_consumption", [this] { return max_memory_concurrent_reads() - _read_concurrency_sem.available_units(); },
sm::make_gauge("active_reads_memory_consumption", [this] { return max_memory_concurrent_reads() - _read_concurrency_sem.available_resources().memory; },
sm::description(seastar::format("Holds the amount of memory consumed by currently active read operations. "
"If this value gets close to {} we are likely to start dropping new read requests. "
"In that case sstable_read_queue_overloads is going to get a non-zero value.", max_memory_concurrent_reads())),
@@ -2146,12 +2164,12 @@ database::setup_metrics() {
sm::description("Holds the number of currently queued read operations."),
{user_label_instance}),
sm::make_gauge("active_reads", [this] { return _stats->active_reads_streaming; },
sm::make_gauge("active_reads", [this] { return max_count_streaming_concurrent_reads - _streaming_concurrency_sem.available_resources().count; },
sm::description("Holds the number of currently active read operations issued on behalf of streaming "),
{streaming_label_instance}),
sm::make_gauge("active_reads_memory_consumption", [this] { return max_memory_streaming_concurrent_reads() - _streaming_concurrency_sem.available_units(); },
sm::make_gauge("active_reads_memory_consumption", [this] { return max_memory_streaming_concurrent_reads() - _streaming_concurrency_sem.available_resources().memory; },
sm::description(seastar::format("Holds the amount of memory consumed by currently active read operations issued on behalf of streaming "
"If this value gets close to {} we are likely to start dropping new read requests. "
"In that case sstable_read_queue_overloads is going to get a non-zero value.", max_memory_streaming_concurrent_reads())),
@@ -2161,11 +2179,11 @@ database::setup_metrics() {
sm::description("Holds the number of currently queued read operations on behalf of streaming."),
{streaming_label_instance}),
sm::make_gauge("active_reads", [this] { return _stats->active_reads_system_keyspace; },
sm::make_gauge("active_reads", [this] { return max_count_system_concurrent_reads - _system_read_concurrency_sem.available_resources().count; },
sm::description("Holds the number of currently active read operations from \"system\" keyspace tables. "),
{system_label_instance}),
sm::make_gauge("active_reads_memory_consumption", [this] { return max_memory_system_concurrent_reads() - _system_read_concurrency_sem.available_units(); },
sm::make_gauge("active_reads_memory_consumption", [this] { return max_memory_system_concurrent_reads() - _system_read_concurrency_sem.available_resources().memory; },
sm::description(seastar::format("Holds the amount of memory consumed by currently active read operations from \"system\" keyspace tables. "
"If this value gets close to {} we are likely to start dropping new read requests. "
"In that case sstable_read_queue_overloads is going to get a non-zero value.", max_memory_system_concurrent_reads())),
@@ -2647,8 +2665,8 @@ keyspace::make_column_family_config(const schema& s, const db::config& db_config
cfg.enable_cache = _config.enable_cache;
cfg.dirty_memory_manager = _config.dirty_memory_manager;
cfg.streaming_dirty_memory_manager = _config.streaming_dirty_memory_manager;
cfg.read_concurrency_config = _config.read_concurrency_config;
cfg.streaming_read_concurrency_config = _config.streaming_read_concurrency_config;
cfg.read_concurrency_semaphore = _config.read_concurrency_semaphore;
cfg.streaming_read_concurrency_semaphore = _config.streaming_read_concurrency_semaphore;
cfg.cf_stats = _config.cf_stats;
cfg.enable_incremental_backups = _config.enable_incremental_backups;
cfg.background_writer_scheduling_group = _config.background_writer_scheduling_group;
@@ -3386,18 +3404,8 @@ database::make_keyspace_config(const keyspace_metadata& ksm) {
}
cfg.dirty_memory_manager = &_dirty_memory_manager;
cfg.streaming_dirty_memory_manager = &_streaming_dirty_memory_manager;
cfg.read_concurrency_config.resources_sem = &_read_concurrency_sem;
cfg.read_concurrency_config.active_reads = &_stats->active_reads;
cfg.read_concurrency_config.timeout = _cfg->read_request_timeout_in_ms() * 1ms;
cfg.read_concurrency_config.max_queue_length = 100;
cfg.read_concurrency_config.raise_queue_overloaded_exception = [this] {
++_stats->sstable_read_queue_overloaded;
throw std::runtime_error("sstable inactive read queue overloaded");
};
// No timeouts or queue length limits - a failure here can kill an entire repair.
// Trust the caller to limit concurrency.
cfg.streaming_read_concurrency_config.resources_sem = &_streaming_concurrency_sem;
cfg.streaming_read_concurrency_config.active_reads = &_stats->active_reads_streaming;
cfg.read_concurrency_semaphore = &_read_concurrency_sem;
cfg.streaming_read_concurrency_semaphore = &_streaming_concurrency_sem;
cfg.cf_stats = &_cf_stats;
cfg.enable_incremental_backups = _enable_incremental_backups;
@@ -4234,16 +4242,14 @@ flat_mutation_reader make_local_shard_sstable_reader(schema_ptr s,
}
return reader;
};
return make_combined_reader(s, std::make_unique<incremental_reader_selector>(s,
std::move(sstables),
pr,
slice,
pc,
std::move(resource_tracker),
std::move(trace_state),
fwd,
fwd_mr,
std::move(reader_factory_fn)),
auto all_readers = boost::copy_range<std::vector<flat_mutation_reader>>(
*sstables->all()
| boost::adaptors::transformed([&] (sstables::shared_sstable sst) -> flat_mutation_reader {
return reader_factory_fn(sst, pr);
})
);
return make_combined_reader(s,
std::move(all_readers),
fwd,
fwd_mr);
}
@@ -4261,16 +4267,14 @@ flat_mutation_reader make_range_sstable_reader(schema_ptr s,
auto reader_factory_fn = [s, &slice, &pc, resource_tracker, fwd, fwd_mr] (sstables::shared_sstable& sst, const dht::partition_range& pr) {
return sst->read_range_rows_flat(s, pr, slice, pc, resource_tracker, fwd, fwd_mr);
};
return make_combined_reader(s, std::make_unique<incremental_reader_selector>(s,
std::move(sstables),
pr,
slice,
pc,
std::move(resource_tracker),
std::move(trace_state),
fwd,
fwd_mr,
std::move(reader_factory_fn)),
auto sstable_readers = boost::copy_range<std::vector<flat_mutation_reader>>(
*sstables->all()
| boost::adaptors::transformed([&] (sstables::shared_sstable sst) {
return reader_factory_fn(sst, pr);
})
);
return make_combined_reader(s,
std::move(sstable_readers),
fwd,
fwd_mr);
}

View File

@@ -79,7 +79,7 @@
#include "utils/phased_barrier.hh"
#include "cpu_controller.hh"
#include "dirty_memory_manager.hh"
#include "reader_resource_tracker.hh"
#include "reader_concurrency_semaphore.hh"
class cell_locker;
class cell_locker_stats;
@@ -296,8 +296,8 @@ public:
bool enable_incremental_backups = false;
::dirty_memory_manager* dirty_memory_manager = &default_dirty_memory_manager;
::dirty_memory_manager* streaming_dirty_memory_manager = &default_dirty_memory_manager;
restricted_mutation_reader_config read_concurrency_config;
restricted_mutation_reader_config streaming_read_concurrency_config;
reader_concurrency_semaphore* read_concurrency_semaphore;
reader_concurrency_semaphore* streaming_read_concurrency_semaphore;
::cf_stats* cf_stats = nullptr;
seastar::thread_scheduling_group* background_writer_scheduling_group = nullptr;
seastar::thread_scheduling_group* memtable_scheduling_group = nullptr;
@@ -954,8 +954,8 @@ public:
bool enable_incremental_backups = false;
::dirty_memory_manager* dirty_memory_manager = &default_dirty_memory_manager;
::dirty_memory_manager* streaming_dirty_memory_manager = &default_dirty_memory_manager;
restricted_mutation_reader_config read_concurrency_config;
restricted_mutation_reader_config streaming_read_concurrency_config;
reader_concurrency_semaphore* read_concurrency_semaphore;
reader_concurrency_semaphore* streaming_read_concurrency_semaphore;
::cf_stats* cf_stats = nullptr;
seastar::thread_scheduling_group* background_writer_scheduling_group = nullptr;
seastar::thread_scheduling_group* memtable_scheduling_group = nullptr;
@@ -1041,10 +1041,17 @@ public:
using timeout_clock = lowres_clock;
private:
::cf_stats _cf_stats;
static const size_t max_count_concurrent_reads{100};
static size_t max_memory_concurrent_reads() { return memory::stats().total_memory() * 0.02; }
// Assume a queued read takes up 10kB of memory, and allow 2% of memory to be filled up with such reads.
static size_t max_inactive_queue_length() { return memory::stats().total_memory() * 0.02 / 10000; }
// They're rather heavyweight, so limit more
static const size_t max_count_streaming_concurrent_reads{10};
static size_t max_memory_streaming_concurrent_reads() { return memory::stats().total_memory() * 0.02; }
static const size_t max_count_system_concurrent_reads{10};
static size_t max_memory_system_concurrent_reads() { return memory::stats().total_memory() * 0.02; };
static constexpr size_t max_concurrent_sstable_loads() { return 3; }
struct db_stats {
uint64_t total_writes = 0;
uint64_t total_writes_failed = 0;
@@ -1053,10 +1060,6 @@ private:
uint64_t total_reads_failed = 0;
uint64_t sstable_read_queue_overloaded = 0;
uint64_t active_reads = 0;
uint64_t active_reads_streaming = 0;
uint64_t active_reads_system_keyspace = 0;
uint64_t short_data_queries = 0;
uint64_t short_mutation_queries = 0;
};
@@ -1073,11 +1076,9 @@ private:
seastar::thread_scheduling_group _background_writer_scheduling_group;
flush_cpu_controller _memtable_cpu_controller;
semaphore _read_concurrency_sem{max_memory_concurrent_reads()};
semaphore _streaming_concurrency_sem{max_memory_streaming_concurrent_reads()};
restricted_mutation_reader_config _read_concurrency_config;
semaphore _system_read_concurrency_sem{max_memory_system_concurrent_reads()};
restricted_mutation_reader_config _system_read_concurrency_config;
reader_concurrency_semaphore _read_concurrency_sem;
reader_concurrency_semaphore _streaming_concurrency_sem;
reader_concurrency_semaphore _system_read_concurrency_sem;
semaphore _sstable_load_concurrency_sem{max_concurrent_sstable_loads()};
@@ -1245,7 +1246,7 @@ public:
std::unordered_set<sstring> get_initial_tokens();
std::experimental::optional<gms::inet_address> get_replace_address();
bool is_replacing();
semaphore& system_keyspace_read_concurrency_sem() {
reader_concurrency_semaphore& system_keyspace_read_concurrency_sem() {
return _system_read_concurrency_sem;
}
semaphore& sstable_load_concurrency_sem() {

View File

@@ -718,7 +718,7 @@ public:
*/
auto me = shared_from_this();
auto fp = _file_pos;
return _pending_ops.wait_for_pending(timeout).then([me = std::move(me), fp, timeout] {
return _pending_ops.wait_for_pending(timeout).then([me, fp, timeout] {
if (fp != me->_file_pos) {
// some other request already wrote this buffer.
// If so, wait for the operation at our intended file offset

View File

@@ -64,8 +64,11 @@
#include "db/config.hh"
#include "md5_hasher.hh"
#include <seastar/util/noncopyable_function.hh>
#include <boost/algorithm/string/predicate.hpp>
#include <boost/range/algorithm/copy.hpp>
#include <boost/range/algorithm/transform.hpp>
#include <boost/range/adaptor/map.hpp>
#include <boost/range/join.hpp>
@@ -126,7 +129,11 @@ static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
std::map<qualified_name, schema_mutations>&& views_before,
std::map<qualified_name, schema_mutations>&& views_after);
static void merge_types(distributed<service::storage_proxy>& proxy,
struct user_types_to_drop final {
seastar::noncopyable_function<void()> drop;
};
[[nodiscard]] static user_types_to_drop merge_types(distributed<service::storage_proxy>& proxy,
schema_result&& before,
schema_result&& after);
@@ -832,7 +839,7 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std:
#endif
std::set<sstring> keyspaces_to_drop = merge_keyspaces(proxy, std::move(old_keyspaces), std::move(new_keyspaces)).get0();
merge_types(proxy, std::move(old_types), std::move(new_types));
auto types_to_drop = merge_types(proxy, std::move(old_types), std::move(new_types));
merge_tables_and_views(proxy,
std::move(old_column_families), std::move(new_column_families),
std::move(old_views), std::move(new_views));
@@ -840,6 +847,8 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std:
mergeFunctions(oldFunctions, newFunctions);
mergeAggregates(oldAggregates, newAggregates);
#endif
types_to_drop.drop();
proxy.local().get_db().invoke_on_all([keyspaces_to_drop = std::move(keyspaces_to_drop)] (database& db) {
// it is safe to drop a keyspace only when all nested ColumnFamilies where deleted
return do_for_each(keyspaces_to_drop, [&db] (auto keyspace_to_drop) {
@@ -996,30 +1005,37 @@ static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
}).get();
}
static inline void collect_types(std::set<sstring>& keys, schema_result& result, std::vector<user_type>& to)
struct naked_user_type {
const sstring keyspace;
const sstring qualified_name;
};
static inline void collect_types(std::set<sstring>& keys, schema_result& result, std::vector<naked_user_type>& to)
{
for (auto&& key : keys) {
auto&& value = result[key];
auto types = create_types_from_schema_partition(schema_result_value_type{key, std::move(value)});
std::move(types.begin(), types.end(), std::back_inserter(to));
boost::transform(types, std::back_inserter(to), [] (user_type type) {
return naked_user_type{std::move(type->_keyspace), std::move(type->name())};
});
}
}
// see the comments for merge_keyspaces()
static void merge_types(distributed<service::storage_proxy>& proxy, schema_result&& before, schema_result&& after)
// see the comments for merge_keyspaces()
[[nodiscard]] static user_types_to_drop merge_types(distributed<service::storage_proxy>& proxy, schema_result&& before, schema_result&& after)
{
std::vector<user_type> created, altered, dropped;
std::vector<naked_user_type> created, altered, dropped;
auto diff = difference(before, after, indirect_equal_to<lw_shared_ptr<query::result_set>>());
collect_types(diff.entries_only_on_left, before, dropped); // Keyspaces with no more types
collect_types(diff.entries_only_on_right, after, created); // New keyspaces with types
for (auto&& key : diff.entries_differing) {
for (auto&& keyspace : diff.entries_differing) {
// The user types of this keyspace differ, so diff the current types with the updated ones
auto current_types = proxy.local().get_db().local().find_keyspace(key).metadata()->user_types()->get_all_types();
auto current_types = proxy.local().get_db().local().find_keyspace(keyspace).metadata()->user_types()->get_all_types();
decltype(current_types) updated_types;
auto ts = create_types_from_schema_partition(schema_result_value_type{key, std::move(after[key])});
auto ts = create_types_from_schema_partition(schema_result_value_type{keyspace, std::move(after[keyspace])});
updated_types.reserve(ts.size());
for (auto&& type : ts) {
updated_types[type->_name] = std::move(type);
@@ -1027,36 +1043,46 @@ static void merge_types(distributed<service::storage_proxy>& proxy, schema_resul
auto delta = difference(current_types, updated_types, indirect_equal_to<user_type>());
for (auto&& key : delta.entries_only_on_left) {
dropped.emplace_back(current_types[key]);
for (auto&& type_name : delta.entries_only_on_left) {
dropped.emplace_back(naked_user_type{keyspace, current_types[type_name]->name()});
}
for (auto&& key : delta.entries_only_on_right) {
created.emplace_back(std::move(updated_types[key]));
for (auto&& type_name : delta.entries_only_on_right) {
created.emplace_back(naked_user_type{keyspace, updated_types[type_name]->name()});
}
for (auto&& key : delta.entries_differing) {
altered.emplace_back(std::move(updated_types[key]));
for (auto&& type_name : delta.entries_differing) {
altered.emplace_back(naked_user_type{keyspace, updated_types[type_name]->name()});
}
}
proxy.local().get_db().invoke_on_all([&created, &dropped, &altered] (database& db) {
// Create and update user types before any tables/views are created that potentially
// use those types. Similarly, defer dropping until after tables/views that may use
// some of these user types are dropped.
proxy.local().get_db().invoke_on_all([&created, &altered] (database& db) {
return seastar::async([&] {
for (auto&& type : created) {
auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type->name()));
auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type.qualified_name));
db.find_keyspace(user_type->_keyspace).add_user_type(user_type);
service::get_local_migration_manager().notify_create_user_type(user_type).get();
}
for (auto&& type : dropped) {
auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type->name()));
db.find_keyspace(user_type->_keyspace).remove_user_type(user_type);
service::get_local_migration_manager().notify_drop_user_type(user_type).get();
}
for (auto&& type : altered) {
auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type->name()));
auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type.qualified_name));
db.find_keyspace(user_type->_keyspace).add_user_type(user_type);
service::get_local_migration_manager().notify_update_user_type(user_type).get();
}
});
}).get();
return user_types_to_drop{[&proxy, dropped = std::move(dropped)] {
proxy.local().get_db().invoke_on_all([dropped = std::move(dropped)](database& db) {
return do_for_each(dropped, [&db](auto& user_type_to_drop) {
auto user_type = dynamic_pointer_cast<const user_type_impl>(
parse_type(std::move(user_type_to_drop.qualified_name)));
db.find_keyspace(user_type->_keyspace).remove_user_type(user_type);
return service::get_local_migration_manager().notify_drop_user_type(user_type);
});
}).get();
}};
}
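The ordering constraint above (create/alter user types before tables that may use them, drop them only after such tables are gone) is implemented by returning the drops as a deferred closure. A minimal sketch of that pattern, with a plain string registry standing in for the keyspace's type map (types and names here are illustrative, not Scylla's):

```cpp
#include <algorithm>
#include <functional>
#include <string>
#include <vector>

// Sketch of the "defer the drops" pattern: merge_types() applies
// creations immediately and hands back a closure performing the drops,
// so the caller can invoke it only after dependent objects are removed.
struct types_to_drop {
    std::function<void()> drop;
};

[[nodiscard]] types_to_drop merge_types(std::vector<std::string>& registry,
                                        const std::vector<std::string>& created,
                                        std::vector<std::string> dropped) {
    // Creations take effect right away.
    registry.insert(registry.end(), created.begin(), created.end());
    // Drops are captured and deferred until the caller runs drop().
    return {[&registry, dropped = std::move(dropped)] {
        for (const auto& name : dropped) {
            registry.erase(std::remove(registry.begin(), registry.end(), name),
                           registry.end());
        }
    }};
}
```

The `[[nodiscard]]` attribute mirrors the diff: forgetting to call `drop()` would silently leak the dropped types, so discarding the return value is flagged at compile time.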
#if 0

View File

@@ -1577,10 +1577,7 @@ void make(database& db, bool durable, bool volatile_testing_only) {
kscfg.enable_commitlog = !volatile_testing_only;
kscfg.enable_cache = true;
// don't make system keyspace reads wait for user reads
kscfg.read_concurrency_config.resources_sem = &db.system_keyspace_read_concurrency_sem();
kscfg.read_concurrency_config.active_reads = &db.get_stats().active_reads_system_keyspace;
kscfg.read_concurrency_config.timeout = {};
kscfg.read_concurrency_config.max_queue_length = std::numeric_limits<size_t>::max();
kscfg.read_concurrency_semaphore = &db.system_keyspace_read_concurrency_sem();
// don't make system keyspace writes wait for user writes (if under pressure)
kscfg.dirty_memory_manager = &db._system_dirty_memory_manager;
keyspace _ks{ksm, std::move(kscfg)};

View File

@@ -175,6 +175,31 @@ static bool update_requires_read_before_write(const schema& base,
return false;
}
static bool is_partition_key_empty(
const schema& base,
const schema& view_schema,
const partition_key& base_key,
const clustering_row& update) {
// Empty partition keys are not supported on normal tables - they cannot
// be inserted or queried, so enforce those rules here.
if (view_schema.partition_key_columns().size() > 1) {
// Composite partition keys are different: all components
// are then allowed to be empty.
return false;
}
auto* base_col = base.get_column_definition(view_schema.partition_key_columns().front().name());
switch (base_col->kind) {
case column_kind::partition_key:
return base_key.get_component(base, base_col->position()).empty();
case column_kind::clustering_key:
return update.key().get_component(base, base_col->position()).empty();
default:
// No multi-cell columns in the view's partition key
auto& c = update.cells().cell_at(base_col->id);
return c.as_atomic_cell().value().empty();
}
}
bool matches_view_filter(const schema& base, const view_info& view, const partition_key& key, const clustering_row& update, gc_clock::time_point now) {
return clustering_prefix_matches(base, view, key, update.key())
&& boost::algorithm::all_of(
@@ -330,7 +355,7 @@ static void add_cells_to_view(const schema& base, const schema& view, const row&
* This method checks that the base row does match the view filter before applying anything.
*/
void view_updates::create_entry(const partition_key& base_key, const clustering_row& update, gc_clock::time_point now) {
if (!matches_view_filter(*_base, _view_info, base_key, update, now)) {
if (is_partition_key_empty(*_base, *_view, base_key, update) || !matches_view_filter(*_base, _view_info, base_key, update, now)) {
return;
}
deletable_row& r = get_view_row(base_key, update);
@@ -346,7 +371,7 @@ void view_updates::create_entry(const partition_key& base_key, const clustering_
void view_updates::delete_old_entry(const partition_key& base_key, const clustering_row& existing, const row_tombstone& t, gc_clock::time_point now) {
// Before deleting an old entry, make sure it was matching the view filter
// (otherwise there is nothing to delete)
if (matches_view_filter(*_base, _view_info, base_key, existing, now)) {
if (!is_partition_key_empty(*_base, *_view, base_key, existing) && matches_view_filter(*_base, _view_info, base_key, existing, now)) {
do_delete_old_entry(base_key, existing, t, now);
}
}
@@ -391,11 +416,11 @@ void view_updates::do_delete_old_entry(const partition_key& base_key, const clus
void view_updates::update_entry(const partition_key& base_key, const clustering_row& update, const clustering_row& existing, gc_clock::time_point now) {
// While we know update and existing correspond to the same view entry,
// they may not match the view filter.
if (!matches_view_filter(*_base, _view_info, base_key, existing, now)) {
if (is_partition_key_empty(*_base, *_view, base_key, existing) || !matches_view_filter(*_base, _view_info, base_key, existing, now)) {
create_entry(base_key, update, now);
return;
}
if (!matches_view_filter(*_base, _view_info, base_key, update, now)) {
if (is_partition_key_empty(*_base, *_view, base_key, update) || !matches_view_filter(*_base, _view_info, base_key, update, now)) {
do_delete_old_entry(base_key, existing, row_tombstone(), now);
return;
}
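The rule enforced by `is_partition_key_empty` above can be distilled to a small predicate: a single-component partition key must not be empty (normal tables can neither insert nor query such keys), while composite keys are exempt because any of their components may be empty. A hedged sketch with key components as plain strings rather than Scylla's serialized key types:

```cpp
#include <string>
#include <vector>

// Illustrative version of the view-update guard: reject a view row
// whose partition key would be a single empty component; allow empty
// components inside composite (multi-component) keys.
bool is_partition_key_empty(const std::vector<std::string>& key_components) {
    if (key_components.size() > 1) {
        // Composite partition keys may contain empty components.
        return false;
    }
    return key_components.front().empty();
}
```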

View File

@@ -300,6 +300,7 @@ future<> range_streamer::do_stream_async() {
unsigned sp_index = 0;
unsigned nr_ranges_streamed = 0;
size_t nr_ranges_total = range_vec.size();
size_t nr_ranges_per_stream_plan = nr_ranges_total / 10;
dht::token_range_vector ranges_to_stream;
auto do_streaming = [&] {
auto sp = stream_plan(sprint("%s-%s-index-%d", description, keyspace, sp_index++));
@@ -318,7 +319,7 @@ future<> range_streamer::do_stream_async() {
ranges_to_stream.push_back(*it);
it = range_vec.erase(it);
nr_ranges_streamed++;
if (ranges_to_stream.size() < _nr_ranges_per_stream_plan) {
if (ranges_to_stream.size() < nr_ranges_per_stream_plan) {
continue;
} else {
do_streaming();
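The change above replaces the fixed batch of 10 ranges per stream plan with a batch of roughly a tenth of the total, keeping the number of plans bounded as the range count grows. A sketch of the resulting batching arithmetic (a simplification: only the plan count is modeled, not the streaming itself):

```cpp
#include <cstddef>

// Returns how many stream plans a given range count produces when each
// plan takes ~total/10 ranges and a final partial batch is flushed.
std::size_t count_stream_plans(std::size_t nr_ranges_total) {
    std::size_t nr_ranges_per_stream_plan = nr_ranges_total / 10;
    std::size_t plans = 0;
    std::size_t pending = 0;
    for (std::size_t i = 0; i < nr_ranges_total; ++i) {
        ++pending;
        if (pending < nr_ranges_per_stream_plan) {
            continue; // keep accumulating ranges into the current plan
        }
        ++plans;      // batch full: start a stream plan
        pending = 0;
    }
    if (pending > 0) {
        ++plans;      // flush the final partial batch
    }
    return plans;
}
```

Note the small-count edge case: below 10 ranges the divisor is zero, so every range flushes immediately, matching the structure of the loop in the diff.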

View File

@@ -174,8 +174,6 @@ private:
std::unordered_set<std::unique_ptr<i_source_filter>> _source_filters;
stream_plan _stream_plan;
std::unordered_map<sstring, std::vector<sstring>> _column_families;
// Number of ranges to stream per stream plan
unsigned _nr_ranges_per_stream_plan = 10;
// Retry the stream plan _nr_max_retry times
unsigned _nr_retried = 0;
unsigned _nr_max_retry = 5;

View File

@@ -43,7 +43,7 @@ done
. /etc/os-release
case "$ID" in
"centos")
AMI=ami-46bf8a51
AMI=ami-ae7bfdb8
REGION=us-east-1
SSH_USERNAME=centos
;;

View File

@@ -1 +0,0 @@
options raid0 devices_discard_performance=Y

View File

@@ -112,10 +112,15 @@ run_setup_script() {
name=$1
shift 1
$* &&:
if [ $? -ne 0 ] && [ $INTERACTIVE -eq 1 ]; then
printf "${RED}$name setup failed. press any key to continue...${NO_COLOR}\n"
read
return 1
if [ $? -ne 0 ]; then
if [ $INTERACTIVE -eq 1 ]; then
printf "${RED}$name setup failed. press any key to continue...${NO_COLOR}\n"
read
return 1
else
printf "$name setup failed.\n"
exit 1
fi
fi
return 0
}

View File

@@ -2,10 +2,11 @@
. /etc/os-release
print_usage() {
echo "build_deb.sh -target <codename> --dist --rebuild-dep"
echo "build_deb.sh -target <codename> --dist --rebuild-dep --jobs 2"
echo " --target target distribution codename"
echo " --dist create a public distribution package"
echo " --no-clean don't rebuild pbuilder tgz"
echo " --jobs specify number of jobs"
exit 1
}
install_deps() {
@@ -19,6 +20,7 @@ install_deps() {
DIST=0
TARGET=
NO_CLEAN=0
JOBS=0
while [ $# -gt 0 ]; do
case "$1" in
"--dist")
@@ -33,6 +35,10 @@ while [ $# -gt 0 ]; do
NO_CLEAN=1
shift 1
;;
"--jobs")
JOBS=$2
shift 2
;;
*)
print_usage
;;
@@ -127,6 +133,8 @@ if [ "$TARGET" = "jessie" ]; then
cp dist/debian/scylla-server.cron.d debian/
sed -i -e "s/@@REVISION@@/1~$TARGET/g" debian/changelog
sed -i -e "s/@@DH_INSTALLINIT@@//g" debian/rules
sed -i -e "s/@@INSTALL_HK_DAILY_INIT@@/dh_installinit --no-start --name scylla-housekeeping-daily/g" debian/rules
sed -i -e "s/@@INSTALL_HK_RESTART_INIT@@/dh_installinit --no-start --name scylla-housekeeping-restart/g" debian/rules
sed -i -e "s#@@COMPILER@@#/opt/scylladb/bin/g++-7#g" debian/rules
sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, scylla-gcc72-g++-7, libunwind-dev, scylla-antlr35, scylla-libthrift010-dev, scylla-antlr35-c++-dev, scylla-libboost-program-options163-dev, scylla-libboost-filesystem163-dev, scylla-libboost-system163-dev, scylla-libboost-thread163-dev, scylla-libboost-test163-dev/g" debian/control
sed -i -e "s/@@DEPENDS@@//g" debian/control
@@ -141,6 +149,8 @@ elif [ "$TARGET" = "stretch" ]; then
cp dist/debian/scylla-server.cron.d debian/
sed -i -e "s/@@REVISION@@/1~$TARGET/g" debian/changelog
sed -i -e "s/@@DH_INSTALLINIT@@//g" debian/rules
sed -i -e "s/@@INSTALL_HK_DAILY_INIT@@/dh_installinit --no-start --name scylla-housekeeping-daily/g" debian/rules
sed -i -e "s/@@INSTALL_HK_RESTART_INIT@@/dh_installinit --no-start --name scylla-housekeeping-restart/g" debian/rules
sed -i -e "s#@@COMPILER@@#/opt/scylladb/bin/g++-7#g" debian/rules
sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, scylla-gcc72-g++-7, libunwind-dev, antlr3, scylla-libthrift010-dev, scylla-antlr35-c++-dev, libboost-program-options1.62-dev, libboost-filesystem1.62-dev, libboost-system1.62-dev, libboost-thread1.62-dev, libboost-test1.62-dev/g" debian/control
sed -i -e "s/@@DEPENDS@@//g" debian/control
@@ -155,6 +165,8 @@ elif [ "$TARGET" = "trusty" ]; then
cp dist/debian/scylla-server.cron.d debian/
sed -i -e "s/@@REVISION@@/0ubuntu1~$TARGET/g" debian/changelog
sed -i -e "s/@@DH_INSTALLINIT@@/--upstart-only/g" debian/rules
sed -i -e "s/@@INSTALL_HK_DAILY_INIT@@/dh_installinit --no-start --name scylla-housekeeping --upstart-only/g" debian/rules
sed -i -e "s/@@INSTALL_HK_RESTART_INIT@@//g" debian/rules
sed -i -e "s#@@COMPILER@@#/opt/scylladb/bin/g++-7#g" debian/rules
sed -i -e "s/@@BUILD_DEPENDS@@/scylla-gcc72-g++-7, libunwind8-dev, scylla-antlr35, scylla-libthrift010-dev, scylla-antlr35-c++-dev, scylla-libboost-program-options163-dev, scylla-libboost-filesystem163-dev, scylla-libboost-system163-dev, scylla-libboost-thread163-dev, scylla-libboost-test163-dev/g" debian/control
sed -i -e "s/@@DEPENDS@@/hugepages, num-utils/g" debian/control
@@ -168,6 +180,8 @@ elif [ "$TARGET" = "trusty" ]; then
elif [ "$TARGET" = "xenial" ]; then
sed -i -e "s/@@REVISION@@/0ubuntu1~$TARGET/g" debian/changelog
sed -i -e "s/@@DH_INSTALLINIT@@//g" debian/rules
sed -i -e "s/@@INSTALL_HK_DAILY_INIT@@/dh_installinit --no-start --name scylla-housekeeping-daily/g" debian/rules
sed -i -e "s/@@INSTALL_HK_RESTART_INIT@@/dh_installinit --no-start --name scylla-housekeeping-restart/g" debian/rules
sed -i -e "s#@@COMPILER@@#/opt/scylladb/bin/g++-7#g" debian/rules
sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, scylla-gcc72-g++-7, libunwind-dev, antlr3, scylla-libthrift010-dev, scylla-antlr35-c++-dev, scylla-libboost-program-options163-dev, scylla-libboost-filesystem163-dev, scylla-libboost-system163-dev, scylla-libboost-thread163-dev, scylla-libboost-test163-dev/g" debian/control
sed -i -e "s/@@DEPENDS@@/hugepages, /g" debian/control
@@ -181,6 +195,8 @@ elif [ "$TARGET" = "xenial" ]; then
elif [ "$TARGET" = "bionic" ]; then
sed -i -e "s/@@REVISION@@/0ubuntu1~$TARGET/g" debian/changelog
sed -i -e "s/@@DH_INSTALLINIT@@//g" debian/rules
sed -i -e "s/@@INSTALL_HK_DAILY_INIT@@/dh_installinit --no-start --name scylla-housekeeping-daily/g" debian/rules
sed -i -e "s/@@INSTALL_HK_RESTART_INIT@@/dh_installinit --no-start --name scylla-housekeeping-restart/g" debian/rules
sed -i -e "s#@@COMPILER@@#g++-7#g" debian/rules
sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++, libunwind-dev, antlr3, scylla-libthrift010-dev, scylla-antlr35-c++-dev, libboost-program-options-dev, libboost-filesystem-dev, libboost-system-dev, libboost-thread-dev, libboost-test-dev/g" debian/control
sed -i -e "s/@@DEPENDS@@/hugepages, /g" debian/control
@@ -194,6 +210,8 @@ elif [ "$TARGET" = "bionic" ]; then
elif [ "$TARGET" = "yakkety" ] || [ "$TARGET" = "zesty" ] || [ "$TARGET" = "artful" ]; then
sed -i -e "s/@@REVISION@@/0ubuntu1~$TARGET/g" debian/changelog
sed -i -e "s/@@DH_INSTALLINIT@@//g" debian/rules
sed -i -e "s/@@INSTALL_HK_DAILY_INIT@@/dh_installinit --no-start --name scylla-housekeeping-daily/g" debian/rules
sed -i -e "s/@@INSTALL_HK_RESTART_INIT@@/dh_installinit --no-start --name scylla-housekeeping-restart/g" debian/rules
sed -i -e "s/@@COMPILER@@/g++-7/g" debian/rules
sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++-7, libunwind-dev, antlr3, scylla-libthrift010-dev, scylla-antlr35-c++-dev, libboost-program-options-dev, libboost-filesystem-dev, libboost-system-dev, libboost-thread-dev, libboost-test-dev/g" debian/control
sed -i -e "s/@@DEPENDS@@/hugepages, /g" debian/control
@@ -221,16 +239,19 @@ sed -i -e "s#@@REPOFILES@@#'/etc/apt/sources.list.d/scylla*.list'#g" debian/scyl
cp dist/common/systemd/scylla-fstrim.service debian/scylla-server.scylla-fstrim.service
cp dist/common/systemd/node-exporter.service debian/scylla-server.node-exporter.service
cp ./dist/debian/pbuilderrc ~/.pbuilderrc
sudo cp ./dist/debian/pbuilderrc ~root/.pbuilderrc
if [ $NO_CLEAN -eq 0 ]; then
sudo rm -fv /var/cache/pbuilder/scylla-server-$TARGET.tgz
sudo -E DIST=$TARGET /usr/sbin/pbuilder clean
sudo -E DIST=$TARGET /usr/sbin/pbuilder create --allow-untrusted
sudo -H DIST=$TARGET /usr/sbin/pbuilder clean
sudo -H DIST=$TARGET /usr/sbin/pbuilder create --allow-untrusted
fi
sudo -E DIST=$TARGET /usr/sbin/pbuilder update --allow-untrusted
if [ $JOBS -ne 0 ]; then
DEB_BUILD_OPTIONS="parallel=$JOBS"
fi
sudo -H DIST=$TARGET /usr/sbin/pbuilder update --allow-untrusted
if [ "$TARGET" = "trusty" ] || [ "$TARGET" = "xenial" ] || [ "$TARGET" = "yakkety" ] || [ "$TARGET" = "zesty" ] || [ "$TARGET" = "artful" ] || [ "$TARGET" = "bionic" ]; then
sudo -E DIST=$TARGET /usr/sbin/pbuilder execute --save-after-exec dist/debian/ubuntu_enable_ppa.sh
sudo -H DIST=$TARGET /usr/sbin/pbuilder execute --save-after-exec dist/debian/ubuntu_enable_ppa.sh
elif [ "$TARGET" = "jessie" ] || [ "$TARGET" = "stretch" ]; then
sudo -E DIST=$TARGET /usr/sbin/pbuilder execute --save-after-exec dist/debian/debian_install_gpgkey.sh
sudo -H DIST=$TARGET /usr/sbin/pbuilder execute --save-after-exec dist/debian/debian_install_gpgkey.sh
fi
sudo -E DIST=$TARGET pdebuild --buildresult build/debs
sudo -H DIST=$TARGET DEB_BUILD_OPTIONS=$DEB_BUILD_OPTIONS pdebuild --buildresult build/debs

View File

@@ -1,12 +1,13 @@
#!/usr/bin/make -f
export PYBUILD_DISABLE=1
jobs := $(shell echo $$DEB_BUILD_OPTIONS | sed -r "s/.*parallel=([0-9]+).*/-j\1/")
override_dh_auto_configure:
./configure.py --enable-dpdk --mode=release --static-thrift --static-boost --compiler=@@COMPILER@@ --cflags="-I/opt/scylladb/include -L/opt/scylladb/lib/x86-linux-gnu/" --ldflags="-Wl,-rpath=/opt/scylladb/lib"
override_dh_auto_build:
PATH="/opt/scylladb/bin:$$PATH" ninja
PATH="/opt/scylladb/bin:$$PATH" ninja $(jobs)
override_dh_auto_clean:
rm -rf build/release seastar/build
@@ -15,8 +16,8 @@ override_dh_auto_clean:
override_dh_installinit:
dh_installinit --no-start @@DH_INSTALLINIT@@
dh_installinit --no-start --name scylla-housekeeping-daily @@DH_INSTALLINIT@@
dh_installinit --no-start --name scylla-housekeeping-restart @@DH_INSTALLINIT@@
@@INSTALL_HK_DAILY_INIT@@
@@INSTALL_HK_RESTART_INIT@@
dh_installinit --no-start --name scylla-fstrim @@DH_INSTALLINIT@@
dh_installinit --no-start --name node-exporter @@DH_INSTALLINIT@@

View File

@@ -14,8 +14,10 @@ ADD etc/sysconfig/scylla-server /etc/sysconfig/scylla-server
# Supervisord configuration:
ADD etc/supervisord.conf /etc/supervisord.conf
ADD etc/supervisord.conf.d/scylla-server.conf /etc/supervisord.conf.d/scylla-server.conf
ADD etc/supervisord.conf.d/scylla-housekeeping.conf /etc/supervisord.conf.d/scylla-housekeeping.conf
ADD etc/supervisord.conf.d/scylla-jmx.conf /etc/supervisord.conf.d/scylla-jmx.conf
ADD scylla-service.sh /scylla-service.sh
ADD scylla-housekeeping-service.sh /scylla-housekeeping-service.sh
ADD scylla-jmx-service.sh /scylla-jmx-service.sh
# Docker image startup scripts:

View File

@@ -14,4 +14,5 @@ def parse():
parser.add_argument('--broadcast-address', default=None, dest='broadcastAddress')
parser.add_argument('--broadcast-rpc-address', default=None, dest='broadcastRpcAddress')
parser.add_argument('--api-address', default=None, dest='apiAddress')
parser.add_argument('--disable-version-check', default=False, action='store_true', dest='disable_housekeeping', help="Disable version check")
return parser.parse_args()

View File

@@ -15,6 +15,7 @@ try:
setup.io()
setup.cqlshrc()
setup.arguments()
setup.set_housekeeping()
os.system("/usr/bin/supervisord -c /etc/supervisord.conf")
except:
logging.exception('failed!')

View File

@@ -0,0 +1,6 @@
[program:scylla-housekeeping]
command=/scylla-housekeeping-service.sh
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0

View File

@@ -0,0 +1,8 @@
#!/bin/bash
sleep 5
/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid --repo-files '/etc/yum.repos.d/scylla*.repo' -q version --mode cr || true
while true; do
sleep 1d
/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid --repo-files '/etc/yum.repos.d/scylla*.repo' -q version --mode cd || true
done

View File

@@ -15,6 +15,7 @@ class ScyllaSetup:
self._smp = arguments.smp
self._memory = arguments.memory
self._overprovisioned = arguments.overprovisioned
self._housekeeping = not arguments.disable_housekeeping
self._experimental = arguments.experimental
def _run(self, *args, **kwargs):
@@ -38,6 +39,14 @@ class ScyllaSetup:
with open("%s/.cqlshrc" % home, "w") as cqlshrc:
cqlshrc.write("[connection]\nhostname = %s\n" % hostname)
def set_housekeeping(self):
with open("/etc/scylla.d/housekeeping.cfg", "w") as f:
f.write("[housekeeping]\ncheck-version: ")
if self._housekeeping:
f.write("True\n")
else:
f.write("False\n")
def arguments(self):
args = []
if self._memory is not None:

View File

@@ -102,11 +102,11 @@ fi
if [ $JOBS -gt 0 ]; then
SRPM_OPTS="$SRPM_OPTS --define='_smp_mflags -j$JOBS'"
RPM_JOBS_OPTS=(--define="_smp_mflags -j$JOBS")
fi
sudo mock --buildsrpm --root=$TARGET --resultdir=`pwd`/build/srpms --spec=build/scylla.spec --sources=build/scylla-$VERSION.tar $SRPM_OPTS
sudo mock --buildsrpm --root=$TARGET --resultdir=`pwd`/build/srpms --spec=build/scylla.spec --sources=build/scylla-$VERSION.tar $SRPM_OPTS "${RPM_JOBS_OPTS[@]}"
if [ "$TARGET" = "epel-7-x86_64" ]; then
TARGET=scylla-$TARGET
RPM_OPTS="$RPM_OPTS --configdir=dist/redhat/mock"
fi
sudo mock --rebuild --root=$TARGET --resultdir=`pwd`/build/rpms $RPM_OPTS build/srpms/scylla-$VERSION*.src.rpm
sudo mock --rebuild --root=$TARGET --resultdir=`pwd`/build/rpms $RPM_OPTS "${RPM_JOBS_OPTS[@]}" build/srpms/scylla-$VERSION*.src.rpm

View File

@@ -92,9 +92,6 @@ mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/security/limits.d/
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/collectd.d/
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/scylla.d/
%if 0%{?rhel}
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/
%endif
mkdir -p $RPM_BUILD_ROOT%{_sysctldir}/
mkdir -p $RPM_BUILD_ROOT%{_docdir}/scylla/
mkdir -p $RPM_BUILD_ROOT%{_unitdir}
@@ -105,9 +102,6 @@ install -m644 dist/common/limits.d/scylla.conf $RPM_BUILD_ROOT%{_sysconfdir}/sec
install -m644 dist/common/collectd.d/scylla.conf $RPM_BUILD_ROOT%{_sysconfdir}/collectd.d/
install -m644 dist/common/scylla.d/*.conf $RPM_BUILD_ROOT%{_sysconfdir}/scylla.d/
install -m644 dist/common/sysctl.d/*.conf $RPM_BUILD_ROOT%{_sysctldir}/
%if 0%{?rhel}
install -m644 dist/common/modprobe.d/*.conf $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/
%endif
install -d -m755 $RPM_BUILD_ROOT%{_sysconfdir}/scylla
install -m644 conf/scylla.yaml $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
install -m644 conf/cassandra-rackdc.properties $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
@@ -300,18 +294,9 @@ if Scylla is the main application on your server and you wish to optimize its la
# We cannot use the sysctl_apply rpm macro because it is not present in 7.0
# following is a "manual" expansion
/usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
# Write modprobe.d params when module already loaded
%if 0%{?rhel}
if [ -e /sys/module/raid0/parameters/devices_discard_performance ]; then
echo Y > /sys/module/raid0/parameters/devices_discard_performance
fi
%endif
%files kernel-conf
%defattr(-,root,root)
%if 0%{?rhel}
%config(noreplace) %{_sysconfdir}/modprobe.d/*.conf
%endif
%{_sysctldir}/*.conf
%changelog

View File

@@ -208,6 +208,12 @@ $ docker run --name some-scylla -d scylladb/scylla --experimental 1
**Since: 2.0**
### `--disable-version-check`
The `--disable-version-check` flag disables the version validation check.
**Since: 2.2**
# User Feedback
## Issues

View File

@@ -461,7 +461,8 @@ future<> gossiper::apply_state_locally(std::map<inet_address, endpoint_state> ma
int local_generation = local_ep_state_ptr.get_heart_beat_state().get_generation();
int remote_generation = remote_state.get_heart_beat_state().get_generation();
logger.trace("{} local generation {}, remote generation {}", ep, local_generation, remote_generation);
if (local_generation != 0 && remote_generation > local_generation + MAX_GENERATION_DIFFERENCE) {
// A node that was removed with nodetool removenode can have a generation of 2
if (local_generation > 2 && remote_generation > local_generation + MAX_GENERATION_DIFFERENCE) {
// assume some peer has corrupted memory and is broadcasting an unbelievable generation about another peer (or itself)
logger.warn("received an invalid gossip generation for peer {}; local generation = {}, received generation = {}",
ep, local_generation, remote_generation);
@@ -832,6 +833,7 @@ int gossiper::get_max_endpoint_state_version(endpoint_state state) {
// Runs inside seastar::async context
void gossiper::evict_from_membership(inet_address endpoint) {
auto permit = lock_endpoint(endpoint).get0();
_unreachable_endpoints.erase(endpoint);
container().invoke_on_all([endpoint] (auto& g) {
g.endpoint_state_map.erase(endpoint);
@@ -982,7 +984,7 @@ future<> gossiper::assassinate_endpoint(sstring address) {
logger.warn("Assassinating {} via gossip", endpoint);
if (es) {
auto& ss = service::get_local_storage_service();
auto tokens = ss.get_token_metadata().get_tokens(endpoint);
tokens = ss.get_token_metadata().get_tokens(endpoint);
if (tokens.empty()) {
logger.warn("Unable to calculate tokens for {}. Will use a random one", address);
throw std::runtime_error(sprint("Unable to calculate tokens for %s", endpoint));
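The relaxed generation check above can be isolated as a predicate: a remote generation is rejected as corrupt only when the local generation is itself established, and since `nodetool removenode` can legitimately leave a node at generation 2, the threshold moves from `!= 0` to `> 2`. A sketch, where the value of `MAX_GENERATION_DIFFERENCE` is an assumption (one year of seconds, as in Cassandra's gossiper) rather than taken from this diff:

```cpp
// Assumed constant: one year's worth of seconds, mirroring the
// gossip-generation sanity bound; the exact value is illustrative.
constexpr int MAX_GENERATION_DIFFERENCE = 86400 * 365;

// True when the remote generation is implausibly far ahead of an
// established local generation (> 2, to tolerate removed nodes).
bool is_implausible_generation(int local_generation, int remote_generation) {
    return local_generation > 2 &&
           remote_generation > local_generation + MAX_GENERATION_DIFFERENCE;
}
```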

View File

@@ -105,6 +105,7 @@ private:
public:
intrusive_set_external_comparator() { algo::init_header(_header.this_ptr()); }
intrusive_set_external_comparator(intrusive_set_external_comparator&& o) {
algo::init_header(_header.this_ptr());
algo::swap_tree(_header.this_ptr(), node_ptr(o._header.this_ptr()));
}
iterator begin() { return iterator(algo::begin_node(_header.this_ptr()), priv_value_traits_ptr()); }
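The one-line fix above addresses a classic move-constructor bug: transferring contents via swap requires `*this` to be in a valid empty state first, otherwise `swap_tree` reads an uninitialized header. A minimal sketch of the init-then-swap pattern, with a raw pointer standing in for the intrusive tree header:

```cpp
#include <utility>

// The member's default initializer plays the role of init_header():
// without it, swapping in the move constructor would read
// indeterminate state, which is undefined behavior.
struct movable_container {
    int* data = nullptr; // "header": must be initialized before swapping
    movable_container() = default;
    explicit movable_container(int v) : data(new int(v)) {}
    movable_container(movable_container&& o) noexcept {
        // data is already value-initialized to nullptr here, so the
        // swap leaves the source empty and the destination owning.
        std::swap(data, o.data);
    }
    movable_container(const movable_container&) = delete;
    ~movable_container() { delete data; }
};
```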

View File

@@ -733,6 +733,10 @@ public:
static const compound& get_compound_type(const schema& s) {
return s.clustering_key_prefix_type();
}
static clustering_key_prefix_view make_empty() {
return { bytes_view() };
}
};
class clustering_key_prefix : public prefix_compound_wrapper<clustering_key_prefix, clustering_key_prefix_view, clustering_key> {

View File

@@ -100,7 +100,6 @@ future<> ec2_multi_region_snitch::gossiper_starting() {
// Note: currently gossiper "main" instance always runs on CPU0 therefore
// this function will be executed on CPU0 only.
//
ec2_snitch::gossiper_starting();
using namespace gms;
auto& g = get_local_gossiper();

View File

@@ -31,6 +31,7 @@ class md5_hasher {
CryptoPP::Weak::MD5 hash{};
public:
void update(const char* ptr, size_t length) {
using namespace CryptoPP;
static_assert(sizeof(char) == sizeof(byte), "Assuming lengths will be the same");
hash.Update(reinterpret_cast<const byte*>(ptr), length * sizeof(byte));
}

View File

@@ -1849,9 +1849,10 @@ void mutation_querier::query_static_row(const row& r, tombstone current_tombston
} else if (_short_reads_allowed) {
seastar::measuring_output_stream stream;
ser::qr_partition__static_row__cells<seastar::measuring_output_stream> out(stream, { });
auto start = stream.size();
get_compacted_row_slice(_schema, slice, column_kind::static_column,
r, slice.static_columns, _static_cells_wr);
_memory_accounter.update(stream.size());
r, slice.static_columns, out);
_memory_accounter.update(stream.size() - start);
}
if (_pw.requested_digest()) {
::feed_hash(_pw.digest(), current_tombstone);
@@ -1909,8 +1910,9 @@ stop_iteration mutation_querier::consume(clustering_row&& cr, row_tombstone curr
} else if (_short_reads_allowed) {
seastar::measuring_output_stream stream;
ser::qr_partition__rows<seastar::measuring_output_stream> out(stream, { });
auto start = stream.size();
write_row(out);
stop = _memory_accounter.update_and_check(stream.size());
stop = _memory_accounter.update_and_check(stream.size() - start);
}
_live_clustering_rows++;
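Both hunks above apply the same accounting fix: the serializer's constructor may have already written placeholder bytes into the measuring stream, so an element's size must be computed as the stream delta around its serialization, not the absolute stream size. A simplified sketch of that delta pattern (the stream type here is a stand-in, not Seastar's `measuring_output_stream`):

```cpp
#include <cstddef>
#include <cstring>

// Stand-in for a measuring output stream: counts bytes, writes nothing.
struct measuring_stream {
    std::size_t _size = 0;
    void write(const char*, std::size_t n) { _size += n; }
    std::size_t size() const { return _size; }
};

// Measure one element's serialized size as a delta, so placeholder
// bytes already in the stream are not charged to this element.
std::size_t measure_element(measuring_stream& s, const char* elem) {
    auto start = s.size();            // snapshot before serialization
    s.write(elem, std::strlen(elem)); // serialize the element
    return s.size() - start;          // delta = this element only
}
```

Getting this wrong matters beyond memory accounting: as the commits note, a size mismatch between the digest-only and data paths can manifest as a digest mismatch between replicas.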

View File

@@ -27,7 +27,6 @@
#include "core/future-util.hh"
#include "utils/move.hh"
#include "stdx.hh"
#include "reader_resource_tracker.hh"
#include "flat_mutation_reader.hh"
@@ -715,23 +714,55 @@ mutation_reader make_empty_reader() {
return make_mutation_reader<empty_reader>();
}
const reader_concurrency_semaphore::timeout_clock::duration
reader_concurrency_semaphore::no_timeout{reader_concurrency_semaphore::timeout_clock::duration::max()};
void reader_concurrency_semaphore::signal(const resources& r) {
_resources += r;
while (!_wait_list.empty() && has_available_units(_wait_list.front().res)) {
auto& x = _wait_list.front();
_resources -= x.res;
x.pr.set_value(make_lw_shared<reader_permit>(*this, x.res));
_wait_list.pop_front();
}
}
future<lw_shared_ptr<reader_concurrency_semaphore::reader_permit>> reader_concurrency_semaphore::wait_admission(size_t memory) {
if (_wait_list.size() >= _max_queue_length) {
return make_exception_future<lw_shared_ptr<reader_permit>>(_make_queue_overloaded_exception());
}
auto r = resources(1, static_cast<ssize_t>(memory));
if (may_proceed(r)) {
_resources -= r;
return make_ready_future<lw_shared_ptr<reader_permit>>(make_lw_shared<reader_permit>(*this, r));
}
promise<lw_shared_ptr<reader_permit>> pr;
auto fut = pr.get_future();
if (_timeout == no_timeout) {
_wait_list.push_back(entry(std::move(pr), r));
} else {
_wait_list.push_back(entry(std::move(pr), r), timeout_clock::now() + _timeout);
}
return fut;
}
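The semaphore above admits a reader only when both a reader-count unit and the requested memory are available, consuming them together and returning them together through `signal()`. A synchronous sketch of that dual-resource admission logic (no futures or wait list, which the real code adds on top):

```cpp
#include <cstddef>

// Two resources tracked as one unit, as in the diff's resources struct.
struct resources {
    int count = 0;
    long memory = 0;
};

struct concurrency_semaphore {
    resources _avail;
    concurrency_semaphore(int count, long memory) : _avail{count, memory} {}

    // Admit a reader if one count unit and `memory` bytes are free.
    bool try_admit(long memory) {
        if (_avail.count < 1 || _avail.memory < memory) {
            return false; // the real code queues the waiter instead
        }
        _avail.count -= 1;
        _avail.memory -= memory;
        return true;
    }

    // Return both resources; the real code then wakes eligible waiters.
    void signal(long memory) {
        _avail.count += 1;
        _avail.memory += memory;
    }
};
```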
// A file that tracks the memory usage of buffers resulting from read
// operations.
class tracking_file_impl : public file_impl {
file _tracked_file;
semaphore* _semaphore;
lw_shared_ptr<reader_concurrency_semaphore::reader_permit> _permit;
// Shouldn't be called if semaphore is NULL.
temporary_buffer<uint8_t> make_tracked_buf(temporary_buffer<uint8_t> buf) {
return seastar::temporary_buffer<uint8_t>(buf.get_write(),
buf.size(),
make_deleter(buf.release(), std::bind(&semaphore::signal, _semaphore, buf.size())));
make_deleter(buf.release(), std::bind(&reader_concurrency_semaphore::reader_permit::signal_memory, _permit, buf.size())));
}
public:
tracking_file_impl(file file, reader_resource_tracker resource_tracker)
: _tracked_file(std::move(file))
, _semaphore(resource_tracker.get_semaphore()) {
, _permit(resource_tracker.get_permit()) {
}
tracking_file_impl(const tracking_file_impl&) = delete;
@@ -793,9 +824,9 @@ public:
virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc) override {
return get_file_impl(_tracked_file)->dma_read_bulk(offset, range_size, pc).then([this] (temporary_buffer<uint8_t> buf) {
if (_semaphore) {
if (_permit) {
buf = make_tracked_buf(std::move(buf));
_semaphore->consume(buf.size());
_permit->consume_memory(buf.size());
}
return make_ready_future<temporary_buffer<uint8_t>>(std::move(buf));
});
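The tracked-buffer change above ties memory accounting to buffer lifetime: bytes are charged to the permit when a buffer is created and released by the buffer's deleter when it goes away, so no manual release call can be forgotten. A sketch of that RAII accounting, with `std::function` standing in for Seastar's deleter machinery:

```cpp
#include <cstddef>
#include <functional>
#include <utility>

// A buffer that returns its memory charge on destruction, mirroring
// how the tracked temporary_buffer's deleter signals the permit.
struct tracked_buffer {
    std::size_t size;
    std::function<void(std::size_t)> on_release;
    tracked_buffer(std::size_t n, std::function<void(std::size_t)> release)
        : size(n), on_release(std::move(release)) {}
    tracked_buffer(const tracked_buffer&) = delete;
    ~tracked_buffer() { on_release(size); } // release exactly what was charged
};
```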
@@ -819,33 +850,23 @@ class restricting_mutation_reader : public flat_mutation_reader::impl {
streamed_mutation::forwarding _fwd;
mutation_reader::forwarding _fwd_mr;
flat_mutation_reader operator()() {
return _ms.make_flat_mutation_reader(std::move(_s), _range.get(), _slice.get(), _pc.get(), std::move(_trace_state), _fwd, _fwd_mr);
flat_mutation_reader operator()(reader_resource_tracker tracker) {
return _ms.make_flat_mutation_reader(std::move(_s), _range.get(), _slice.get(), _pc.get(), std::move(_trace_state), _fwd, _fwd_mr, tracker);
}
};
const restricted_mutation_reader_config& _config;
boost::variant<mutation_source_and_params, flat_mutation_reader> _reader_or_mutation_source;
struct pending_state {
reader_concurrency_semaphore* semaphore;
mutation_source_and_params reader_factory;
};
struct admitted_state {
lw_shared_ptr<reader_concurrency_semaphore::reader_permit> permit;
flat_mutation_reader reader;
};
boost::variant<pending_state, admitted_state> _state;
static const std::size_t new_reader_base_cost{16 * 1024};
future<> create_reader() {
auto f = _config.timeout.count() != 0
? _config.resources_sem->wait(_config.timeout, new_reader_base_cost)
: _config.resources_sem->wait(new_reader_base_cost);
return f.then([this] {
flat_mutation_reader reader = boost::get<mutation_source_and_params>(_reader_or_mutation_source)();
_reader_or_mutation_source = std::move(reader);
if (_config.active_reads) {
++(*_config.active_reads);
}
return make_ready_future<>();
});
}
template<typename Function>
GCC6_CONCEPT(
requires std::is_move_constructible<Function>::value
@@ -854,15 +875,19 @@ class restricting_mutation_reader : public flat_mutation_reader::impl {
}
)
decltype(auto) with_reader(Function fn) {
if (auto* reader = boost::get<flat_mutation_reader>(&_reader_or_mutation_source)) {
return fn(*reader);
if (auto* state = boost::get<admitted_state>(&_state)) {
return fn(state->reader);
}
return create_reader().then([this, fn = std::move(fn)] () mutable {
return fn(boost::get<flat_mutation_reader>(_reader_or_mutation_source));
return boost::get<pending_state>(_state).semaphore->wait_admission(new_reader_base_cost).then(
[this, fn = std::move(fn)] (lw_shared_ptr<reader_concurrency_semaphore::reader_permit> permit) mutable {
auto reader_factory = std::move(boost::get<pending_state>(_state).reader_factory);
_state = admitted_state{permit, reader_factory(reader_resource_tracker(permit))};
return fn(boost::get<admitted_state>(_state).reader);
});
}
public:
restricting_mutation_reader(const restricted_mutation_reader_config& config,
restricting_mutation_reader(reader_concurrency_semaphore& semaphore,
mutation_source ms,
schema_ptr s,
const dht::partition_range& range,
@@ -872,20 +897,8 @@ public:
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr)
: impl(s)
, _config(config)
, _reader_or_mutation_source(
mutation_source_and_params{std::move(ms), std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr}) {
if (_config.resources_sem->waiters() >= _config.max_queue_length) {
_config.raise_queue_overloaded_exception();
}
}
~restricting_mutation_reader() {
if (boost::get<flat_mutation_reader>(&_reader_or_mutation_source)) {
_config.resources_sem->signal(new_reader_base_cost);
if (_config.active_reads) {
--(*_config.active_reads);
}
}
, _state(pending_state{&semaphore,
mutation_source_and_params{std::move(ms), std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr}}) {
}
virtual future<> fill_buffer() override {
@@ -904,8 +917,8 @@ public:
return;
}
_end_of_stream = false;
if (auto* reader = boost::get<flat_mutation_reader>(&_reader_or_mutation_source)) {
return reader->next_partition();
if (auto* state = boost::get<admitted_state>(&_state)) {
return state->reader.next_partition();
}
}
virtual future<> fast_forward_to(const dht::partition_range& pr) override {
@@ -925,7 +938,7 @@ public:
};
flat_mutation_reader
make_restricted_flat_reader(const restricted_mutation_reader_config& config,
make_restricted_flat_reader(reader_concurrency_semaphore& semaphore,
mutation_source ms,
schema_ptr s,
const dht::partition_range& range,
@@ -934,7 +947,7 @@ make_restricted_flat_reader(const restricted_mutation_reader_config& config,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) {
return make_flat_mutation_reader<restricting_mutation_reader>(config, std::move(ms), std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr);
return make_flat_mutation_reader<restricting_mutation_reader>(semaphore, std::move(ms), std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr);
}


@@ -30,6 +30,7 @@
#include "core/do_with.hh"
#include "tracing/trace_state.hh"
#include "flat_mutation_reader.hh"
#include "reader_concurrency_semaphore.hh"
// A mutation_reader is an object which allows iterating on mutations: invoke
// the function to get a future for the next mutation, with an unset optional
@@ -275,7 +276,8 @@ class mutation_source {
io_priority,
tracing::trace_state_ptr,
streamed_mutation::forwarding,
mutation_reader::forwarding
mutation_reader::forwarding,
reader_resource_tracker
)>;
using flat_reader_factory_type = std::function<flat_mutation_reader(schema_ptr,
partition_range,
@@ -283,7 +285,8 @@ class mutation_source {
io_priority,
tracing::trace_state_ptr,
streamed_mutation::forwarding,
mutation_reader::forwarding)>;
mutation_reader::forwarding,
reader_resource_tracker)>;
class impl {
public:
virtual ~impl() { }
@@ -293,14 +296,16 @@ class mutation_source {
io_priority pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) = 0;
mutation_reader::forwarding fwd_mr,
reader_resource_tracker tracker) = 0;
virtual flat_mutation_reader make_flat_mutation_reader(schema_ptr s,
partition_range range,
const query::partition_slice& slice,
io_priority pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) = 0;
mutation_reader::forwarding fwd_mr,
reader_resource_tracker tracker) = 0;
};
class mutation_reader_mutation_source : public impl {
func_type _fn;
@@ -312,8 +317,9 @@ class mutation_source {
io_priority pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) override {
return _fn(std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr);
mutation_reader::forwarding fwd_mr,
reader_resource_tracker tracker) override {
return _fn(std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr, tracker);
}
virtual flat_mutation_reader make_flat_mutation_reader(schema_ptr s,
partition_range range,
@@ -321,9 +327,10 @@ class mutation_source {
io_priority pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) override {
mutation_reader::forwarding fwd_mr,
reader_resource_tracker tracker) override {
return flat_mutation_reader_from_mutation_reader(s,
_fn(s, range, slice, pc, std::move(trace_state), fwd, fwd_mr),
_fn(s, range, slice, pc, std::move(trace_state), fwd, fwd_mr, tracker),
fwd);
}
};
@@ -337,8 +344,9 @@ class mutation_source {
io_priority pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) override {
return mutation_reader_from_flat_mutation_reader(_fn(std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr));
mutation_reader::forwarding fwd_mr,
reader_resource_tracker tracker) override {
return mutation_reader_from_flat_mutation_reader(_fn(std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr, tracker));
}
virtual flat_mutation_reader make_flat_mutation_reader(schema_ptr s,
partition_range range,
@@ -346,8 +354,9 @@ class mutation_source {
io_priority pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) override {
return _fn(std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr);
mutation_reader::forwarding fwd_mr,
reader_resource_tracker tracker) override {
return _fn(std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr, tracker);
}
};
// We could have our own version of std::function<> that is nothrow
@@ -368,23 +377,78 @@ public:
: _impl(seastar::make_shared<flat_mutation_reader_mutation_source>(std::move(fn)))
, _presence_checker_factory(make_lw_shared(std::move(pcf)))
{ }
mutation_source(std::function<flat_mutation_reader(schema_ptr, partition_range, const query::partition_slice&, io_priority,
tracing::trace_state_ptr, streamed_mutation::forwarding, mutation_reader::forwarding)> fn,
std::function<partition_presence_checker()> pcf = [] { return make_default_partition_presence_checker(); })
: mutation_source([fn = std::move(fn)] (schema_ptr s,
partition_range range,
const query::partition_slice& slice,
io_priority pc,
tracing::trace_state_ptr tr,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr,
reader_resource_tracker) {
return fn(s, range, slice, pc, std::move(tr), fwd, fwd_mr);
}
, std::move(pcf)) {}
mutation_source(std::function<mutation_reader(schema_ptr, partition_range, const query::partition_slice&, io_priority,
tracing::trace_state_ptr, streamed_mutation::forwarding, mutation_reader::forwarding)> fn)
: mutation_source([fn = std::move(fn)] (schema_ptr s,
partition_range range,
const query::partition_slice& slice,
io_priority pc,
tracing::trace_state_ptr tr,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr,
reader_resource_tracker) {
return fn(s, range, slice, pc, std::move(tr), fwd, fwd_mr);
}) {}
// For sources which don't care about the mutation_reader::forwarding flag (always fast forwardable)
mutation_source(std::function<mutation_reader(schema_ptr s, partition_range range, const query::partition_slice& slice, io_priority pc, tracing::trace_state_ptr, streamed_mutation::forwarding)> fn)
: mutation_source([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice& slice, io_priority pc, tracing::trace_state_ptr tr, streamed_mutation::forwarding fwd, mutation_reader::forwarding) {
: mutation_source([fn = std::move(fn)] (schema_ptr s,
partition_range range,
const query::partition_slice& slice,
io_priority pc,
tracing::trace_state_ptr tr,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding,
reader_resource_tracker) {
return fn(s, range, slice, pc, std::move(tr), fwd);
}) {}
mutation_source(std::function<mutation_reader(schema_ptr, partition_range, const query::partition_slice&, io_priority)> fn)
: mutation_source([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice& slice, io_priority pc, tracing::trace_state_ptr, streamed_mutation::forwarding fwd, mutation_reader::forwarding) {
: mutation_source([fn = std::move(fn)] (schema_ptr s,
partition_range range,
const query::partition_slice& slice,
io_priority pc,
tracing::trace_state_ptr,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding,
reader_resource_tracker) {
assert(!fwd);
return fn(s, range, slice, pc);
}) {}
mutation_source(std::function<mutation_reader(schema_ptr, partition_range, const query::partition_slice&)> fn)
: mutation_source([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice& slice, io_priority, tracing::trace_state_ptr, streamed_mutation::forwarding fwd, mutation_reader::forwarding) {
: mutation_source([fn = std::move(fn)] (schema_ptr s,
partition_range range,
const query::partition_slice& slice,
io_priority,
tracing::trace_state_ptr,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding,
reader_resource_tracker) {
assert(!fwd);
return fn(s, range, slice);
}) {}
mutation_source(std::function<mutation_reader(schema_ptr, partition_range range)> fn)
: mutation_source([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice&, io_priority, tracing::trace_state_ptr, streamed_mutation::forwarding fwd, mutation_reader::forwarding) {
: mutation_source([fn = std::move(fn)] (schema_ptr s,
partition_range range,
const query::partition_slice&,
io_priority,
tracing::trace_state_ptr,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding,
reader_resource_tracker) {
assert(!fwd);
return fn(s, range);
}) {}
@@ -404,9 +468,10 @@ public:
io_priority pc = default_priority_class(),
tracing::trace_state_ptr trace_state = nullptr,
streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes,
reader_resource_tracker tracker = no_resource_tracking()) const
{
return _impl->make_mutation_reader(std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr);
return _impl->make_mutation_reader(std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr, tracker);
}
mutation_reader operator()(schema_ptr s, partition_range range = query::full_partition_range) const {
@@ -422,9 +487,10 @@ public:
io_priority pc = default_priority_class(),
tracing::trace_state_ptr trace_state = nullptr,
streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes,
reader_resource_tracker tracker = no_resource_tracking()) const
{
return _impl->make_flat_mutation_reader(std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr);
return _impl->make_flat_mutation_reader(std::move(s), range, slice, pc, std::move(trace_state), fwd, fwd_mr, tracker);
}
flat_mutation_reader
@@ -467,18 +533,6 @@ public:
mutation_source make_empty_mutation_source();
snapshot_source make_empty_snapshot_source();
struct restricted_mutation_reader_config {
semaphore* resources_sem = nullptr;
uint64_t* active_reads = nullptr;
std::chrono::nanoseconds timeout = {};
size_t max_queue_length = std::numeric_limits<size_t>::max();
std::function<void ()> raise_queue_overloaded_exception = default_raise_queue_overloaded_exception;
static void default_raise_queue_overloaded_exception() {
throw std::runtime_error("restricted mutation reader queue overload");
}
};
// Creates a restricted reader whose resource usage will be tracked
// during its lifetime. If there are not enough resources (due to
// existing readers) to create the new reader, its construction will
@@ -488,7 +542,7 @@ struct restricted_mutation_reader_config {
// a semaphore to track and limit the memory usage of readers. It also
// contains a timeout and a maximum queue size for inactive readers
// whose construction is blocked.
flat_mutation_reader make_restricted_flat_reader(const restricted_mutation_reader_config& config,
flat_mutation_reader make_restricted_flat_reader(reader_concurrency_semaphore& semaphore,
mutation_source ms,
schema_ptr s,
const dht::partition_range& range,
@@ -498,12 +552,12 @@ flat_mutation_reader make_restricted_flat_reader(const restricted_mutation_reade
streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes);
inline flat_mutation_reader make_restricted_flat_reader(const restricted_mutation_reader_config& config,
inline flat_mutation_reader make_restricted_flat_reader(reader_concurrency_semaphore& semaphore,
mutation_source ms,
schema_ptr s,
const dht::partition_range& range = query::full_partition_range) {
auto& full_slice = s->full_slice();
return make_restricted_flat_reader(config, std::move(ms), std::move(s), range, full_slice);
return make_restricted_flat_reader(semaphore, std::move(ms), std::move(s), range, full_slice);
}
template<>


@@ -259,6 +259,11 @@ public:
return is_partition_end() || (_ck && _ck->is_empty(s) && _bound_weight > 0);
}
bool is_before_all_clustered_rows(const schema& s) const {
return _type < partition_region::clustered
|| (_type == partition_region::clustered && _ck->is_empty(s) && _bound_weight < 0);
}
template<typename Hasher>
void feed_hash(Hasher& hasher, const schema& s) const {
::feed_hash(hasher, _bound_weight);


@@ -0,0 +1,204 @@
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright (C) 2017 ScyllaDB
*/
#pragma once
#include <core/file.hh>
#include <core/semaphore.hh>
/// Specific semaphore for controlling reader concurrency
///
/// Before creating a reader one should obtain a permit by calling
/// `wait_admission()`. This permit can then be used for tracking the
/// reader's memory consumption via `reader_resource_tracker`.
/// The permit should be held onto for the lifetime of the reader
/// and/or any buffers it is tracking.
/// Reader concurrency is limited both by count and by memory.
/// The semaphore can be configured with the desired limits on
/// construction. New readers are only admitted when both enough
/// count units and enough memory units are available. Readers are
/// admitted in FIFO order.
/// It's possible to specify the maximum allowed number of waiting
/// readers via the `max_queue_length` constructor parameter. When the
/// number of waiting readers would be equal to or greater than this
/// number (on a call to `wait_admission()`) an exception is thrown.
/// The exception type, and optionally some additional code to run
/// when this happens, can be customized via the
/// `raise_queue_overloaded_exception` constructor parameter. This
/// function is called every time the queue limit is surpassed.
/// It is expected to return an `std::exception_ptr` that will be
/// injected into the future.
class reader_concurrency_semaphore {
public:
using timeout_clock = lowres_clock;
static const timeout_clock::duration no_timeout;
struct resources {
int count = 0;
ssize_t memory = 0;
resources() = default;
resources(int count, ssize_t memory)
: count(count)
, memory(memory) {
}
bool operator>=(const resources& other) const {
return count >= other.count && memory >= other.memory;
}
resources& operator-=(const resources& other) {
count -= other.count;
memory -= other.memory;
return *this;
}
resources& operator+=(const resources& other) {
count += other.count;
memory += other.memory;
return *this;
}
explicit operator bool() const {
return count >= 0 && memory >= 0;
}
};
class reader_permit {
reader_concurrency_semaphore& _semaphore;
const resources _base_cost;
public:
reader_permit(reader_concurrency_semaphore& semaphore, resources base_cost)
: _semaphore(semaphore)
, _base_cost(base_cost) {
}
~reader_permit() {
_semaphore.signal(_base_cost);
}
reader_permit(const reader_permit&) = delete;
reader_permit& operator=(const reader_permit&) = delete;
reader_permit(reader_permit&& other) = delete;
reader_permit& operator=(reader_permit&& other) = delete;
void consume_memory(size_t memory) {
_semaphore.consume_memory(memory);
}
void signal_memory(size_t memory) {
_semaphore.signal_memory(memory);
}
};
private:
static std::exception_ptr default_make_queue_overloaded_exception() {
return std::make_exception_ptr(std::runtime_error("restricted mutation reader queue overload"));
}
resources _resources;
struct entry {
promise<lw_shared_ptr<reader_permit>> pr;
resources res;
entry(promise<lw_shared_ptr<reader_permit>>&& pr, resources r) : pr(std::move(pr)), res(r) {}
};
struct expiry_handler {
void operator()(entry& e) noexcept {
e.pr.set_exception(semaphore_timed_out());
}
};
expiring_fifo<entry, expiry_handler, timeout_clock> _wait_list;
timeout_clock::duration _timeout;
size_t _max_queue_length = std::numeric_limits<size_t>::max();
std::function<std::exception_ptr()> _make_queue_overloaded_exception = default_make_queue_overloaded_exception;
bool has_available_units(const resources& r) const {
return bool(_resources) && _resources >= r;
}
bool may_proceed(const resources& r) const {
return has_available_units(r) && _wait_list.empty();
}
void consume_memory(size_t memory) {
_resources.memory -= memory;
}
void signal(const resources& r);
void signal_memory(size_t memory) {
signal(resources(0, static_cast<ssize_t>(memory)));
}
public:
reader_concurrency_semaphore(unsigned count,
size_t memory,
timeout_clock::duration timeout = no_timeout,
size_t max_queue_length = std::numeric_limits<size_t>::max(),
std::function<std::exception_ptr()> raise_queue_overloaded_exception = default_make_queue_overloaded_exception)
: _resources(count, memory)
, _timeout(timeout)
, _max_queue_length(max_queue_length)
, _make_queue_overloaded_exception(raise_queue_overloaded_exception) {
}
reader_concurrency_semaphore(const reader_concurrency_semaphore&) = delete;
reader_concurrency_semaphore& operator=(const reader_concurrency_semaphore&) = delete;
reader_concurrency_semaphore(reader_concurrency_semaphore&&) = delete;
reader_concurrency_semaphore& operator=(reader_concurrency_semaphore&&) = delete;
future<lw_shared_ptr<reader_permit>> wait_admission(size_t memory);
const resources available_resources() const {
return _resources;
}
size_t waiters() const {
return _wait_list.size();
}
};
class reader_resource_tracker {
lw_shared_ptr<reader_concurrency_semaphore::reader_permit> _permit;
public:
reader_resource_tracker() = default;
explicit reader_resource_tracker(lw_shared_ptr<reader_concurrency_semaphore::reader_permit> permit)
: _permit(std::move(permit)) {
}
bool operator==(const reader_resource_tracker& other) const {
return _permit == other._permit;
}
file track(file f) const;
lw_shared_ptr<reader_concurrency_semaphore::reader_permit> get_permit() const {
return _permit;
}
};
inline reader_resource_tracker no_resource_tracking() {
return {};
}
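The dual count-plus-memory admission rule implemented by `resources::operator>=` and `wait_admission()` above can be sketched as a simplified model (Python rather than the actual C++; `try_admit` and the synchronous return value are illustrative — the real semaphore queues waiters and returns a future):

```python
class Resources:
    """Mirror of the C++ resources struct: admission needs both units."""
    def __init__(self, count, memory):
        self.count = count
        self.memory = memory

    def __ge__(self, other):
        return self.count >= other.count and self.memory >= other.memory

    def __isub__(self, other):
        self.count -= other.count
        self.memory -= other.memory
        return self


class ReaderSemaphore:
    NEW_READER_BASE_COST = 16 * 1024  # matches new_reader_base_cost

    def __init__(self, count, memory):
        self._resources = Resources(count, memory)

    def try_admit(self, memory=NEW_READER_BASE_COST):
        # One count unit plus the reader's base memory cost.
        cost = Resources(1, memory)
        if self._resources >= cost:
            self._resources -= cost
            return True
        return False  # the real implementation would queue the waiter


sem = ReaderSemaphore(count=2, memory=32 * 1024)
admitted = [sem.try_admit() for _ in range(3)]  # third reader is refused
```

With two count units and 32 KiB of memory, two 16 KiB readers are admitted and the third must wait, regardless of which of the two limits runs out first.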


@@ -1,48 +0,0 @@
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright (C) 2017 ScyllaDB
*/
#pragma once
#include <core/file.hh>
#include <core/semaphore.hh>
class reader_resource_tracker {
seastar::semaphore* _sem = nullptr;
public:
reader_resource_tracker() = default;
explicit reader_resource_tracker(seastar::semaphore* sem)
: _sem(sem) {
}
bool operator==(const reader_resource_tracker& other) const {
return _sem == other._sem;
}
file track(file f) const;
semaphore* get_semaphore() const {
return _sem;
}
};
inline reader_resource_tracker no_resource_tracking() {
return reader_resource_tracker(nullptr);
}


@@ -87,7 +87,7 @@ def get_repo_file(dir):
for name in files:
with open(name, 'r') as myfile:
for line in myfile:
match = re.search(".*http.?://.*/scylladb/([^/\s]+)/deb/([^/\s]+)\s.*", line)
match = re.search(".*http.?://.*/scylladb/([^/\s]+)/deb/([^/\s]+)[\s/].*", line)
if match:
return match.group(2), match.group(1)
match = re.search(".*http.?://.*/scylladb/([^/]+)/rpm/[^/]+/([^/\s]+)/.*", line)
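The one-character widening above (`\s` to `[\s/]`) matters because the old pattern only matched when whitespace immediately followed the second path component, so repo URLs whose path continues with a `/` were silently skipped. A small check with hypothetical repo-file lines (the URLs are illustrative, only the two patterns are taken verbatim from the diff):

```python
import re

old_pat = r".*http.?://.*/scylladb/([^/\s]+)/deb/([^/\s]+)\s.*"
new_pat = r".*http.?://.*/scylladb/([^/\s]+)/deb/([^/\s]+)[\s/].*"

# Hypothetical repo lines; the second continues the path with '/'.
line_space = "deb http://downloads.scylladb.com/scylladb/scylla/deb/unstable main"
line_slash = "deb http://downloads.scylladb.com/scylladb/scylla/deb/unstable/extra main"

assert re.search(old_pat, line_space)        # old pattern: OK
assert not re.search(old_pat, line_slash)    # old pattern: misses this line
assert re.search(new_pat, line_slash)        # new pattern: matches both
```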

Submodule seastar updated: af1b789855...c89c8b8043


@@ -144,7 +144,11 @@ future<lowres_clock::duration> cache_hitrate_calculator::recalculate_hitrates()
return _db.invoke_on_all([this, rates = std::move(rates), cpuid = engine().cpu_id()] (database& db) {
sstring gstate;
for (auto& cf : db.get_column_families() | boost::adaptors::filtered(non_system_filter)) {
stat s = rates.at(cf.first);
auto it = rates.find(cf.first);
if (it == rates.end()) { // a table may be added before map/reduce completes and this code runs
continue;
}
stat s = it->second;
float rate = 0;
if (s.h) {
rate = s.h / (s.h + s.m);
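The hunk above replaces `rates.at(cf.first)`, which throws when the key is absent, with a `find()` plus an explicit skip: a table created after the map/reduce snapshot was taken simply has no entry yet. The same lookup-and-skip shape in a simplified model (Python; table names and stats are illustrative):

```python
rates = {"ks.t1": (8, 2)}        # hits, misses per table (snapshot)
tables = ["ks.t1", "ks.t2"]      # ks.t2 was created after the snapshot

hitrates = {}
for name in tables:
    stat = rates.get(name)       # the find() in the C++ code
    if stat is None:             # rates.end(): table added concurrently
        continue                 # previously: rates.at(name) -> exception
    h, m = stat
    hitrates[name] = h / (h + m) if h else 0.0
```

After the loop `hitrates` holds only the tables present in the snapshot, instead of aborting the whole recalculation on the first missing one.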


@@ -1011,6 +1011,7 @@ void storage_service::on_change(inet_address endpoint, application_state state,
boost::split(pieces, value.value, boost::is_any_of(sstring(versioned_value::DELIMITER_STR)));
if (pieces.empty()) {
slogger.warn("Fail to split status in on_change: endpoint={}, app_state={}, value={}", endpoint, state, value);
return;
}
sstring move_name = pieces[0];
if (move_name == sstring(versioned_value::STATUS_BOOTSTRAPPING)) {


@@ -27,7 +27,7 @@
#include "core/temporary_buffer.hh"
#include "consumer.hh"
#include "sstables/types.hh"
#include "reader_resource_tracker.hh"
#include "reader_concurrency_semaphore.hh"
// sstables::data_consume_row feeds the contents of a single row into a
// row_consumer object:


@@ -175,10 +175,10 @@ private:
bool _complete_sent = false;
bool _received_failed_complete_message = false;
// If the session is idle for 300 minutes, close the session
std::chrono::seconds _keep_alive_timeout{60 * 300};
// Check every 10 minutes
std::chrono::seconds _keep_alive_interval{60 * 10};
// If the session is idle for 10 minutes, close the session
std::chrono::seconds _keep_alive_timeout{60 * 10};
// Check every minute
std::chrono::seconds _keep_alive_interval{60};
timer<lowres_clock> _keep_alive;
stream_bytes _last_stream_bytes;
lowres_clock::time_point _last_stream_progress;


@@ -148,9 +148,9 @@ if __name__ == "__main__":
for mode in modes_to_run:
prefix = os.path.join('build', mode, 'tests')
for test in other_tests:
test_to_run.append((os.path.join(prefix, test), 'other'))
test_to_run.append((os.path.join(prefix, test), 'other', '-c2 -m4G'.split()))
for test in boost_tests:
test_to_run.append((os.path.join(prefix, test), 'boost'))
test_to_run.append((os.path.join(prefix, test), 'boost', '-c2 -m4G'.split()))
if 'release' in modes_to_run:
test_to_run.append(('build/release/tests/lsa_async_eviction_test', 'other',


@@ -1395,14 +1395,14 @@ SEASTAR_TEST_CASE(test_ttl) {
{{"p1", utf8_type}}, {}, {{"r1", utf8_type}, {"r2", utf8_type}, {"r3", make_my_list_type()}}, {}, utf8_type);
}).then([&e] {
return e.execute_cql(
"update cf using ttl 1000 set r1 = 'value1_1', r3 = ['a', 'b', 'c'] where p1 = 'key1';").discard_result();
"update cf using ttl 100000 set r1 = 'value1_1', r3 = ['a', 'b', 'c'] where p1 = 'key1';").discard_result();
}).then([&e] {
return e.execute_cql(
"update cf using ttl 1 set r1 = 'value1_3', r3 = ['a', 'b', 'c'] where p1 = 'key3';").discard_result();
"update cf using ttl 100 set r1 = 'value1_3', r3 = ['a', 'b', 'c'] where p1 = 'key3';").discard_result();
}).then([&e] {
return e.execute_cql("update cf using ttl 1 set r3[1] = 'b' where p1 = 'key1';").discard_result();
return e.execute_cql("update cf using ttl 100 set r3[1] = 'b' where p1 = 'key1';").discard_result();
}).then([&e] {
return e.execute_cql("update cf using ttl 1 set r1 = 'value1_2' where p1 = 'key2';").discard_result();
return e.execute_cql("update cf using ttl 100 set r1 = 'value1_2' where p1 = 'key2';").discard_result();
}).then([&e] {
return e.execute_cql("insert into cf (p1, r2) values ('key2', 'value2_2');").discard_result();
}).then([&e, my_list_type] {
@@ -1420,7 +1420,7 @@ SEASTAR_TEST_CASE(test_ttl) {
});
});
}).then([&e] {
forward_jump_clocks(2s);
forward_jump_clocks(200s);
return e.execute_cql("select r1, r2 from cf;").then([](auto msg) {
assert_that(msg).is_rows().with_size(2)
.with_row({{}, utf8_type->decompose(sstring("value2_2"))})
@@ -1448,7 +1448,7 @@ SEASTAR_TEST_CASE(test_ttl) {
}).then([&e] {
return e.execute_cql("create table cf2 (p1 text PRIMARY KEY, r1 text, r2 text);").discard_result();
}).then([&e] {
return e.execute_cql("insert into cf2 (p1, r1) values ('foo', 'bar') using ttl 5;").discard_result();
return e.execute_cql("insert into cf2 (p1, r1) values ('foo', 'bar') using ttl 500;").discard_result();
}).then([&e] {
return e.execute_cql("select p1, r1 from cf2 where p1 = 'foo';").then([] (auto msg) {
assert_that(msg).is_rows().with_rows({
@@ -1456,7 +1456,7 @@ SEASTAR_TEST_CASE(test_ttl) {
});
});
}).then([&e] {
forward_jump_clocks(6s);
forward_jump_clocks(600s);
return e.execute_cql("select p1, r1 from cf2 where p1 = 'foo';").then([] (auto msg) {
assert_that(msg).is_rows().with_rows({ });
});
@@ -1471,7 +1471,7 @@ SEASTAR_TEST_CASE(test_ttl) {
});
});
}).then([&e] {
return e.execute_cql("insert into cf2 (p1, r1) values ('foo', 'bar') using ttl 5;").discard_result();
return e.execute_cql("insert into cf2 (p1, r1) values ('foo', 'bar') using ttl 500;").discard_result();
}).then([&e] {
return e.execute_cql("update cf2 set r1 = null where p1 = 'foo';").discard_result();
}).then([&e] {
@@ -1481,16 +1481,16 @@ SEASTAR_TEST_CASE(test_ttl) {
});
});
}).then([&e] {
forward_jump_clocks(6s);
forward_jump_clocks(600s);
return e.execute_cql("select p1, r1 from cf2 where p1 = 'foo';").then([] (auto msg) {
assert_that(msg).is_rows().with_rows({ });
});
}).then([&e] {
return e.execute_cql("insert into cf2 (p1, r1) values ('foo', 'bar') using ttl 5;").discard_result();
return e.execute_cql("insert into cf2 (p1, r1) values ('foo', 'bar') using ttl 500;").discard_result();
}).then([&e] {
return e.execute_cql("insert into cf2 (p1, r2) values ('foo', null);").discard_result();
}).then([&e] {
forward_jump_clocks(6s);
forward_jump_clocks(600s);
return e.execute_cql("select p1, r1 from cf2 where p1 = 'foo';").then([] (auto msg) {
assert_that(msg).is_rows().with_rows({
{utf8_type->decompose(sstring("foo")), { }}
@@ -1987,10 +1987,9 @@ SEASTAR_TEST_CASE(test_in_restriction) {
assert_that(msg).is_rows().with_size(0);
return e.execute_cql("select r1 from tir where p1 in (2, 0, 2, 1);");
}).then([&e] (auto msg) {
assert_that(msg).is_rows().with_rows({
assert_that(msg).is_rows().with_rows_ignore_order({
{int32_type->decompose(4)},
{int32_type->decompose(0)},
{int32_type->decompose(4)},
{int32_type->decompose(1)},
{int32_type->decompose(2)},
{int32_type->decompose(3)},
@@ -2012,6 +2011,22 @@ SEASTAR_TEST_CASE(test_in_restriction) {
{int32_type->decompose(2)},
{int32_type->decompose(1)},
});
return e.prepare("select r1 from tir where p1 in ?");
}).then([&e] (cql3::prepared_cache_key_type prepared_id){
auto my_list_type = list_type_impl::get_instance(int32_type, true);
std::vector<cql3::raw_value> raw_values;
auto in_values_list = my_list_type->decompose(make_list_value(my_list_type,
list_type_impl::native_type{{int(2), int(0), int(2), int(1)}}));
raw_values.emplace_back(cql3::raw_value::make_value(in_values_list));
return e.execute_prepared(prepared_id,raw_values);
}).then([&e] (shared_ptr<cql_transport::messages::result_message> msg) {
assert_that(msg).is_rows().with_rows_ignore_order({
{int32_type->decompose(4)},
{int32_type->decompose(0)},
{int32_type->decompose(1)},
{int32_type->decompose(2)},
{int32_type->decompose(3)},
});
});
});
}
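The bug this test pins down (see the commit message for #2837): each value in a single-column IN list fans out to an executor for that partition, so a duplicated value produced the same row more than once. Conceptually the fix deduplicates the values before dispatch, which is what the `with_rows_ignore_order` expectations above assume. A simplified model (Python; the table contents are illustrative, not the test's schema):

```python
table = {0: 4, 1: 2, 2: 0}        # p1 -> r1 (illustrative data)

def select_in(values, dedup):
    # dict.fromkeys dedups while preserving first-seen order.
    keys = list(dict.fromkeys(values)) if dedup else values
    return [table[v] for v in keys if v in table]

buggy = select_in([2, 0, 2, 1], dedup=False)   # row for p1=2 appears twice
fixed = select_in([2, 0, 2, 1], dedup=True)    # one row per matching partition
```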
@@ -2479,3 +2494,66 @@ SEASTAR_TEST_CASE(test_secondary_index_query) {
});
});
}
SEASTAR_TEST_CASE(test_static_multi_cell_static_lists_with_ckey) {
return do_with_cql_env_thread([] (cql_test_env& e) {
e.execute_cql("CREATE TABLE t (p int, c int, slist list<int> static, v int, PRIMARY KEY (p, c));").get();
e.execute_cql("INSERT INTO t (p, c, slist, v) VALUES (1, 1, [1], 1); ").get();
{
e.execute_cql("UPDATE t SET slist[0] = 3, v = 3 WHERE p = 1 AND c = 1;").get();
auto msg = e.execute_cql("SELECT slist, v FROM t WHERE p = 1 AND c = 1;").get0();
auto slist_type = list_type_impl::get_instance(int32_type, true);
assert_that(msg).is_rows().with_row({
{ slist_type->decompose(make_list_value(slist_type, list_type_impl::native_type({{3}}))) },
{ int32_type->decompose(3) }
});
}
{
e.execute_cql("UPDATE t SET slist = [4], v = 4 WHERE p = 1 AND c = 1;").get();
auto msg = e.execute_cql("SELECT slist, v FROM t WHERE p = 1 AND c = 1;").get0();
auto slist_type = list_type_impl::get_instance(int32_type, true);
assert_that(msg).is_rows().with_row({
{ slist_type->decompose(make_list_value(slist_type, list_type_impl::native_type({{4}}))) },
{ int32_type->decompose(4) }
});
}
{
e.execute_cql("UPDATE t SET slist = [3] + slist , v = 5 WHERE p = 1 AND c = 1;").get();
auto msg = e.execute_cql("SELECT slist, v FROM t WHERE p = 1 AND c = 1;").get0();
auto slist_type = list_type_impl::get_instance(int32_type, true);
assert_that(msg).is_rows().with_row({
{ slist_type->decompose(make_list_value(slist_type, list_type_impl::native_type({3, 4}))) },
{ int32_type->decompose(5) }
});
}
{
e.execute_cql("UPDATE t SET slist = slist + [5] , v = 6 WHERE p = 1 AND c = 1;").get();
auto msg = e.execute_cql("SELECT slist, v FROM t WHERE p = 1 AND c = 1;").get0();
auto slist_type = list_type_impl::get_instance(int32_type, true);
assert_that(msg).is_rows().with_row({
{ slist_type->decompose(make_list_value(slist_type, list_type_impl::native_type({3, 4, 5}))) },
{ int32_type->decompose(6) }
});
}
{
e.execute_cql("DELETE slist[2] from t WHERE p = 1;").get();
auto msg = e.execute_cql("SELECT slist, v FROM t WHERE p = 1 AND c = 1;").get0();
auto slist_type = list_type_impl::get_instance(int32_type, true);
assert_that(msg).is_rows().with_row({
{ slist_type->decompose(make_list_value(slist_type, list_type_impl::native_type({3, 4}))) },
{ int32_type->decompose(6) }
});
}
{
e.execute_cql("UPDATE t SET slist = slist - [4] , v = 7 WHERE p = 1 AND c = 1;").get();
auto msg = e.execute_cql("SELECT slist, v FROM t WHERE p = 1 AND c = 1;").get0();
auto slist_type = list_type_impl::get_instance(int32_type, true);
assert_that(msg).is_rows().with_row({
{ slist_type->decompose(make_list_value(slist_type, list_type_impl::native_type({3}))) },
{ int32_type->decompose(7) }
});
}
});
}


@@ -29,6 +29,9 @@
#include "database.hh"
#include "partition_slice_builder.hh"
#include "frozen_mutation.hh"
#include "mutation_source_test.hh"
#include "schema_registry.hh"
#include "service/migration_manager.hh"
#include "disk-error-handler.hh"
@@ -79,3 +82,33 @@ SEASTAR_TEST_CASE(test_querying_with_limits) {
});
});
}
SEASTAR_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source) {
return do_with_cql_env_thread([] (cql_test_env& e) {
run_mutation_source_tests([&] (schema_ptr s, const std::vector<mutation>& partitions) -> mutation_source {
try {
e.local_db().find_column_family(s->ks_name(), s->cf_name());
service::get_local_migration_manager().announce_column_family_drop(s->ks_name(), s->cf_name(), true).get();
} catch (const no_such_column_family&) {
// expected
}
service::get_local_migration_manager().announce_new_column_family(s, true).get();
column_family& cf = e.local_db().find_column_family(s);
for (auto&& m : partitions) {
e.local_db().apply(cf.schema(), freeze(m)).get();
}
cf.flush().get();
cf.get_row_cache().invalidate([] {}).get();
return mutation_source([&] (schema_ptr s,
const dht::partition_range& range,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) {
return cf.make_reader(s, range, slice, pc, std::move(trace_state), fwd, fwd_mr);
});
});
return make_ready_future<>();
});
}


@@ -26,11 +26,13 @@
#include <boost/test/unit_test.hpp>
#include <query-result-set.hh>
#include <query-result-writer.hh>
#include "tests/test_services.hh"
#include "tests/test-utils.hh"
#include "tests/mutation_assertions.hh"
#include "tests/result_set_assertions.hh"
#include "tests/mutation_source_test.hh"
#include "mutation_query.hh"
#include "core/do_with.hh"
@@ -530,3 +532,22 @@ SEASTAR_TEST_CASE(test_partition_limit) {
}
});
}
SEASTAR_THREAD_TEST_CASE(test_result_size_calculation) {
random_mutation_generator gen(random_mutation_generator::generate_counters::no);
std::vector<mutation> mutations = gen(1);
schema_ptr s = gen.schema();
mutation_source source = make_source(std::move(mutations));
query::result_memory_limiter l(std::numeric_limits<ssize_t>::max());
query::partition_slice slice = make_full_slice(*s);
slice.options.set<query::partition_slice::option::allow_short_read>();
query::result::builder digest_only_builder(slice, query::result_options{query::result_request::only_digest, query::digest_algorithm::xxHash}, l.new_digest_read(query::result_memory_limiter::maximum_result_size).get0());
data_query(s, source, query::full_partition_range, slice, std::numeric_limits<uint32_t>::max(), std::numeric_limits<uint32_t>::max(), gc_clock::now(), digest_only_builder).get0();
query::result::builder result_and_digest_builder(slice, query::result_options{query::result_request::result_and_digest, query::digest_algorithm::xxHash}, l.new_data_read(query::result_memory_limiter::maximum_result_size).get0());
data_query(s, source, query::full_partition_range, slice, std::numeric_limits<uint32_t>::max(), std::numeric_limits<uint32_t>::max(), gc_clock::now(), result_and_digest_builder).get0();
BOOST_REQUIRE_EQUAL(digest_only_builder.memory_accounter().used_memory(), result_and_digest_builder.memory_accounter().used_memory());
}


@@ -753,19 +753,18 @@ class tracking_reader : public flat_mutation_reader::impl {
std::size_t _call_count{0};
std::size_t _ff_count{0};
public:
tracking_reader(semaphore* resources_sem, schema_ptr schema, lw_shared_ptr<sstables::sstable> sst)
tracking_reader(schema_ptr schema, lw_shared_ptr<sstables::sstable> sst, reader_resource_tracker tracker)
: impl(schema)
, _reader(sst->read_range_rows_flat(
schema,
query::full_partition_range,
schema->full_slice(),
default_priority_class(),
reader_resource_tracker(resources_sem),
tracker,
streamed_mutation::forwarding::no,
mutation_reader::forwarding::yes)) {
}
virtual future<> fill_buffer() override {
++_call_count;
return _reader.fill_buffer().then([this] {
@@ -811,16 +810,25 @@ class reader_wrapper {
public:
reader_wrapper(
const restricted_mutation_reader_config& config,
reader_concurrency_semaphore& semaphore,
schema_ptr schema,
lw_shared_ptr<sstables::sstable> sst) : _reader(make_empty_flat_reader(schema)) {
auto ms = mutation_source([this, &config, sst=std::move(sst)] (schema_ptr schema, const dht::partition_range&, auto&&...) {
auto tracker_ptr = std::make_unique<tracking_reader>(config.resources_sem, std::move(schema), std::move(sst));
lw_shared_ptr<sstables::sstable> sst)
: _reader(make_empty_flat_reader(schema))
{
auto ms = mutation_source([this, sst=std::move(sst)] (schema_ptr schema,
const dht::partition_range&,
const query::partition_slice&,
const io_priority_class&,
tracing::trace_state_ptr,
streamed_mutation::forwarding,
mutation_reader::forwarding,
reader_resource_tracker res_tracker) {
auto tracker_ptr = std::make_unique<tracking_reader>(std::move(schema), std::move(sst), res_tracker);
_tracker = tracker_ptr.get();
return flat_mutation_reader(std::move(tracker_ptr));
});
_reader = make_restricted_flat_reader(config, std::move(ms), schema);
_reader = make_restricted_flat_reader(semaphore, std::move(ms), schema);
}
future<> operator()() {
@@ -847,21 +855,6 @@ public:
}
};
struct restriction_data {
std::unique_ptr<semaphore> reader_semaphore;
restricted_mutation_reader_config config;
restriction_data(std::size_t units,
std::chrono::nanoseconds timeout = {},
std::size_t max_queue_length = std::numeric_limits<std::size_t>::max())
: reader_semaphore(std::make_unique<semaphore>(units)) {
config.resources_sem = reader_semaphore.get();
config.timeout = timeout;
config.max_queue_length = max_queue_length;
}
};
class dummy_file_impl : public file_impl {
virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override {
return make_ready_future<size_t>(0);
@@ -922,41 +915,43 @@ class dummy_file_impl : public file_impl {
SEASTAR_TEST_CASE(reader_restriction_file_tracking) {
return async([&] {
restriction_data rd(4 * 1024);
reader_concurrency_semaphore semaphore(100, 4 * 1024);
// Testing the tracker here, no need to have a base cost.
auto permit = semaphore.wait_admission(0).get0();
{
reader_resource_tracker resource_tracker(rd.config.resources_sem);
reader_resource_tracker resource_tracker(permit);
auto tracked_file = resource_tracker.track(
file(shared_ptr<file_impl>(make_shared<dummy_file_impl>())));
BOOST_REQUIRE_EQUAL(4 * 1024, rd.reader_semaphore->available_units());
BOOST_REQUIRE_EQUAL(4 * 1024, semaphore.available_resources().memory);
auto buf1 = tracked_file.dma_read_bulk<char>(0, 0).get0();
BOOST_REQUIRE_EQUAL(3 * 1024, rd.reader_semaphore->available_units());
BOOST_REQUIRE_EQUAL(3 * 1024, semaphore.available_resources().memory);
auto buf2 = tracked_file.dma_read_bulk<char>(0, 0).get0();
BOOST_REQUIRE_EQUAL(2 * 1024, rd.reader_semaphore->available_units());
BOOST_REQUIRE_EQUAL(2 * 1024, semaphore.available_resources().memory);
auto buf3 = tracked_file.dma_read_bulk<char>(0, 0).get0();
BOOST_REQUIRE_EQUAL(1 * 1024, rd.reader_semaphore->available_units());
BOOST_REQUIRE_EQUAL(1 * 1024, semaphore.available_resources().memory);
auto buf4 = tracked_file.dma_read_bulk<char>(0, 0).get0();
BOOST_REQUIRE_EQUAL(0 * 1024, rd.reader_semaphore->available_units());
BOOST_REQUIRE_EQUAL(0 * 1024, semaphore.available_resources().memory);
auto buf5 = tracked_file.dma_read_bulk<char>(0, 0).get0();
BOOST_REQUIRE_EQUAL(-1 * 1024, rd.reader_semaphore->available_units());
BOOST_REQUIRE_EQUAL(-1 * 1024, semaphore.available_resources().memory);
// Reassign buf1; we should still have the same number of units.
buf1 = tracked_file.dma_read_bulk<char>(0, 0).get0();
BOOST_REQUIRE_EQUAL(-1 * 1024, rd.reader_semaphore->available_units());
BOOST_REQUIRE_EQUAL(-1 * 1024, semaphore.available_resources().memory);
// Move buf1 to the heap, so that we can safely destroy it
auto buf1_ptr = std::make_unique<temporary_buffer<char>>(std::move(buf1));
BOOST_REQUIRE_EQUAL(-1 * 1024, rd.reader_semaphore->available_units());
BOOST_REQUIRE_EQUAL(-1 * 1024, semaphore.available_resources().memory);
buf1_ptr.reset();
BOOST_REQUIRE_EQUAL(0 * 1024, rd.reader_semaphore->available_units());
BOOST_REQUIRE_EQUAL(0 * 1024, semaphore.available_resources().memory);
// Move tracked_file to the heap, so that we can safely destroy it.
auto tracked_file_ptr = std::make_unique<file>(std::move(tracked_file));
@@ -964,126 +959,188 @@ SEASTAR_TEST_CASE(reader_restriction_file_tracking) {
// Move buf4 to the heap, so that we can safely destroy it
auto buf4_ptr = std::make_unique<temporary_buffer<char>>(std::move(buf4));
BOOST_REQUIRE_EQUAL(0 * 1024, rd.reader_semaphore->available_units());
BOOST_REQUIRE_EQUAL(0 * 1024, semaphore.available_resources().memory);
// Releasing buffers that overlived the tracked-file they
// originated from should succeed.
buf4_ptr.reset();
BOOST_REQUIRE_EQUAL(1 * 1024, rd.reader_semaphore->available_units());
BOOST_REQUIRE_EQUAL(1 * 1024, semaphore.available_resources().memory);
}
// All units should have been deposited back.
REQUIRE_EVENTUALLY_EQUAL(4 * 1024, rd.reader_semaphore->available_units());
REQUIRE_EVENTUALLY_EQUAL(4 * 1024, semaphore.available_resources().memory);
});
}
SEASTAR_TEST_CASE(restricted_reader_reading) {
return async([&] {
storage_service_for_tests ssft;
restriction_data rd(new_reader_base_cost);
reader_concurrency_semaphore semaphore(2, new_reader_base_cost);
{
simple_schema s;
auto tmp = make_lw_shared<tmpdir>();
auto sst = create_sstable(s, tmp->path);
auto reader1 = reader_wrapper(rd.config, s.schema(), sst);
auto reader1 = reader_wrapper(semaphore, s.schema(), sst);
reader1().get();
BOOST_REQUIRE_LE(rd.reader_semaphore->available_units(), 0);
BOOST_REQUIRE_LE(semaphore.available_resources().count, 1);
BOOST_REQUIRE_LE(semaphore.available_resources().memory, 0);
BOOST_REQUIRE_EQUAL(reader1.call_count(), 1);
auto reader2 = reader_wrapper(rd.config, s.schema(), sst);
auto read_fut = reader2();
auto reader2 = reader_wrapper(semaphore, s.schema(), sst);
auto read2_fut = reader2();
// reader2 shouldn't be allowed just yet.
// reader2 shouldn't be allowed yet
BOOST_REQUIRE_EQUAL(reader2.call_count(), 0);
BOOST_REQUIRE_EQUAL(semaphore.waiters(), 1);
auto reader3 = reader_wrapper(semaphore, s.schema(), sst);
auto read3_fut = reader3();
// reader3 shouldn't be allowed yet
BOOST_REQUIRE_EQUAL(reader3.call_count(), 0);
BOOST_REQUIRE_EQUAL(semaphore.waiters(), 2);
// Move reader1 to the heap, so that we can safely destroy it.
auto reader1_ptr = std::make_unique<reader_wrapper>(std::move(reader1));
reader1_ptr.reset();
// reader1's destruction should've made some space for reader2 by now.
// reader1's destruction should've freed up enough memory for
// reader2 by now.
REQUIRE_EVENTUALLY_EQUAL(reader2.call_count(), 1);
read_fut.get();
read2_fut.get();
// But reader3 should still not be allowed
BOOST_REQUIRE_EQUAL(reader3.call_count(), 0);
BOOST_REQUIRE_EQUAL(semaphore.waiters(), 1);
// Move reader2 to the heap, so that we can safely destroy it.
auto reader2_ptr = std::make_unique<reader_wrapper>(std::move(reader2));
reader2_ptr.reset();
// Again, reader2's destruction should've freed up enough memory
// for reader3 by now.
REQUIRE_EVENTUALLY_EQUAL(reader3.call_count(), 1);
BOOST_REQUIRE_EQUAL(semaphore.waiters(), 0);
read3_fut.get();
{
// Consume all available units.
const auto consume_guard = consume_units(*rd.reader_semaphore, rd.reader_semaphore->current());
BOOST_REQUIRE_LE(semaphore.available_resources().memory, 0);
// Already allowed readers should not be blocked anymore even if
// there are no more units available.
read_fut = reader2();
BOOST_REQUIRE_EQUAL(reader2.call_count(), 2);
read_fut.get();
read3_fut = reader3();
BOOST_REQUIRE_EQUAL(reader3.call_count(), 2);
read3_fut.get();
}
}
// All units should have been deposited back.
REQUIRE_EVENTUALLY_EQUAL(new_reader_base_cost, rd.reader_semaphore->available_units());
REQUIRE_EVENTUALLY_EQUAL(new_reader_base_cost, semaphore.available_resources().memory);
});
}
SEASTAR_TEST_CASE(restricted_reader_timeout) {
using namespace std::chrono_literals;
return async([&] {
storage_service_for_tests ssft;
restriction_data rd(new_reader_base_cost, std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::milliseconds{10}));
reader_concurrency_semaphore semaphore(2,
new_reader_base_cost,
std::chrono::duration_cast<reader_concurrency_semaphore::timeout_clock::duration>(10ms));
{
simple_schema s;
auto tmp = make_lw_shared<tmpdir>();
auto sst = create_sstable(s, tmp->path);
auto reader1 = reader_wrapper(rd.config, s.schema(), sst);
auto reader1 = reader_wrapper(semaphore, s.schema(), sst);
reader1().get();
auto reader2 = reader_wrapper(rd.config, s.schema(), sst);
auto read_fut = reader2();
auto reader2 = reader_wrapper(semaphore, s.schema(), sst);
auto read2_fut = reader2();
auto reader3 = reader_wrapper(semaphore, s.schema(), sst);
auto read3_fut = reader3();
BOOST_REQUIRE_EQUAL(semaphore.waiters(), 2);
seastar::sleep(std::chrono::milliseconds(20)).get();
// The read should have timed out.
BOOST_REQUIRE(read_fut.failed());
BOOST_REQUIRE_THROW(std::rethrow_exception(read_fut.get_exception()), semaphore_timed_out);
// Although we have regular BOOST_REQUIREs for this below, if
// the test goes wrong these futures will still be pending
// when we leave scope and deleted memory will be accessed.
// To stop people from trying to debug a failing test, we
// assert here so they know this really is just the test
// failing and the underlying problem is that the timeout
// doesn't work.
assert(read2_fut.failed());
assert(read3_fut.failed());
// reader2 should have timed out.
BOOST_REQUIRE(read2_fut.failed());
BOOST_REQUIRE_THROW(std::rethrow_exception(read2_fut.get_exception()), semaphore_timed_out);
// reader3 should have timed out.
BOOST_REQUIRE(read3_fut.failed());
BOOST_REQUIRE_THROW(std::rethrow_exception(read3_fut.get_exception()), semaphore_timed_out);
}
// All units should have been deposited back.
REQUIRE_EVENTUALLY_EQUAL(new_reader_base_cost, rd.reader_semaphore->available_units());
REQUIRE_EVENTUALLY_EQUAL(new_reader_base_cost, semaphore.available_resources().memory);
});
}
SEASTAR_TEST_CASE(restricted_reader_max_queue_length) {
return async([&] {
storage_service_for_tests ssft;
restriction_data rd(new_reader_base_cost, {}, 1);
struct queue_overloaded_exception {};
reader_concurrency_semaphore semaphore(2,
new_reader_base_cost,
reader_concurrency_semaphore::no_timeout,
2,
[] { return std::make_exception_ptr(queue_overloaded_exception()); });
{
simple_schema s;
auto tmp = make_lw_shared<tmpdir>();
auto sst = create_sstable(s, tmp->path);
auto reader1_ptr = std::make_unique<reader_wrapper>(rd.config, s.schema(), sst);
auto reader1_ptr = std::make_unique<reader_wrapper>(semaphore, s.schema(), sst);
(*reader1_ptr)().get();
auto reader2_ptr = std::make_unique<reader_wrapper>(rd.config, s.schema(), sst);
auto read_fut = (*reader2_ptr)();
auto reader2_ptr = std::make_unique<reader_wrapper>(semaphore, s.schema(), sst);
auto read2_fut = (*reader2_ptr)();
auto reader3_ptr = std::make_unique<reader_wrapper>(semaphore, s.schema(), sst);
auto read3_fut = (*reader3_ptr)();
auto reader4 = reader_wrapper(semaphore, s.schema(), sst);
BOOST_REQUIRE_EQUAL(semaphore.waiters(), 2);
// The queue should now be full.
BOOST_REQUIRE_THROW(reader_wrapper(rd.config, s.schema(), sst), std::runtime_error);
BOOST_REQUIRE_THROW(reader4().get(), queue_overloaded_exception);
reader1_ptr.reset();
read_fut.get();
read2_fut.get();
reader2_ptr.reset();
read3_fut.get();
}
REQUIRE_EVENTUALLY_EQUAL(new_reader_base_cost, rd.reader_semaphore->available_units());
REQUIRE_EVENTUALLY_EQUAL(new_reader_base_cost, semaphore.available_resources().memory);
});
}
SEASTAR_TEST_CASE(restricted_reader_create_reader) {
return async([&] {
storage_service_for_tests ssft;
restriction_data rd(new_reader_base_cost);
reader_concurrency_semaphore semaphore(100, new_reader_base_cost);
{
simple_schema s;
@@ -1091,7 +1148,7 @@ SEASTAR_TEST_CASE(restricted_reader_create_reader) {
auto sst = create_sstable(s, tmp->path);
{
auto reader = reader_wrapper(rd.config, s.schema(), sst);
auto reader = reader_wrapper(semaphore, s.schema(), sst);
// This fast-forward is stupid, I know but the
// underlying dummy reader won't care, so it's fine.
reader.fast_forward_to(query::full_partition_range).get();
@@ -1102,7 +1159,7 @@ SEASTAR_TEST_CASE(restricted_reader_create_reader) {
}
{
auto reader = reader_wrapper(rd.config, s.schema(), sst);
auto reader = reader_wrapper(semaphore, s.schema(), sst);
reader().get();
BOOST_REQUIRE(reader.created());
@@ -1111,6 +1168,6 @@ SEASTAR_TEST_CASE(restricted_reader_create_reader) {
}
}
REQUIRE_EVENTUALLY_EQUAL(new_reader_base_cost, rd.reader_semaphore->available_units());
REQUIRE_EVENTUALLY_EQUAL(new_reader_base_cost, semaphore.available_resources().memory);
});
}


@@ -656,6 +656,46 @@ void test_streamed_mutation_fragments_have_monotonic_positions(populate_fn popul
});
}
static void test_date_tiered_clustering_slicing(populate_fn populate) {
BOOST_TEST_MESSAGE(__PRETTY_FUNCTION__);
simple_schema ss;
auto s = schema_builder(ss.schema())
.set_compaction_strategy(sstables::compaction_strategy_type::date_tiered)
.build();
auto pkey = ss.make_pkey();
mutation m1(pkey, s);
ss.add_static_row(m1, "s");
m1.partition().apply(ss.new_tombstone());
ss.add_row(m1, ss.make_ckey(0), "v1");
mutation_source ms = populate(s, {m1});
// query row outside the range of existing rows to exercise sstable clustering key filter
{
auto slice = partition_slice_builder(*s)
.with_range(ss.make_ckey_range(1, 2))
.build();
auto prange = dht::partition_range::make_singular(pkey);
assert_that(ms(s, prange, slice))
.produces(m1, slice.row_ranges(*s, pkey.key()))
.produces_end_of_stream();
}
{
auto slice = partition_slice_builder(*s)
.with_range(query::clustering_range::make_singular(ss.make_ckey(0)))
.build();
auto prange = dht::partition_range::make_singular(pkey);
assert_that(ms(s, prange, slice))
.produces(m1)
.produces_end_of_stream();
}
}
static void test_clustering_slices(populate_fn populate) {
BOOST_TEST_MESSAGE(__PRETTY_FUNCTION__);
auto s = schema_builder("ks", "cf")
@@ -819,6 +859,7 @@ static void test_query_only_static_row(populate_fn populate) {
auto pkeys = s.make_pkeys(1);
mutation m1(pkeys[0], s.schema());
m1.partition().apply(s.new_tombstone());
s.add_static_row(m1, "s1");
s.add_row(m1, s.make_ckey(0), "v1");
s.add_row(m1, s.make_ckey(1), "v2");
@@ -843,6 +884,59 @@ static void test_query_only_static_row(populate_fn populate) {
.produces(m1, slice.row_ranges(*s.schema(), m1.key()))
.produces_end_of_stream();
}
// query just a static row, single-partition case
{
auto slice = partition_slice_builder(*s.schema())
.with_ranges({})
.build();
auto prange = dht::partition_range::make_singular(m1.decorated_key());
assert_that(ms(s.schema(), prange, slice))
.produces(m1, slice.row_ranges(*s.schema(), m1.key()))
.produces_end_of_stream();
}
}
static void test_query_no_clustering_ranges_no_static_columns(populate_fn populate) {
simple_schema s(simple_schema::with_static::no);
auto pkeys = s.make_pkeys(1);
mutation m1(pkeys[0], s.schema());
m1.partition().apply(s.new_tombstone());
s.add_row(m1, s.make_ckey(0), "v1");
s.add_row(m1, s.make_ckey(1), "v2");
mutation_source ms = populate(s.schema(), {m1});
{
auto prange = dht::partition_range::make_ending_with(dht::ring_position(m1.decorated_key()));
assert_that(ms.make_flat_mutation_reader(s.schema(), prange, s.schema()->full_slice()))
.produces(m1)
.produces_end_of_stream();
}
// multi-partition case
{
auto slice = partition_slice_builder(*s.schema())
.with_ranges({})
.build();
auto prange = dht::partition_range::make_ending_with(dht::ring_position(m1.decorated_key()));
assert_that(ms(s.schema(), prange, slice))
.produces(m1, slice.row_ranges(*s.schema(), m1.key()))
.produces_end_of_stream();
}
// single-partition case
{
auto slice = partition_slice_builder(*s.schema())
.with_ranges({})
.build();
auto prange = dht::partition_range::make_singular(m1.decorated_key());
assert_that(ms(s.schema(), prange, slice))
.produces(m1, slice.row_ranges(*s.schema(), m1.key()))
.produces_end_of_stream();
}
}
void test_streamed_mutation_forwarding_succeeds_with_no_data(populate_fn populate) {
@@ -881,6 +975,7 @@ void test_streamed_mutation_forwarding_succeeds_with_no_data(populate_fn populat
}
void run_mutation_reader_tests(populate_fn populate) {
test_date_tiered_clustering_slicing(populate);
test_fast_forwarding_across_partitions_to_empty_range(populate);
test_clustering_slices(populate);
test_streamed_mutation_fragments_have_monotonic_positions(populate);
@@ -890,6 +985,7 @@ void run_mutation_reader_tests(populate_fn populate) {
test_streamed_mutation_forwarding_is_consistent_with_slicing(populate);
test_range_queries(populate);
test_query_only_static_row(populate);
test_query_no_clustering_ranges_no_static_columns(populate);
}
void run_conversion_to_mutation_reader_tests(populate_fn populate) {


@@ -485,7 +485,7 @@ SEASTAR_TEST_CASE(test_cache_delegates_to_underlying_only_once_multiple_mutation
test(ds, query::full_partition_range, partitions.size() + 1);
test(ds, query::full_partition_range, partitions.size() + 1);
cache->invalidate([] {}, key_after_all);
cache->invalidate([] {}, key_after_all).get();
assert_that(ds(s, query::full_partition_range))
.produces(slice(partitions, query::full_partition_range))


@@ -30,6 +30,7 @@
#include "mutation.hh"
#include "schema_builder.hh"
#include "streamed_mutation.hh"
#include "sstable_utils.hh"
// Helper for working with the following table:
//
@@ -47,11 +48,12 @@ public:
return {new_timestamp(), gc_clock::now()};
}
public:
simple_schema()
using with_static = bool_class<class static_tag>;
simple_schema(with_static ws = with_static::yes)
: _s(schema_builder("ks", "cf")
.with_column("pk", utf8_type, column_kind::partition_key)
.with_column("ck", utf8_type, column_kind::clustering_key)
.with_column("s1", utf8_type, column_kind::static_column)
.with_column("s1", utf8_type, ws ? column_kind::static_column : column_kind::regular_column)
.with_column("v", utf8_type)
.build())
, _v_def(*_s->get_column_definition(to_bytes("v")))
@@ -146,12 +148,14 @@ public:
// Creates a sequence of keys in ring order
std::vector<dht::decorated_key> make_pkeys(int n) {
std::vector<dht::decorated_key> keys;
for (int i = 0; i < n; ++i) {
keys.push_back(make_pkey(i));
}
std::sort(keys.begin(), keys.end(), dht::decorated_key::less_comparator(_s));
return keys;
auto local_keys = make_local_keys(n, _s);
return boost::copy_range<std::vector<dht::decorated_key>>(local_keys | boost::adaptors::transformed([this] (sstring& key) {
return make_pkey(std::move(key));
}));
}
dht::decorated_key make_pkey() {
return make_pkey(make_local_key(_s));
}
static std::vector<dht::ring_position> to_ring_positions(const std::vector<dht::decorated_key>& keys) {


@@ -19,6 +19,45 @@
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "sstables/sstables.hh"
#include "dht/i_partitioner.hh"
#include <boost/range/irange.hpp>
#include <boost/range/adaptor/map.hpp>
sstables::shared_sstable make_sstable_containing(std::function<sstables::shared_sstable()> sst_factory, std::vector<mutation> muts);
//
// Make a set of keys sorted by token for the current shard.
//
static std::vector<sstring> make_local_keys(unsigned n, const schema_ptr& s, size_t min_key_size = 1) {
std::vector<std::pair<sstring, dht::decorated_key>> p;
p.reserve(n);
auto key_id = 0U;
auto generated = 0U;
while (generated < n) {
auto raw_key = sstring(std::max(min_key_size, sizeof(key_id)), int8_t(0));
std::copy_n(reinterpret_cast<int8_t*>(&key_id), sizeof(key_id), raw_key.begin());
auto dk = dht::global_partitioner().decorate_key(*s, partition_key::from_single_value(*s, to_bytes(raw_key)));
key_id++;
if (engine_is_ready() && engine().cpu_id() != dht::global_partitioner().shard_of(dk.token())) {
continue;
}
generated++;
p.emplace_back(std::move(raw_key), std::move(dk));
}
boost::sort(p, [&] (auto& p1, auto& p2) {
return p1.second.less_compare(*s, p2.second);
});
return boost::copy_range<std::vector<sstring>>(p | boost::adaptors::map_keys);
}
//
// Return one key for current shard. Note that it always returns the same key for a given shard.
//
inline sstring make_local_key(const schema_ptr& s, size_t min_key_size = 1) {
return make_local_keys(1, s, min_key_size).front();
}


@@ -178,6 +178,7 @@ SEASTAR_TEST_CASE(test_mutation_merger_conforms_to_mutation_source) {
muts.push_back(mutation(m.decorated_key(), m.schema()));
}
fragment_scatterer c{muts};
c.consume(m.partition().partition_tombstone());
auto sm = streamed_mutation_from_mutation(m);
do_consume_streamed_mutation_flattened(sm, c).get();
for (int i = 0; i < n; ++i) {


@@ -66,12 +66,12 @@ void cql_server::event_notifier::on_create_keyspace(const sstring& ks_name)
{
for (auto&& conn : _schema_change_listeners) {
using namespace cql_transport;
with_gate(conn->_pending_requests_gate, [&] {
return conn->write_response(conn->make_schema_change_event(event::schema_change{
if (!conn->_pending_requests_gate.is_closed()) {
conn->write_response(conn->make_schema_change_event(event::schema_change{
event::schema_change::change_type::CREATED,
ks_name
}));
});
};
}
}
@@ -79,14 +79,14 @@ void cql_server::event_notifier::on_create_column_family(const sstring& ks_name,
{
for (auto&& conn : _schema_change_listeners) {
using namespace cql_transport;
with_gate(conn->_pending_requests_gate, [&] {
return conn->write_response(conn->make_schema_change_event(event::schema_change{
if (!conn->_pending_requests_gate.is_closed()) {
conn->write_response(conn->make_schema_change_event(event::schema_change{
event::schema_change::change_type::CREATED,
event::schema_change::target_type::TABLE,
ks_name,
cf_name
}));
});
};
}
}
@@ -94,14 +94,14 @@ void cql_server::event_notifier::on_create_user_type(const sstring& ks_name, con
{
for (auto&& conn : _schema_change_listeners) {
using namespace cql_transport;
with_gate(conn->_pending_requests_gate, [&] {
return conn->write_response(conn->make_schema_change_event(event::schema_change{
if (!conn->_pending_requests_gate.is_closed()) {
conn->write_response(conn->make_schema_change_event(event::schema_change{
event::schema_change::change_type::CREATED,
event::schema_change::target_type::TYPE,
ks_name,
type_name
}));
});
};
}
}
@@ -124,12 +124,12 @@ void cql_server::event_notifier::on_update_keyspace(const sstring& ks_name)
{
for (auto&& conn : _schema_change_listeners) {
using namespace cql_transport;
with_gate(conn->_pending_requests_gate, [&] {
return conn->write_response(conn->make_schema_change_event(event::schema_change{
if (!conn->_pending_requests_gate.is_closed()) {
conn->write_response(conn->make_schema_change_event(event::schema_change{
event::schema_change::change_type::UPDATED,
ks_name
}));
});
};
}
}
@@ -137,14 +137,14 @@ void cql_server::event_notifier::on_update_column_family(const sstring& ks_name,
{
for (auto&& conn : _schema_change_listeners) {
using namespace cql_transport;
with_gate(conn->_pending_requests_gate, [&] {
return conn->write_response(conn->make_schema_change_event(event::schema_change{
if (!conn->_pending_requests_gate.is_closed()) {
conn->write_response(conn->make_schema_change_event(event::schema_change{
event::schema_change::change_type::UPDATED,
event::schema_change::target_type::TABLE,
ks_name,
cf_name
}));
});
};
}
}
@@ -152,14 +152,14 @@ void cql_server::event_notifier::on_update_user_type(const sstring& ks_name, con
{
for (auto&& conn : _schema_change_listeners) {
using namespace cql_transport;
with_gate(conn->_pending_requests_gate, [&] {
return conn->write_response(conn->make_schema_change_event(event::schema_change{
if (!conn->_pending_requests_gate.is_closed()) {
conn->write_response(conn->make_schema_change_event(event::schema_change{
event::schema_change::change_type::UPDATED,
event::schema_change::target_type::TYPE,
ks_name,
type_name
}));
});
};
}
}
@@ -182,12 +182,12 @@ void cql_server::event_notifier::on_drop_keyspace(const sstring& ks_name)
{
for (auto&& conn : _schema_change_listeners) {
using namespace cql_transport;
with_gate(conn->_pending_requests_gate, [&] {
return conn->write_response(conn->make_schema_change_event(event::schema_change{
if (!conn->_pending_requests_gate.is_closed()) {
conn->write_response(conn->make_schema_change_event(event::schema_change{
event::schema_change::change_type::DROPPED,
ks_name
}));
});
};
}
}
@@ -195,14 +195,14 @@ void cql_server::event_notifier::on_drop_column_family(const sstring& ks_name, c
{
for (auto&& conn : _schema_change_listeners) {
using namespace cql_transport;
with_gate(conn->_pending_requests_gate, [&] {
return conn->write_response(conn->make_schema_change_event(event::schema_change{
if (!conn->_pending_requests_gate.is_closed()) {
conn->write_response(conn->make_schema_change_event(event::schema_change{
event::schema_change::change_type::DROPPED,
event::schema_change::target_type::TABLE,
ks_name,
cf_name
}));
});
};
}
}
@@ -210,14 +210,14 @@ void cql_server::event_notifier::on_drop_user_type(const sstring& ks_name, const
{
for (auto&& conn : _schema_change_listeners) {
using namespace cql_transport;
with_gate(conn->_pending_requests_gate, [&] {
return conn->write_response(conn->make_schema_change_event(event::schema_change{
if (!conn->_pending_requests_gate.is_closed()) {
conn->write_response(conn->make_schema_change_event(event::schema_change{
event::schema_change::change_type::DROPPED,
event::schema_change::target_type::TYPE,
ks_name,
type_name
}));
});
};
}
}
@@ -240,9 +240,9 @@ void cql_server::event_notifier::on_join_cluster(const gms::inet_address& endpoi
{
for (auto&& conn : _topology_change_listeners) {
using namespace cql_transport;
with_gate(conn->_pending_requests_gate, [&] {
return conn->write_response(conn->make_topology_change_event(event::topology_change::new_node(endpoint, conn->_server_addr.port)));
});
if (!conn->_pending_requests_gate.is_closed()) {
conn->write_response(conn->make_topology_change_event(event::topology_change::new_node(endpoint, conn->_server_addr.port)));
};
}
}
@@ -250,9 +250,9 @@ void cql_server::event_notifier::on_leave_cluster(const gms::inet_address& endpo
 {
     for (auto&& conn : _topology_change_listeners) {
         using namespace cql_transport;
-        with_gate(conn->_pending_requests_gate, [&] {
-            return conn->write_response(conn->make_topology_change_event(event::topology_change::removed_node(endpoint, conn->_server_addr.port)));
-        });
+        if (!conn->_pending_requests_gate.is_closed()) {
+            conn->write_response(conn->make_topology_change_event(event::topology_change::removed_node(endpoint, conn->_server_addr.port)));
+        };
     }
 }
@@ -260,9 +260,9 @@ void cql_server::event_notifier::on_move(const gms::inet_address& endpoint)
 {
     for (auto&& conn : _topology_change_listeners) {
         using namespace cql_transport;
-        with_gate(conn->_pending_requests_gate, [&] {
-            return conn->write_response(conn->make_topology_change_event(event::topology_change::moved_node(endpoint, conn->_server_addr.port)));
-        });
+        if (!conn->_pending_requests_gate.is_closed()) {
+            conn->write_response(conn->make_topology_change_event(event::topology_change::moved_node(endpoint, conn->_server_addr.port)));
+        };
     }
 }
@@ -273,9 +273,9 @@ void cql_server::event_notifier::on_up(const gms::inet_address& endpoint)
     if (!was_up) {
         for (auto&& conn : _status_change_listeners) {
             using namespace cql_transport;
-            with_gate(conn->_pending_requests_gate, [&] {
-                return conn->write_response(conn->make_status_change_event(event::status_change::node_up(endpoint, conn->_server_addr.port)));
-            });
+            if (!conn->_pending_requests_gate.is_closed()) {
+                conn->write_response(conn->make_status_change_event(event::status_change::node_up(endpoint, conn->_server_addr.port)));
+            };
         }
     }
 }
@@ -287,9 +287,9 @@ void cql_server::event_notifier::on_down(const gms::inet_address& endpoint)
    if (!was_down) {
        for (auto&& conn : _status_change_listeners) {
            using namespace cql_transport;
-           with_gate(conn->_pending_requests_gate, [&] {
-               return conn->write_response(conn->make_status_change_event(event::status_change::node_down(endpoint, conn->_server_addr.port)));
-           });
+           if (!conn->_pending_requests_gate.is_closed()) {
+               conn->write_response(conn->make_status_change_event(event::status_change::node_down(endpoint, conn->_server_addr.port)));
+           };
        }
    }
}
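Across all the hunks above the change is the same: event pushes no longer enter the connection's `_pending_requests_gate` via `with_gate` (which throws once the gate has been closed during connection teardown), but instead check `is_closed()` and silently skip the write. The behavioural difference can be sketched with a hypothetical toy gate (much simpler than the real `seastar::gate`, and not its actual API):

```cpp
#include <cassert>
#include <stdexcept>
#include <utility>

// Toy stand-in for seastar::gate (hypothetical): enter() fails once the
// gate is closed, is_closed() merely reports the state.
class toy_gate {
    int _count = 0;
    bool _closed = false;
public:
    struct closed_exception : std::runtime_error {
        closed_exception() : std::runtime_error("gate closed") {}
    };
    void enter() {
        if (_closed) {
            throw closed_exception();
        }
        ++_count;
    }
    void leave() { --_count; }
    void close() { _closed = true; }  // the real close() also waits for _count == 0
    bool is_closed() const { return _closed; }
};

// The pattern the old code used: run f inside the gate, throwing if closed.
template <typename Func>
void with_gate(toy_gate& g, Func&& f) {
    g.enter();  // raises closed_exception on a closed gate
    try {
        std::forward<Func>(f)();
    } catch (...) {
        g.leave();
        throw;
    }
    g.leave();
}
```

With this sketch, pushing an event through `with_gate` on a connection whose gate has already closed raises an exception, while the `is_closed()` check from the new code simply drops the event, which is acceptable for a connection that is being torn down anyway.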
@@ -591,8 +591,8 @@ future<> cql_server::connection::process()
             return write_response(make_error(0, exceptions::exception_code::SERVER_ERROR, "unknown error", tracing::trace_state_ptr()));
         }
     }).finally([this] {
-        _server._notifier->unregister_connection(this);
+        return _pending_requests_gate.close().then([this] {
+            _server._notifier->unregister_connection(this);
         return _ready_to_respond.finally([this] {
             return _write_buf.close();
         });
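The `process()` hunk complements the notifier change: since event pushes are now guarded only by an `is_closed()` check, teardown must close the gate and wait for in-flight requests to drain before unregistering the connection and closing the write buffer. A rough single-threaded sketch of that "close resolves only after the last pending operation leaves" ordering, again with a hypothetical toy gate rather than the real future-based `seastar::gate::close()`:

```cpp
#include <cassert>
#include <functional>
#include <utility>

// Hypothetical sketch: close() takes a continuation that runs only after
// every operation that entered the gate has left, mirroring how the diff
// delays unregister_connection until _pending_requests_gate.close() resolves.
struct draining_gate {
    int pending = 0;
    bool closing = false;
    std::function<void()> on_drained;

    void enter() { ++pending; }
    void leave() {
        --pending;
        if (closing && pending == 0 && on_drained) {
            auto cont = std::move(on_drained);
            on_drained = nullptr;
            cont();
        }
    }
    void close(std::function<void()> cont) {
        closing = true;
        if (pending == 0) {
            cont();                        // nothing in flight: run immediately
        } else {
            on_drained = std::move(cont);  // defer until the last leave()
        }
    }
};
```

The point of the ordering is that once `close()` has been called, the `is_closed()` checks in the notifiers start skipping writes, and the drain continuation (here, the stand-in for `unregister_connection` and the buffer close) runs only after every request already inside the gate has completed.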