commitlog/replayer: Bugfix: minimum rp broken, and cl reader offset too

The previous fix removed the additional insertion of "min rp" per source shard based on whether we had processed existing CF:s or not (i.e. if a CF does not exist as sstable at all, we must tag it as zero-rp, and make the whole shard for it start at the same zero). This is bad in itself, because it can cause data loss. It does not cause crashing however. But it did uncover another, old, old lingering bug, namely the commitlog reader initiating its stream wrongly when reading from an actual offset (i.e. not processing the whole file). We opened the file stream from the file offset, then tried to read the file header and magic number from there -> boom, error. Also, rp-to-file mapping was potentially suboptimal due to using bucket iterator instead of actual range. I.e. three fixes: * Reinstate min position guarding for unencountered CF:s * Fix stream creation in CL reader * Fix segment map iterator use. v2: * Fix typo Message-Id: <1490611637-12220-1-git-send-email-calle@scylladb.com> (cherry picked from commit b12b65db92)
release: prepare for 1.7.rc2
2017-03-28 10:35:04 +02:00 · 2017-03-23 13:22:59 +02:00 · 2017-03-23 09:42:51 +02:00 · 2017-03-22 17:57:30 +02:00 · 2017-03-22 14:55:39 +02:00 · 2017-03-21 15:07:27 +02:00
62 changed files with 1135 additions and 539 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 #!/bin/sh

-VERSION=666.development
+VERSION=1.7.rc2

 if test -f version
 then
--- a/auth/auth.cc
+++ b/auth/auth.cc
@@ -246,7 +246,8 @@ future<> auth::auth::setup() {
        std::map<sstring, sstring> opts;
        opts["replication_factor"] = "1";
        auto ksm = keyspace_metadata::new_keyspace(AUTH_KS, "org.apache.cassandra.locator.SimpleStrategy", opts, true);
-        f = service::get_local_migration_manager().announce_new_keyspace(ksm, false);
+        // We use min_timestamp so that default keyspace metadata will lose to any manual adjustments. See issue #2129.
+        f = service::get_local_migration_manager().announce_new_keyspace(ksm, api::min_timestamp, false);
    }

    return f.then([] {
--- a/cell_locking.hh
+++ b/cell_locking.hh
@@ -22,13 +22,28 @@
 #pragma once

 #include <boost/intrusive/unordered_set.hpp>
+
+#if __has_include(<boost/container/small_vector.hpp>)
+
 #include <boost/container/small_vector.hpp>

+template <typename T, size_t N>
+using small_vector = boost::container::small_vector<T, N>;
+
+#else
+
+#include <vector>
+template <typename T, size_t N>
+using small_vector = std::vector<T>;
+
+#endif
+
 #include "fnv1a_hasher.hh"
+#include "streamed_mutation.hh"
 #include "mutation_partition.hh"

 class cells_range {
-    using ids_vector_type = boost::container::small_vector<column_id, 5>;
+    using ids_vector_type = small_vector<column_id, 5>;

    position_in_partition_view _position;
    ids_vector_type _ids;
@@ -147,7 +162,7 @@ class cell_locker {
        // temporarily removed from its parent partition_entry.
        // Returns true if the cell_entry still exist in the new schema and
        // should be reinserted.
-        bool upgrade(const schema& from, const schema& to, column_kind kind) {
+        bool upgrade(const schema& from, const schema& to, column_kind kind) noexcept {
            auto& old_column_mapping = from.get_column_mapping();
            auto& column = old_column_mapping.column_at(kind, _address.id);
            auto cdef = to.get_column_definition(column.name());
@@ -170,7 +185,9 @@ class cell_locker {
        }

        ~cell_entry() {
-            assert(is_linked());
+            if (!is_linked()) {
+                return;
+            }
            unlink();
            if (!--_parent._cell_count) {
                delete &_parent;
@@ -286,10 +303,9 @@ class cell_locker {
        };

        class equal_compare {
-            schema_ptr _schema;
            dht::decorated_key_equals_comparator _cmp;
        public:
-            explicit equal_compare(const schema s) : _cmp(s) { }
+            explicit equal_compare(const schema& s) : _cmp(s) { }
            bool operator()(const dht::decorated_key& dk, const partition_entry& pe) {
                return _cmp(dk, pe._key);
            }
@@ -386,22 +402,19 @@ struct cell_locker::locker {

    partition_cells_range _range;
    partition_cells_range::iterator _current_ck;
-    cells_range _cells_range;
    cells_range::const_iterator _current_cell;

    std::vector<locked_cell> _locks;
 private:
    void update_ck() {
        if (!is_done()) {
-            _cells_range = *_current_ck;
-            _current_cell = _cells_range.begin();
+            _current_cell = _current_ck->begin();
        }
    }

    future<> lock_next();

    bool is_done() const { return _current_ck == _range.end(); }
-    std::vector<locked_cell> get() && { return std::move(_locks); }
 public:
    explicit locker(const ::schema& s, partition_entry& pe, partition_cells_range&& range)
        : _hasher(s)
@@ -413,18 +426,22 @@ public:
        update_ck();
    }

-    future<std::vector<locked_cell>> lock_all() && {
+    locker(const locker&) = delete;
+    locker(locker&&) = delete;
+
+    future<> lock_all() {
        // Cannot defer before first call to lock_next().
        return lock_next().then([this] {
            return do_until([this] { return is_done(); }, [this] {
                return lock_next();
-            }).then([&] {
-                return std::move(*this).get();
            });
        });
    }
+
+    std::vector<locked_cell> get() && { return std::move(_locks); }
 };

+inline
 future<std::vector<locked_cell>> cell_locker::lock_cells(const dht::decorated_key& dk, partition_cells_range&& range) {
    partition_entry::hasher pe_hash;
    partition_entry::equal_compare pe_eq(*_schema);
@@ -460,14 +477,17 @@ future<std::vector<locked_cell>> cell_locker::lock_cells(const dht::decorated_ke
        return make_ready_future<std::vector<locked_cell>>(std::move(locks));
    }

-    return do_with(locker(*_schema, *it, std::move(range)), [] (auto& locker)  mutable {
-        return std::move(locker).lock_all();
+    auto l = std::make_unique<locker>(*_schema, *it, std::move(range));
+    auto f = l->lock_all();
+    return f.then([l = std::move(l)] {
+        return std::move(*l).get();
    });
 }

+inline
 future<> cell_locker::locker::lock_next() {
    while (!is_done()) {
-        if (_current_cell == _cells_range.end() || _cells_range.empty()) {
+        if (_current_cell == _current_ck->end()) {
            ++_current_ck;
            update_ck();
            continue;
@@ -475,7 +495,7 @@ future<> cell_locker::locker::lock_next() {

        auto cid = *_current_cell++;

-        cell_address ca { position_in_partition(_cells_range.position()), cid };
+        cell_address ca { position_in_partition(_current_ck->position()), cid };
        auto it = _partition_entry.cells().find(ca, _hasher, _eq_cmp);
        if (it != _partition_entry.cells().end()) {
            return it->lock().then([this, ce = it->shared_from_this()] () mutable {
@@ -483,27 +503,25 @@ future<> cell_locker::locker::lock_next() {
            });
        }

-        auto cell = make_lw_shared<cell_entry>(_partition_entry, position_in_partition(_cells_range.position()), cid);
+        auto cell = make_lw_shared<cell_entry>(_partition_entry, position_in_partition(_current_ck->position()), cid);
        _partition_entry.insert(cell);
        _locks.emplace_back(std::move(cell));
    }
    return make_ready_future<>();
 }

+inline
 bool cell_locker::partition_entry::upgrade(schema_ptr new_schema) {
    if (_schema == new_schema) {
        return true;
    }

-    auto buckets = std::make_unique<cells_type::bucket_type[]>(initial_bucket_count);
+    auto buckets = std::make_unique<cells_type::bucket_type[]>(_cells.bucket_count());
    auto cells = cells_type(cells_type::bucket_traits(buckets.get(), _cells.bucket_count()),
                            cell_entry::hasher(*new_schema), cell_entry::equal_compare(*new_schema));

-    while (!_cells.empty()) {
-        auto it = _cells.begin();
-        auto& cell = *it;
-        _cells.erase(it);
-
+    _cells.clear_and_dispose([&] (cell_entry* cell_ptr) noexcept {
+        auto& cell = *cell_ptr;
        auto kind = cell.position().is_static_row() ? column_kind::static_column
                                                    : column_kind::regular_column;
        auto reinsert = cell.upgrade(*_schema, *new_schema, kind);
@@ -512,9 +530,16 @@ bool cell_locker::partition_entry::upgrade(schema_ptr new_schema) {
        } else {
            _cell_count--;
        }
-    }
+    });

+    // bi::unordered_set move assignment is actually a swap.
+    // Original _buckets cannot be destroyed before the container using them is
+    // so we need to explicitly make sure that the original _cells is no more.
    _cells = std::move(cells);
+    auto destroy = [] (auto) { };
+    destroy(std::move(cells));
+
    _buckets = std::move(buckets);
+    _schema = new_schema;
    return _cell_count;
 }
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -788,3 +788,23 @@ commitlog_total_space_in_mb: -1
 # By default, Scylla binds all interfaces to the prometheus API
 # It is possible to restrict the listening address to a specific one
 # prometheus_address: 0.0.0.0
+
+# Distribution of data among cores (shards) within a node
+#
+# Scylla distributes data within a node among shards, using a round-robin
+# strategy:
+#  [shard0] [shard1] ... [shardN-1] [shard0] [shard1] ... [shardN-1] ...
+#
+# Scylla versions 1.6 and below used just one repetition of the pattern;
+# this interfered with data placement among nodes (vnodes).
+#
+# Scylla versions 1.7 and above use 4096 repetitions of the pattern; this
+# provides for better data distribution.
+#
+# the value below is log (base 2) of the number of repetitions.
+#
+# Set to 0 to avoid rewriting all data when upgrading from Scylla 1.6 and
+# below.
+#
+# Keep at 12 for new clusters.
+murmur3_partitioner_ignore_msb_bits: 12
--- a/configure.py
+++ b/configure.py
@@ -230,6 +230,7 @@ scylla_tests = [
    'tests/virtual_reader_test',
    'tests/view_schema_test',
    'tests/counter_test',
+    'tests/cell_locker_test',
 ]

 apps = [
--- a/cql3/functions/functions.cc
+++ b/cql3/functions/functions.cc
@@ -67,6 +67,14 @@ functions::init() {
    declare(aggregate_fcts::make_max_function<int64_t>());
    declare(aggregate_fcts::make_min_function<int64_t>());

+    declare(aggregate_fcts::make_count_function<float>());
+    declare(aggregate_fcts::make_max_function<float>());
+    declare(aggregate_fcts::make_min_function<float>());
+
+    declare(aggregate_fcts::make_count_function<double>());
+    declare(aggregate_fcts::make_max_function<double>());
+    declare(aggregate_fcts::make_min_function<double>());
+
    //FIXME:
    //declare(aggregate_fcts::make_count_function<bytes>());
    //declare(aggregate_fcts::make_max_function<bytes>());
@@ -78,15 +86,17 @@ functions::init() {
    declare(make_blob_as_varchar_fct());
    declare(aggregate_fcts::make_sum_function<int32_t>());
    declare(aggregate_fcts::make_sum_function<int64_t>());
-    declare(aggregate_fcts::make_avg_function<int32_t>());
-    declare(aggregate_fcts::make_avg_function<int64_t>());
+    declare(aggregate_fcts::make_sum_function<float>());
+    declare(aggregate_fcts::make_sum_function<double>());
 #if 0
-    declare(AggregateFcts.sumFunctionForFloat);
-    declare(AggregateFcts.sumFunctionForDouble);
    declare(AggregateFcts.sumFunctionForDecimal);
    declare(AggregateFcts.sumFunctionForVarint);
-    declare(AggregateFcts.avgFunctionForFloat);
-    declare(AggregateFcts.avgFunctionForDouble);
+#endif
+    declare(aggregate_fcts::make_avg_function<int32_t>());
+    declare(aggregate_fcts::make_avg_function<int64_t>());
+    declare(aggregate_fcts::make_avg_function<float>());
+    declare(aggregate_fcts::make_avg_function<double>());
+#if 0
    declare(AggregateFcts.avgFunctionForVarint);
    declare(AggregateFcts.avgFunctionForDecimal);
 #endif
--- a/database.cc
+++ b/database.cc
@@ -1379,13 +1379,20 @@ future<> column_family::cleanup_sstables(sstables::compaction_descriptor descrip
    auto owned_ranges = make_lw_shared<dht::token_range_vector>(std::move(r));
    auto sstables_to_cleanup = make_lw_shared<std::vector<sstables::shared_sstable>>(std::move(descriptor.sstables));

-    return parallel_for_each(*sstables_to_cleanup, [this, owned_ranges = std::move(owned_ranges), sstables_to_cleanup] (auto& sst) {
+    return do_for_each(*sstables_to_cleanup, [this, owned_ranges = std::move(owned_ranges), sstables_to_cleanup] (auto& sst) {
        if (!owned_ranges->empty() && !needs_cleanup(sst, owned_ranges, _schema)) {
           return make_ready_future<>();
        }

-        std::vector<sstables::shared_sstable> sstable_to_compact({ sst });
-        return this->compact_sstables(sstables::compaction_descriptor(std::move(sstable_to_compact), sst->get_sstable_level()), true);
+        // this semaphore ensures that only one cleanup will run per shard.
+        // That's to prevent node from running out of space when almost all sstables
+        // need cleanup, so if sstables are cleaned in parallel, we may need almost
+        // twice the disk space used by those sstables.
+        static thread_local semaphore sem(1);
+
+        return with_semaphore(sem, 1, [this, &sst] {
+            return this->compact_sstables(sstables::compaction_descriptor({ sst }, sst->get_sstable_level()), true);
+        });
    });
 }

@@ -1802,7 +1809,7 @@ database::setup_metrics() {
    });

    _metrics.add_group("database", {
-        sm::make_gauge("requests_blocked_memory", [this] { return _dirty_memory_manager.region_group().blocked_requests(); },
+        sm::make_gauge("requests_blocked_memory_current", [this] { return _dirty_memory_manager.region_group().blocked_requests(); },
                       sm::description(
                           seastar::format("Holds the current number of requests blocked due to reaching the memory quota ({}B). "
                                           "Non-zero value indicates that our bottleneck is memory and more specifically - the memory quota allocated for the \"database\" component.", _dirty_memory_manager.throttle_threshold()))),
@@ -2663,7 +2670,7 @@ column_family::apply(const frozen_mutation& m, const schema_ptr& m_schema, const
    do_apply(m, m_schema, rp);
 }

-future<frozen_mutation> database::do_apply_counter_update(column_family& cf, const frozen_mutation& fm, schema_ptr m_schema) {
+future<frozen_mutation> database::do_apply_counter_update(column_family& cf, const frozen_mutation& fm, schema_ptr m_schema, timeout_clock::time_point timeout) {
    auto m = fm.unfreeze(m_schema);
    m.upgrade(cf.schema());

@@ -2689,9 +2696,9 @@ future<frozen_mutation> database::do_apply_counter_update(column_family& cf, con
        cql_serialization_format::internal(), query::max_rows);

    return do_with(std::move(slice), std::move(m), std::vector<locked_cell>(), stdx::optional<frozen_mutation>(),
-                   [this, &cf] (const query::partition_slice& slice, mutation& m, std::vector<locked_cell>& locks,
+                   [this, &cf, timeout] (const query::partition_slice& slice, mutation& m, std::vector<locked_cell>& locks,
                               stdx::optional<frozen_mutation>& fm) mutable {
-        return cf.lock_counter_cells(m).then([&, m_schema = cf.schema(), this] (std::vector<locked_cell> lcs) {
+        return cf.lock_counter_cells(m).then([&, timeout, m_schema = cf.schema(), this] (std::vector<locked_cell> lcs) {
            locks = std::move(lcs);

            // Before counter update is applied it needs to be transformed from
@@ -2702,7 +2709,7 @@ future<frozen_mutation> database::do_apply_counter_update(column_family& cf, con
            return mutation_query(m_schema, cf.as_mutation_source({}),
                                  dht::partition_range::make_singular(m.decorated_key()),
                                  slice, query::max_rows, query::max_partitions,
-                                  gc_clock::now(), { }).then([this, &cf, &m, &fm, m_schema] (auto result) {
+                                  gc_clock::now(), { }).then([this, timeout, &cf, &m, &fm, m_schema] (auto result) {

                // ...now, that we got existing state of all affected counter
                // cells we can look for our shard in each of them, increment
@@ -2714,9 +2721,8 @@ future<frozen_mutation> database::do_apply_counter_update(column_family& cf, con
                transform_counter_updates_to_shards(m, mopt ? &*mopt : nullptr, cf.failed_counter_applies_to_memtable());

                // FIXME: oh dear, another freeze
-                // FIXME: timeout
                fm = freeze(m);
-                return this->do_apply(m_schema, *fm, { });
+                return this->do_apply(m_schema, *fm, timeout);
            }).then([&fm] {
                return std::move(*fm);
            });
@@ -2854,7 +2860,7 @@ future<> dirty_memory_manager::flush_when_needed() {
    });
 }

-void dirty_memory_manager::start_reclaiming() {
+void dirty_memory_manager::start_reclaiming() noexcept {
    _should_flush.signal();
 }

@@ -2876,7 +2882,7 @@ future<frozen_mutation> database::apply_counter_update(schema_ptr s, const froze
    }
    try {
        auto& cf = find_column_family(m.column_family_id());
-        return do_apply_counter_update(cf, m, s);
+        return do_apply_counter_update(cf, m, s, timeout);
    } catch (no_such_column_family&) {
        dblog.error("Attempting to mutate non-existent table {}", m.column_family_id());
        throw;
--- a/database.hh
+++ b/database.hh
@@ -149,7 +149,7 @@ class dirty_memory_manager: public logalloc::region_group_reclaimer {
    std::unordered_map<const logalloc::region*, flush_permit> _flush_manager;

    future<> _waiting_flush;
-    virtual void start_reclaiming() override;
+    virtual void start_reclaiming() noexcept override;

    bool has_pressure() const {
        return over_soft_limit();
@@ -1126,7 +1126,7 @@ private:

    query::result_memory_limiter _result_memory_limiter;

-    future<frozen_mutation> do_apply_counter_update(column_family& cf, const frozen_mutation& fm, schema_ptr m_schema);
+    future<frozen_mutation> do_apply_counter_update(column_family& cf, const frozen_mutation& fm, schema_ptr m_schema, timeout_clock::time_point timeout);
 public:
    static utils::UUID empty_version;

--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -1588,7 +1588,7 @@ db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type
        bool failed = false;

        work(file f, position_type o = 0)
-                : f(f), fin(make_file_input_stream(f, o, make_file_input_stream_options())), start_off(o) {
+                : f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options())), start_off(o) {
        }
        work(work&&) = default;

--- a/db/commitlog/commitlog_replayer.cc
+++ b/db/commitlog/commitlog_replayer.cc
@@ -61,13 +61,19 @@

 static logging::logger logger("commitlog_replayer");

-struct column_mappings {
-    std::unordered_map<table_schema_version, column_mapping> map;
-    future<> stop() { return make_ready_future<>(); }
-};
-
 class db::commitlog_replayer::impl {
-    seastar::sharded<column_mappings> _column_mappings;
+    struct column_mappings {
+        std::unordered_map<table_schema_version, column_mapping> map;
+        future<> stop() { return make_ready_future<>(); }
+    };
+
+    // we want the processing methods to be const, since they use
+    // shard-sharing of data -> read only
+    // this one is special since it is thread local.
+    // Should actually make sharded::local a const function (it does
+    // not modify content), but...
+    mutable seastar::sharded<column_mappings> _column_mappings;
+
    friend class db::commitlog_replayer;
 public:
    impl(seastar::sharded<cql3::query_processor>& db);
@@ -94,13 +100,35 @@ public:
        }
    };

-    future<> process(stats*, temporary_buffer<char> buf, replay_position rp);
-    future<stats> recover(sstring file);
+    // move start/stop of the thread local bookkeep to "top level"
+    // and also make sure to assert on it actually being started.
+    future<> start() {
+        return _column_mappings.start();
+    }
+    future<> stop() {
+        return _column_mappings.stop();
+    }
+
+    future<> process(stats*, temporary_buffer<char> buf, replay_position rp) const;
+    future<stats> recover(sstring file) const;

    typedef std::unordered_map<utils::UUID, replay_position> rp_map;
    typedef std::unordered_map<unsigned, rp_map> shard_rpm_map;
    typedef std::unordered_map<unsigned, replay_position> shard_rp_map;

+    replay_position min_pos(unsigned shard) const {
+        auto i = _min_pos.find(shard);
+        return i != _min_pos.end() ? i->second : replay_position();
+    }
+    replay_position cf_min_pos(const utils::UUID& uuid, unsigned shard) const {
+        auto i = _rpm.find(shard);
+        if (i == _rpm.end()) {
+            return replay_position();
+        }
+        auto j = i->second.find(uuid);
+        return j != i->second.end() ? j->second : replay_position();
+    }
+
    seastar::sharded<cql3::query_processor>&
        _qp;
    shard_rpm_map
@@ -175,7 +203,6 @@ future<> db::commitlog_replayer::impl::init() {
                }
            }
        }
-
        for (auto&p : _min_pos) {
            logger.debug("minimum position for shard {}: {}", p.first, p.second);
        }
@@ -188,9 +215,11 @@ future<> db::commitlog_replayer::impl::init() {
 }

 future<db::commitlog_replayer::impl::stats>
-db::commitlog_replayer::impl::recover(sstring file) {
+db::commitlog_replayer::impl::recover(sstring file) const {
+    assert(_column_mappings.local_is_initialized());
+
    replay_position rp{commitlog::descriptor(file)};
-    auto gp = _min_pos[rp.shard_id()];
+    auto gp = min_pos(rp.shard_id());

    if (rp.id < gp.id) {
        logger.debug("skipping replay of fully-flushed {}", file);
@@ -220,7 +249,7 @@ db::commitlog_replayer::impl::recover(sstring file) {
    });
 }

-future<> db::commitlog_replayer::impl::process(stats* s, temporary_buffer<char> buf, replay_position rp) {
+future<> db::commitlog_replayer::impl::process(stats* s, temporary_buffer<char> buf, replay_position rp) const {
    try {

        commitlog_entry_reader cer(buf);
@@ -238,17 +267,16 @@ future<> db::commitlog_replayer::impl::process(stats* s, temporary_buffer<char>
        const column_mapping& src_cm = cm_it->second;

        auto shard_id = rp.shard_id();
-        if (rp < _min_pos[shard_id]) {
+        if (rp < min_pos(shard_id)) {
            logger.trace("entry {} is less than global min position. skipping", rp);
            s->skipped_mutations++;
            return make_ready_future<>();
        }

        auto uuid = fm.column_family_id();
-        auto& map = _rpm[shard_id];
-        auto i = map.find(uuid);
-        if (i != map.end() && rp <= i->second) {
-            logger.trace("entry {} at {} is younger than recorded replay position {}. skipping", fm.column_family_id(), rp, i->second);
+        auto cf_rp = cf_min_pos(uuid, shard_id);
+        if (rp <= cf_rp) {
+            logger.trace("entry {} at {} is younger than recorded replay position {}. skipping", fm.column_family_id(), rp, cf_rp);
            s->skipped_mutations++;
            return make_ready_future<>();
        }
@@ -323,42 +351,55 @@ future<db::commitlog_replayer> db::commitlog_replayer::create_replayer(seastar::
 }

 future<> db::commitlog_replayer::recover(std::vector<sstring> files) {
-  return _impl->_column_mappings.start().then([this, files = std::move(files)] {
+    typedef std::unordered_multimap<unsigned, sstring> shard_file_map;
+
    logger.info("Replaying {}", join(", ", files));
-    return map_reduce(files, [this](auto f) {
-        logger.debug("Replaying {}", f);
-        return _impl->recover(f).then([f](impl::stats stats) {
-            if (stats.corrupt_bytes != 0) {
-                logger.warn("Corrupted file: {}. {} bytes skipped.", f, stats.corrupt_bytes);
-            }
-            logger.debug("Log replay of {} complete, {} replayed mutations ({} invalid, {} skipped)"
-                            , f
-                            , stats.applied_mutations
-                            , stats.invalid_mutations
-                            , stats.skipped_mutations
+
+    // pre-compute work per shard already.
+    auto map = ::make_lw_shared<shard_file_map>();
+    for (auto& f : files) {
+        commitlog::descriptor d(f);
+        replay_position p = d;
+        map->emplace(p.shard_id() % smp::count, std::move(f));
+    }
+
+    return _impl->start().then([this, map] {
+        return map_reduce(smp::all_cpus(), [this, map](unsigned id) {
+            return smp::submit_to(id, [this, id, map]() {
+                auto total = ::make_lw_shared<impl::stats>();
+                // TODO: or something. For now, we do this serialized per shard,
+                // to reduce mutation congestion. We could probably (says avi)
+                // do 2 segments in parallel or something, but lets use this first.
+                auto range = map->equal_range(id);
+                return do_for_each(range.first, range.second, [this, total](const std::pair<unsigned, sstring>& p) {
+                    auto&f = p.second;
+                    logger.debug("Replaying {}", f);
+                    return _impl->recover(f).then([f, total](impl::stats stats) {
+                        if (stats.corrupt_bytes != 0) {
+                            logger.warn("Corrupted file: {}. {} bytes skipped.", f, stats.corrupt_bytes);
+                        }
+                        logger.debug("Log replay of {} complete, {} replayed mutations ({} invalid, {} skipped)"
+                                        , f
+                                        , stats.applied_mutations
+                                        , stats.invalid_mutations
+                                        , stats.skipped_mutations
+                        );
+                        *total += stats;
+                    });
+                }).then([total] {
+                    return make_ready_future<impl::stats>(*total);
+                });
+            });
+        }, impl::stats(), std::plus<impl::stats>()).then([](impl::stats totals) {
+            logger.info("Log replay complete, {} replayed mutations ({} invalid, {} skipped)"
+                            , totals.applied_mutations
+                            , totals.invalid_mutations
+                            , totals.skipped_mutations
            );
-            return make_ready_future<impl::stats>(stats);
-        }).handle_exception([f](auto ep) -> future<impl::stats> {
-            logger.error("Error recovering {}: {}", f, ep);
-            try {
-                std::rethrow_exception(ep);
-            } catch (std::invalid_argument&) {
-                logger.error("Scylla cannot process {}. Make sure to fully flush all Cassandra commit log files to sstable before migrating.", f);
-                throw;
-            } catch (...) {
-                throw;
-            }
        });
-    }, impl::stats(), std::plus<impl::stats>()).then([](impl::stats totals) {
-        logger.info("Log replay complete, {} replayed mutations ({} invalid, {} skipped)"
-                        , totals.applied_mutations
-                        , totals.invalid_mutations
-                        , totals.skipped_mutations
-        );
    }).finally([this] {
-        return _impl->_column_mappings.stop();
+        return _impl->stop();
    });
-  });
 }

 future<> db::commitlog_replayer::recover(sstring f) {
--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -77,6 +77,15 @@ namespace schema_tables {

 logging::logger logger("schema_tables");

+struct push_back_and_return {
+    std::vector<mutation> muts;
+
+    std::vector<mutation> operator()(mutation&& m) {
+        muts.emplace_back(std::move(m));
+        return std::move(muts);
+    }
+};
+
 struct qualified_name {
    sstring keyspace_name;
    sstring table_name;
@@ -547,6 +556,14 @@ read_schema_partition_for_table(distributed<service::storage_proxy>& proxy, sche
    return query_partition_mutation(proxy.local(), std::move(schema), std::move(cmd), std::move(keyspace_key));
 }

+future<mutation>
+read_keyspace_mutation(distributed<service::storage_proxy>& proxy, const sstring& keyspace_name) {
+    schema_ptr s = keyspaces();
+    auto key = partition_key::from_singular(*s, keyspace_name);
+    auto cmd = make_lw_shared<query::read_command>(s->id(), s->version(), query::full_slice);
+    return query_partition_mutation(proxy.local(), std::move(s), std::move(cmd), std::move(key));
+}
+
 static semaphore the_merge_lock {1};

 future<> merge_lock() {
@@ -1182,19 +1199,18 @@ void add_type_to_schema_mutation(user_type type, api::timestamp_type timestamp,
    mutations.emplace_back(std::move(m));
 }

-std::vector<mutation> make_create_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp)
+future<std::vector<mutation>> make_create_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp)
 {
-    // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
-    auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
+    std::vector<mutation> mutations;
    add_type_to_schema_mutation(type, timestamp, mutations);
-    return mutations;
+
+    // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
+    return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
 }

-std::vector<mutation> make_drop_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp)
+future<std::vector<mutation>> make_drop_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp)
 {
-    // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
-    auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
-
+    std::vector<mutation> mutations;
    schema_ptr s = usertypes();
    auto pkey = partition_key::from_singular(*s, type->_keyspace);
    auto ckey = clustering_key::from_singular(*s, type->get_name_as_string());
@@ -1202,19 +1218,21 @@ std::vector<mutation> make_drop_type_mutations(lw_shared_ptr<keyspace_metadata>
    m.partition().apply_delete(*s, ckey, tombstone(timestamp, gc_clock::now()));
    mutations.emplace_back(std::move(m));

-    return mutations;
+    // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
+    return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
 }

 /*
 * Table metadata serialization/deserialization.
 */

-std::vector<mutation> make_create_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp)
+future<std::vector<mutation>> make_create_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp)
 {
-    // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
-    auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
+    std::vector<mutation> mutations;
    add_table_or_view_to_schema_mutation(table, timestamp, true, mutations);
-    return mutations;
+
+    // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
+    return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
 }

 static schema_mutations make_table_mutations(schema_ptr table, api::timestamp_type timestamp, bool with_columns_and_triggers)
@@ -1347,15 +1365,13 @@ static void make_update_columns_mutations(schema_ptr old_table,
    mutations.emplace_back(std::move(columns_mutation));
 }

-std::vector<mutation> make_update_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace,
+future<std::vector<mutation>> make_update_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace,
    schema_ptr old_table,
    schema_ptr new_table,
    api::timestamp_type timestamp,
    bool from_thrift)
 {
-    // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
-    auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
-
+    std::vector<mutation> mutations;
    add_table_or_view_to_schema_mutation(new_table, timestamp, false, mutations);

    make_update_columns_mutations(std::move(old_table), std::move(new_table), timestamp, from_thrift, mutations);
@@ -1373,7 +1389,8 @@ std::vector<mutation> make_update_table_mutations(lw_shared_ptr<keyspace_metadat
            addTriggerToSchemaMutation(newTable, trigger, timestamp, mutation);

 #endif
-    return mutations;
+    // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
+    return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
 }

 static void make_drop_table_or_view_mutations(schema_ptr schema_table,
@@ -1390,10 +1407,9 @@ static void make_drop_table_or_view_mutations(schema_ptr schema_table,
    }
 }

-std::vector<mutation> make_drop_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp)
+future<std::vector<mutation>> make_drop_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp)
 {
-    // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
-    auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
+    std::vector<mutation> mutations;
    make_drop_table_or_view_mutations(columnfamilies(), std::move(table), timestamp, mutations);

 #if 0
@@ -1405,7 +1421,8 @@ std::vector<mutation> make_drop_table_mutations(lw_shared_ptr<keyspace_metadata>
    for (String indexName : Keyspace.open(keyspace.name).getColumnFamilyStore(table.cfName).getBuiltIndexes())
        indexCells.addTombstone(indexCells.getComparator().makeCellName(indexName), ldt, timestamp);
 #endif
-    return mutations;
+    // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
+    return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
 }

 static future<schema_mutations> read_table_mutations(distributed<service::storage_proxy>& proxy, const qualified_name& table, schema_ptr s)
@@ -1899,37 +1916,39 @@ schema_mutations make_schema_mutations(schema_ptr s, api::timestamp_type timesta
    return s->is_view() ? make_view_mutations(view_ptr(s), timestamp, with_columns) : make_table_mutations(s, timestamp, with_columns);
 }

-std::vector<mutation> make_create_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp)
+future<std::vector<mutation>> make_create_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp)
 {
-    // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
-    auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
+    std::vector<mutation> mutations;
    // And also the serialized base table.
    auto base = keyspace->cf_meta_data().at(view->view_info()->base_name());
    add_table_or_view_to_schema_mutation(base, timestamp, true, mutations);
    add_table_or_view_to_schema_mutation(view, timestamp, true, mutations);
-    return mutations;
+
+    // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
+    return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
 }

-std::vector<mutation> make_update_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace,
+future<std::vector<mutation>> make_update_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace,
                                                 view_ptr old_view,
                                                 view_ptr new_view,
                                                 api::timestamp_type timestamp)
 {
-    // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
-    auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
+    std::vector<mutation> mutations;
    // And also the serialized base table.
    auto base = keyspace->cf_meta_data().at(new_view->view_info()->base_name());
    add_table_or_view_to_schema_mutation(base, timestamp, true, mutations);
    add_table_or_view_to_schema_mutation(new_view, timestamp, false, mutations);
    make_update_columns_mutations(old_view, new_view, timestamp, false, mutations);
-    return mutations;
+
+    // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
+    return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
 }

-std::vector<mutation> make_drop_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp) {
-    // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
-    auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
+future<std::vector<mutation>> make_drop_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp) {
+    std::vector<mutation> mutations;
    make_drop_table_or_view_mutations(views(), view, timestamp, mutations);
-    return mutations;
+    // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
+    return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
 }

 #if 0
--- a/db/schema_tables.hh
+++ b/db/schema_tables.hh
@@ -80,6 +80,7 @@ future<std::vector<frozen_mutation>> convert_schema_to_mutations(distributed<ser

 future<schema_result_value_type>
 read_schema_partition_for_keyspace(distributed<service::storage_proxy>& proxy, const sstring& schema_table_name, const sstring& keyspace_name);
+future<mutation> read_keyspace_mutation(distributed<service::storage_proxy>&, const sstring& keyspace_name);

 future<> merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations);

@@ -95,17 +96,17 @@ std::vector<mutation> make_drop_keyspace_mutations(lw_shared_ptr<keyspace_metada

 lw_shared_ptr<keyspace_metadata> create_keyspace_from_schema_partition(const schema_result_value_type& partition);

-std::vector<mutation> make_create_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp);
+future<std::vector<mutation>> make_create_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp);

 std::vector<user_type> create_types_from_schema_partition(const schema_result_value_type& result);

-std::vector<mutation> make_drop_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp);
+future<std::vector<mutation>> make_drop_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp);

 void add_type_to_schema_mutation(user_type type, api::timestamp_type timestamp, std::vector<mutation>& mutations);

-std::vector<mutation> make_create_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp);
+future<std::vector<mutation>> make_create_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp);

-std::vector<mutation> make_update_table_mutations(
+future<std::vector<mutation>> make_update_table_mutations(
    lw_shared_ptr<keyspace_metadata> keyspace,
    schema_ptr old_table,
    schema_ptr new_table,
@@ -114,7 +115,7 @@ std::vector<mutation> make_update_table_mutations(

 future<std::map<sstring, schema_ptr>> create_tables_from_tables_partition(distributed<service::storage_proxy>& proxy, const schema_result::mapped_type& result);

-std::vector<mutation> make_drop_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp);
+future<std::vector<mutation>> make_drop_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp);

 future<schema_ptr> create_table_from_name(distributed<service::storage_proxy>& proxy, const sstring& keyspace, const sstring& table);

@@ -149,11 +150,11 @@ schema_mutations make_schema_mutations(schema_ptr s, api::timestamp_type timesta

 void add_table_or_view_to_schema_mutation(schema_ptr view, api::timestamp_type timestamp, bool with_columns, std::vector<mutation>& mutations);

-std::vector<mutation> make_create_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp);
+future<std::vector<mutation>> make_create_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp);

-std::vector<mutation> make_update_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr old_view, view_ptr new_view, api::timestamp_type timestamp);
+future<std::vector<mutation>> make_update_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr old_view, view_ptr new_view, api::timestamp_type timestamp);

-std::vector<mutation> make_drop_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp);
+future<std::vector<mutation>> make_drop_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp);

 sstring serialize_kind(column_kind kind);
 column_kind deserialize_kind(sstring kind);
--- a/dist/ami/files/scylla-ami
+++ b/dist/ami/files/scylla-ami
--- a/dist/common/modprobe.d/scylla-raid0.conf
+++ b/dist/common/modprobe.d/scylla-raid0.conf
@@ -0,0 +1 @@
+options raid0 devices_discard_performance=Y
--- a/dist/common/scripts/scylla_raid_setup
+++ b/dist/common/scripts/scylla_raid_setup
@@ -5,15 +5,20 @@
 . /usr/lib/scylla/scylla_lib.sh

 print_usage() {
-    echo "scylla-raid-setup --disks /dev/hda,/dev/hdb... --raiddev /dev/md0 --update-fstab"
+    echo "scylla-raid-setup --disks /dev/hda,/dev/hdb... --raiddev /dev/md0 --update-fstab --root /var/lib/scylla --volume-role [all|data|commitlog]"
    echo "  --disks	specify disks for RAID"
    echo "  --raiddev	MD device name for RAID"
    echo "  --update-fstab update /etc/fstab for RAID"
+    echo "  --root specify the root of the tree"
+    echo "  --volume-role specify how will this device be used (data, commitlog, or all)"
    exit 1
 }

 RAID=/dev/md0
 FSTAB=0
+ROOT=/var/lib/scylla
+ROLE="all"
+
 while [ $# -gt 0 ]; do
    case "$1" in
        "--disks")
@@ -29,12 +34,37 @@ while [ $# -gt 0 ]; do
            FSTAB=1
            shift 1
            ;;
+        "--root")
+            ROOT="$2"
+            shift 2
+            ;;
+        "--volume-role")
+            ROLE="$2"
+            shift 2
+            ;;
        *)
            print_usage
            ;;
    esac
 done

+ROOT=${ROOT%/}
+case "$ROLE" in
+    "all")
+        MOUNT_AT=$ROOT
+        ;;
+    "data")
+        MOUNT_AT="$ROOT/data"
+        ;;
+    "commitlog")
+        MOUNT_AT="$ROOT/commitlog"
+        ;;
+    *)
+        echo "Invalid role specified ($ROLE)"
+        print_usage
+        ;;
+esac
+
 if [ "$DISKS" = "" ]; then
    print_usage
 fi
@@ -51,8 +81,8 @@ if [ -e $RAID ]; then
    echo "$RAID is already using"
    exit 1
 fi
-if [ "`mount|grep /var/lib/scylla`" != "" ]; then
-    echo "/var/lib/scylla is already mounted"
+if mountpoint -q $MOUNT_AT; then
+    echo "$MOUNT_AT is already mounted"
    exit 1
 fi

@@ -61,18 +91,32 @@ if is_debian_variant; then
 else
    yum -y install mdadm xfsprogs
 fi
-mdadm --create --verbose --force --run $RAID --level=0 -c1024 --raid-devices=$NR_DISK $DISKS
-mkfs.xfs $RAID -f
-echo "DEVICE $DISKS" > /etc/mdadm.conf
-mdadm --detail --scan >> /etc/mdadm.conf
+if [ "$ID" = "ubuntu" ] && [ "$VERSION_ID" = "14.04" ]; then
+    mdadm --create --verbose --force --run $RAID --level=0 -c1024 --raid-devices=$NR_DISK $DISKS
+    mkfs.xfs $RAID -f
+else
+    for dsk in $DISKS; do
+        blkdiscard $dsk &
+    done
+    wait
+    mdadm --create --verbose --force --run $RAID --level=0 -c1024 --raid-devices=$NR_DISK $DISKS
+    mkfs.xfs $RAID -f -K
+fi
+mdadm --detail --scan > /etc/mdadm.conf
+
+mkdir -p "$MOUNT_AT"
+mount -t xfs -o noatime $RAID "$MOUNT_AT"
+
+# Create these directories unconditionally so that the script is robust to
+# ordering when it is run multiple times. This must happen after the mount,
+# in case we are mounting the root.
+mkdir -p "$ROOT/data"
+mkdir -p "$ROOT/commitlog"
+mkdir -p "$ROOT/coredump"
+chown scylla:scylla "$ROOT"
+chown scylla:scylla "$ROOT"/*
+
 if [ $FSTAB -ne 0 ]; then
    UUID=`blkid $RAID | awk '{print $2}'`
-    echo "$UUID /var/lib/scylla xfs noatime 0 0" >> /etc/fstab
+    echo "$UUID $MOUNT_AT xfs noatime 0 0" >> /etc/fstab
 fi
-mount -t xfs -o noatime $RAID /var/lib/scylla
-
-mkdir -p /var/lib/scylla/data
-mkdir -p /var/lib/scylla/commitlog
-mkdir -p /var/lib/scylla/coredump
-chown scylla:scylla /var/lib/scylla/*
-chown scylla:scylla /var/lib/scylla/
--- a/dist/common/scripts/scylla_setup
+++ b/dist/common/scripts/scylla_setup
@@ -81,7 +81,7 @@ verify_package() {
 }

 list_block_devices() {
-    if lsblk --help | grep -q -e -p; then
+    if lsblk --help | grep -q -e '^\s*-p'; then
        lsblk -pnr | awk '{ print $1 }'
    else
        ls -1 /dev/sd* /dev/hd* /dev/xvd* /dev/nvme* /dev/mapper/*  2>/dev/null|grep -v control
@@ -267,15 +267,18 @@ if [ $ENABLE_SERVICE -eq 1 ]; then
           printf "[housekeeping]\ncheck-version: True\n" > /etc/scylla.d/housekeeping.cfg
        fi
        if is_systemd; then
-            systemctl unmask scylla-housekeeping.timer
+            systemctl unmask scylla-housekeeping-daily.timer
+            systemctl unmask scylla-housekeeping-restart.timer
        fi
    else
        if [ ! -f /etc/scylla.d/housekeeping.cfg ]; then
           printf "[housekeeping]\ncheck-version: False\n" > /etc/scylla.d/housekeeping.cfg
        fi
        if is_systemd; then
-            systemctl mask scylla-housekeeping.timer
-            systemctl stop scylla-housekeeping.timer || true
+            systemctl mask scylla-housekeeping-daily.timer
+            systemctl mask scylla-housekeeping-restart.timer
+            systemctl stop scylla-housekeeping-daily.timer || true
+            systemctl stop scylla-housekeeping-restart.timer || true
        fi
    fi
 fi
--- a/dist/common/systemd/scylla-housekeeping-daily.service
+++ b/dist/common/systemd/scylla-housekeeping-daily.service
@@ -1,5 +1,5 @@
 [Unit]
-Description=Scylla Housekeeping
+Description=Scylla Housekeeping daily mode
 After=network.target

 [Service]
--- a/dist/common/systemd/scylla-housekeeping-daily.timer
+++ b/dist/common/systemd/scylla-housekeeping-daily.timer
@@ -0,0 +1,11 @@
+[Unit]
+Description=Run Scylla Housekeeping daily mode
+After=scylla-server.service
+BindsTo=scylla-server.service
+
+[Timer]
+OnActiveSec=1d
+OnUnitActiveSec=1d
+
+[Install]
+WantedBy=timers.target
--- a/dist/common/systemd/scylla-housekeeping-restart.service
+++ b/dist/common/systemd/scylla-housekeeping-restart.service
@@ -0,0 +1,12 @@
+[Unit]
+Description=Scylla Housekeeping restart mode
+After=network.target
+
+[Service]
+Type=simple
+User=scylla
+Group=scylla
+ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg version --mode r
+
+[Install]
+WantedBy=multi-user.target
--- a/dist/common/systemd/scylla-housekeeping-restart.timer
+++ b/dist/common/systemd/scylla-housekeeping-restart.timer
@@ -1,12 +1,11 @@
 [Unit]
-Description=Run Scylla Housekeeping daily
+Description=Run Scylla Housekeeping restart mode
 After=scylla-server.service
 BindsTo=scylla-server.service

 [Timer]
 # set OnActiveSec to 3 to safely avoid issues/1846
 OnActiveSec=3
-OnUnitActiveSec=1d

 [Install]
 WantedBy=timers.target
--- a/dist/common/systemd/scylla-server.service.in
+++ b/dist/common/systemd/scylla-server.service.in
@@ -2,7 +2,8 @@
 Description=Scylla Server
 After=network.target
 Wants=scylla-jmx.service
-Wants=scylla-housekeeping.timer
+Wants=scylla-housekeeping-restart.timer
+Wants=scylla-housekeeping-daily.timer

 [Service]
 PermissionsStartOnly=true
--- a/dist/debian/build_deb.sh
+++ b/dist/debian/build_deb.sh
@@ -84,7 +84,8 @@ if [ "$DISTRIBUTION" = "Debian" ]; then
    sed -i -e "s/@@COMPILER@@/g++-5/g" debian/rules
    sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++-5, libunwind-dev/g" debian/control
    sed -i -e "s#@@INSTALL@@##g" debian/scylla-server.install
-    sed -i -e "s#@@HKDOTTIMER@@#dist/common/systemd/scylla-housekeeping.timer /lib/systemd/system#g" debian/scylla-server.install
+    sed -i -e "s#@@HKDOTTIMER_D@@#dist/common/systemd/scylla-housekeeping-daily.timer /lib/systemd/system#g" debian/scylla-server.install
+    sed -i -e "s#@@HKDOTTIMER_R@@#dist/common/systemd/scylla-housekeeping-restart.timer /lib/systemd/system#g" debian/scylla-server.install
    sed -i -e "s#@@SYSCTL@@#dist/debian/sysctl.d/99-scylla.conf etc/sysctl.d#g" debian/scylla-server.install
 elif [ "$VERSION_ID" = "14.04" ]; then
    sed -i -e "s/@@REVISION@@/0ubuntu1/g" debian/changelog
@@ -92,7 +93,8 @@ elif [ "$VERSION_ID" = "14.04" ]; then
    sed -i -e "s/@@COMPILER@@/g++-5/g" debian/rules
    sed -i -e "s/@@BUILD_DEPENDS@@/g++-5, libunwind8-dev/g" debian/control
    sed -i -e "s#@@INSTALL@@#dist/debian/sudoers.d/scylla etc/sudoers.d#g" debian/scylla-server.install
-    sed -i -e "s#@@HKDOTTIMER@@##g" debian/scylla-server.install
+    sed -i -e "s#@@HKDOTTIMER_D@@##g" debian/scylla-server.install
+    sed -i -e "s#@@HKDOTTIMER_R@@##g" debian/scylla-server.install
    sed -i -e "s#@@SYSCTL@@#dist/debian/sysctl.d/99-scylla.conf etc/sysctl.d#g" debian/scylla-server.install
 else
    sed -i -e "s/@@REVISION@@/0ubuntu1/g" debian/changelog
@@ -100,7 +102,8 @@ else
    sed -i -e "s/@@COMPILER@@/g++/g" debian/rules
    sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++, libunwind-dev/g" debian/control
    sed -i -e "s#@@INSTALL@@##g" debian/scylla-server.install
-    sed -i -e "s#@@HKDOTTIMER@@#dist/common/systemd/scylla-housekeeping.timer /lib/systemd/system#g" debian/scylla-server.install
+    sed -i -e "s#@@HKDOTTIMER_D@@#dist/common/systemd/scylla-housekeeping-daily.timer /lib/systemd/system#g" debian/scylla-server.install
+    sed -i -e "s#@@HKDOTTIMER_R@@#dist/common/systemd/scylla-housekeeping-restart.timer /lib/systemd/system#g" debian/scylla-server.install
    sed -i -e "s#@@SYSCTL@@##g" debian/scylla-server.install
 fi
 if [ $DIST -gt 0 ]; then
@@ -116,7 +119,8 @@ fi

 cp dist/common/systemd/scylla-server.service.in debian/scylla-server.service
 sed -i -e "s#@@SYSCONFDIR@@#/etc/default#g" debian/scylla-server.service
-cp dist/common/systemd/scylla-housekeeping.service debian/scylla-server.scylla-housekeeping.service
+cp dist/common/systemd/scylla-housekeeping-daily.service debian/scylla-server.scylla-housekeeping-daily.service
+cp dist/common/systemd/scylla-housekeeping-restart.service debian/scylla-server.scylla-housekeeping-restart.service
 cp dist/common/systemd/node-exporter.service debian/scylla-server.node-exporter.service

 if [ "$VERSION_ID" = "14.04" ] && [ $REBUILD -eq 0 ]; then
--- a/dist/debian/control.in
+++ b/dist/debian/control.in
@@ -4,7 +4,7 @@ Homepage: http://scylladb.com
 Section: database
 Priority: optional
 Standards-Version: 3.9.5
-Build-Depends: debhelper (>= 9), libyaml-cpp-dev, liblz4-dev, libsnappy-dev, libcrypto++-dev, libjsoncpp-dev, libaio-dev, libthrift-dev, thrift-compiler, antlr3, antlr3-c++-dev, ragel, ninja-build, git, libboost-program-options1.55-dev | libboost-program-options-dev, libboost-filesystem1.55-dev | libboost-filesystem-dev, libboost-system1.55-dev | libboost-system-dev, libboost-thread1.55-dev | libboost-thread-dev, libboost-test1.55-dev | libboost-test-dev, libgnutls28-dev, libhwloc-dev, libnuma-dev, libpciaccess-dev, xfslibs-dev, python3-pyparsing, libxml2-dev, libsctp-dev, python-urwid, pciutils, libprotobuf-dev, protobuf-compiler, systemtap-sdt-dev, @@BUILD_DEPENDS@@
+Build-Depends: debhelper (>= 9), libyaml-cpp-dev, liblz4-dev, libsnappy-dev, libcrypto++-dev, libjsoncpp-dev, libaio-dev, libthrift-dev, thrift-compiler, antlr3, antlr3-c++-dev, ragel, ninja-build, git, libboost-program-options1.55-dev | libboost-program-options-dev, libboost-filesystem1.55-dev | libboost-filesystem-dev, libboost-system1.55-dev | libboost-system-dev, libboost-thread1.55-dev | libboost-thread-dev, libboost-test1.55-dev | libboost-test-dev, libgnutls28-dev, libhwloc-dev, libnuma-dev, libpciaccess-dev, xfslibs-dev, python3-pyparsing, libxml2-dev, libsctp-dev, python-urwid, pciutils, libprotobuf-dev, protobuf-compiler, systemtap-sdt-dev, libtool, automake, @@BUILD_DEPENDS@@

 Package: scylla-conf
 Architecture: any
--- a/dist/debian/dep/build_dependency.sh
+++ b/dist/debian/dep/build_dependency.sh
@@ -77,10 +77,11 @@ fi

 if [ "$DISTRIBUTION" = "Debian" ] && [ "$VERSION_ID" = "8" ]; then
    if [ ! -f build/gcc-5_*.deb ]; then
-        sudo cp dist/debian/dep/debian-stretch-source.list /etc/apt/sources.list.d/
-        sudo apt-get update
        cd build
-        apt-get source gcc-5/stretch=5.4.1-2
+        wget https://launchpad.net/debian/+archive/primary/+files/gcc-5_5.4.1-5.dsc
+        wget https://launchpad.net/debian/+archive/primary/+files/gcc-5_5.4.1.orig.tar.gz
+        wget https://launchpad.net/debian/+archive/primary/+files/gcc-5_5.4.1-5.diff.gz
+        dpkg-source -x gcc-5_5.4.1-5.dsc
        cd gcc-5-5.4.1
        # resolve build time dependencies manually, since mk-build-deps doesn't works for gcc package
        sudo apt-get install -y g++-multilib libc6-dev-i386 lib32gcc1 libc6-dev-x32 libx32gcc1 libc6-dbg m4 libtool autoconf2.64 autogen gawk zlib1g-dev systemtap-sdt-dev gperf bison flex gdb texinfo locales sharutils libantlr-java libffi-dev gnat-4.9 libisl-dev libmpc-dev libmpfr-dev libgmp-dev dejagnu realpath chrpath quilt doxygen graphviz ghostscript texlive-latex-base xsltproc libxml2-utils docbook-xsl-ns
--- a/dist/debian/dep/debian-gcc-5-jessie.diff
+++ b/dist/debian/dep/debian-gcc-5-jessie.diff
@@ -1,6 +1,5 @@
-diff -Nur debian/rules.conf /home/syuu/gcc-5-5.4.1/debian/rules.conf
--- debian/rules.conf	2016-10-14 04:54:21.000000000 +0000
-+++ /home/syuu/gcc-5-5.4.1/debian/rules.conf	2016-10-12 17:28:54.138711378 +0000
+--- debian/rules.conf	2017-02-24 19:02:52.000000000 +0000
+++ /home/syuu/gcc-5.5/gcc-5-5.4.1/debian/rules.conf	2017-02-24 18:13:59.000000000 +0000
@@ -206,7 +206,7 @@
   ifneq (,$(filter $(distrelease),vivid))
     BINUTILSBDV = 2.25-3~
@@ -10,14 +9,16 @@ diff -Nur debian/rules.conf /home/syuu/gcc-5-5.4.1/debian/rules.conf
   else ifneq (,$(filter $(distrelease),sid stretch xenial))
     BINUTILSBDV = 2.26.1
   endif
-@@ -387,9 +387,9 @@
+@@ -386,10 +386,10 @@
+   MPFR_BUILD_DEP = libmpfr-dev (>= 3.0.0-9~),
 endif
 
- ISL_BUILD_DEP = libisl-dev,
-ifneq (,$(filter $(distrelease),jessie sid experimental))
+-ISL_BUILD_DEP = libisl-dev,
+-ifneq (,$(filter $(distrelease),jessie stretch sid experimental))
 -  ISL_BUILD_DEP = libisl-dev (>= 0.14),
 -endif
-+#ifneq (,$(filter $(distrelease),jessie sid experimental))
+#ISL_BUILD_DEP = libisl-dev,
+#ifneq (,$(filter $(distrelease),jessie stretch sid experimental))
 +#  ISL_BUILD_DEP = libisl-dev (>= 0.14),
 +#endif
 
@@ -37,9 +38,8 @@ diff -Nur debian/rules.conf /home/syuu/gcc-5-5.4.1/debian/rules.conf
 ifneq ($(DEB_CROSS),yes)
 # all archs for which to create b-d's
 any_archs = alpha amd64 armel armhf arm64 i386 mips mipsel mips64 mips64el powerpc ppc64 ppc64el m68k sh4 sparc64 s390x x32
-diff -Nur debian/rules.defs /home/syuu/gcc-5-5.4.1/debian/rules.defs
--- debian/rules.defs	2016-10-14 04:54:21.000000000 +0000
-+++ /home/syuu/gcc-5-5.4.1/debian/rules.defs	2016-10-13 10:18:51.647631508 +0000
+--- debian/rules.defs	2017-02-24 19:02:52.000000000 +0000
+++ /home/syuu/gcc-5.5/gcc-5-5.4.1/debian/rules.defs	2017-02-24 18:13:59.000000000 +0000
@@ -412,7 +412,7 @@
 # gcc versions (fixincludes, libgcj-common) ...
 #with_common_pkgs := yes
--- a/dist/debian/dep/debian-stretch-source.list
+++ b/dist/debian/dep/debian-stretch-source.list
@@ -1,2 +0,0 @@
-deb-src http://httpredir.debian.org/debian stretch main
-deb-src http://httpredir.debian.org/debian stretch-updates main
--- a/dist/debian/rules.in
+++ b/dist/debian/rules.in
@@ -11,7 +11,8 @@ override_dh_auto_clean:

 override_dh_installinit:
 	dh_installinit --no-start @@DH_INSTALLINIT@@
-	dh_installinit --no-start --name scylla-housekeeping @@DH_INSTALLINIT@@
+	dh_installinit --no-start --name scylla-housekeeping-daily @@DH_INSTALLINIT@@
+	dh_installinit --no-start --name scylla-housekeeping-restart @@DH_INSTALLINIT@@
 	dh_installinit --no-start --name node-exporter @@DH_INSTALLINIT@@

 override_dh_strip:
--- a/dist/debian/scylla-server.install.in
+++ b/dist/debian/scylla-server.install.in
@@ -15,6 +15,7 @@ build/release/iotune usr/bin
 dist/common/bin/scyllatop usr/bin
 dist/common/sbin/* usr/sbin
@@ADDHKCFG@@
-@@HKDOTTIMER@@
+@@HKDOTTIMER_D@@
+@@HKDOTTIMER_R@@
@@INSTALL@@
@@SYSCTL@@
--- a/dist/docker/redhat/Dockerfile
+++ b/dist/docker/redhat/Dockerfile
@@ -7,7 +7,7 @@ ENV container docker
 VOLUME [ "/sys/fs/cgroup" ]

 #install scylla
-RUN curl http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo -o /etc/yum.repos.d/scylla.repo
+RUN curl http://downloads.scylladb.com/rpm/centos/scylla-1.7.repo -o /etc/yum.repos.d/scylla.repo
 RUN yum -y install epel-release
 RUN yum -y clean expire-cache
 RUN yum -y update
@@ -38,6 +38,6 @@ ADD commandlineparser.py /commandlineparser.py
 ADD docker-entrypoint.py /docker-entrypoint.py
 ENTRYPOINT ["/docker-entrypoint.py"]

-EXPOSE 10000 9042 9160 7000 7001
+EXPOSE 10000 9042 9160 9180 7000 7001
 VOLUME [ "/var/lib/scylla" ]
 RUN chown -R scylla.scylla /var/lib/scylla
--- a/dist/redhat/centos_dep/build_dependency.sh
+++ b/dist/redhat/centos_dep/build_dependency.sh
@@ -28,10 +28,6 @@ if [ ! -f boost-1.58.0-11.fc23.src.rpm ]; then
    wget -nv https://kojipkgs.fedoraproject.org//packages/boost/1.58.0/11.fc23/src/boost-1.58.0-11.fc23.src.rpm
 fi

-if [ ! -f ninja-build-1.6.0-2.fc23.src.rpm ]; then
-    wget -nv https://kojipkgs.fedoraproject.org//packages/ninja-build/1.6.0/2.fc23/src/ninja-build-1.6.0-2.fc23.src.rpm
-fi
-
 if [ ! -f ragel-6.8-5.fc23.src.rpm ]; then
   wget -nv https://kojipkgs.fedoraproject.org//packages/ragel/6.8/5.fc23/src/ragel-6.8-5.fc23.src.rpm
 fi
@@ -94,13 +90,6 @@ if [ ! -f $RPMBUILD/RPMS/x86_64/scylla-boost-1.58.0-11.el7*.x86_64.rpm ]; then
 fi
 do_install scylla-boost*

-if [ ! -f $RPMBUILD/RPMS/x86_64/scylla-ninja-build-1.6.0-2.el7*.x86_64.rpm ]; then
-   rpm --define "_topdir $RPMBUILD" -ivh build/srpms/ninja-build-1.6.0-2.fc23.src.rpm
-   patch $RPMBUILD/SPECS/ninja-build.spec < dist/redhat/centos_dep/ninja-build.diff
-   rpmbuild --define "_topdir $RPMBUILD" -ba $RPMBUILD/SPECS/ninja-build.spec
-fi
-do_install scylla-ninja-build-1.6.0-2.el7*.x86_64.rpm
-
 if [ ! -f $RPMBUILD/RPMS/x86_64/scylla-ragel-6.8-5.el7*.x86_64.rpm ]; then
    rpm --define "_topdir $RPMBUILD" -ivh build/srpms/ragel-6.8-5.fc23.src.rpm
    patch $RPMBUILD/SPECS/ragel.spec < dist/redhat/centos_dep/ragel.diff
--- a/dist/redhat/centos_dep/ninja-build.diff
+++ b/dist/redhat/centos_dep/ninja-build.diff
@@ -1,56 +0,0 @@
--- ninja-build.spec.orig	2016-01-20 14:41:16.892802134 +0000
-+++ ninja-build.spec	2016-01-20 14:44:42.453227192 +0000
-@@ -1,19 +1,18 @@
-Name:           ninja-build
-+Name:           scylla-ninja-build
- Version:        1.6.0
- Release:        2%{?dist}
- Summary:        A small build system with a focus on speed
- License:        ASL 2.0
- URL:            http://martine.github.com/ninja/
- Source0:        https://github.com/martine/ninja/archive/v%{version}.tar.gz#/ninja-%{version}.tar.gz
-Source1:        ninja.vim
- # Rename mentions of the executable name to be ninja-build.
- Patch1000:      ninja-1.6.0-binary-rename.patch
-+Requires:	scylla-env
- BuildRequires:  asciidoc
- BuildRequires:  gtest-devel
- BuildRequires:  python2-devel
-BuildRequires:  re2c >= 0.11.3
-Requires:       emacs-filesystem
-Requires:       vim-filesystem
-+#BuildRequires:  scylla-re2c >= 0.11.3
-+%define _prefix /opt/scylladb
- 
- %description
- Ninja is a small build system with a focus on speed. It differs from other
-@@ -32,15 +31,8 @@
- ./ninja -v ninja_test
- 
- %install
-# TODO: Install ninja_syntax.py?
-mkdir -p %{buildroot}/{%{_bindir},%{_datadir}/bash-completion/completions,%{_datadir}/emacs/site-lisp,%{_datadir}/vim/vimfiles/syntax,%{_datadir}/vim/vimfiles/ftdetect,%{_datadir}/zsh/site-functions}
-
-+mkdir -p %{buildroot}/opt/scylladb/bin
- install -pm755 ninja %{buildroot}%{_bindir}/ninja-build
-install -pm644 misc/bash-completion %{buildroot}%{_datadir}/bash-completion/completions/ninja-bash-completion
-install -pm644 misc/ninja-mode.el %{buildroot}%{_datadir}/emacs/site-lisp/ninja-mode.el
-install -pm644 misc/ninja.vim %{buildroot}%{_datadir}/vim/vimfiles/syntax/ninja.vim
-install -pm644 %{SOURCE1} %{buildroot}%{_datadir}/vim/vimfiles/ftdetect/ninja.vim
-install -pm644 misc/zsh-completion %{buildroot}%{_datadir}/zsh/site-functions/_ninja
- 
- %check
- # workaround possible too low default limits
-@@ -50,12 +42,6 @@
- %files
- %doc COPYING HACKING.md README doc/manual.html
- %{_bindir}/ninja-build
-%{_datadir}/bash-completion/completions/ninja-bash-completion
-%{_datadir}/emacs/site-lisp/ninja-mode.el
-%{_datadir}/vim/vimfiles/syntax/ninja.vim
-%{_datadir}/vim/vimfiles/ftdetect/ninja.vim
-# zsh does not have a -filesystem package
-%{_datadir}/zsh/
- 
- %changelog
- * Mon Nov 16 2015 Ben Boeckel <mathstuf@gmail.com> - 1.6.0-2
--- a/dist/redhat/scylla.spec.in
+++ b/dist/redhat/scylla.spec.in
@@ -27,9 +27,9 @@ Group:          Applications/Databases
 Summary:        The Scylla database server
 License:        AGPLv3
 URL:            http://www.scylladb.com/
-BuildRequires:  libaio-devel libstdc++-devel cryptopp-devel hwloc-devel numactl-devel libpciaccess-devel libxml2-devel zlib-devel thrift-devel yaml-cpp-devel lz4-devel snappy-devel jsoncpp-devel systemd-devel xz-devel pcre-devel elfutils-libelf-devel bzip2-devel keyutils-libs-devel xfsprogs-devel make gnutls-devel systemd-devel lksctp-tools-devel protobuf-devel protobuf-compiler libunwind-devel systemtap-sdt-devel
-%{?fedora:BuildRequires: boost-devel ninja-build ragel antlr3-tool antlr3-C++-devel python3 gcc-c++ libasan libubsan python3-pyparsing dnf-yum}
-%{?rhel:BuildRequires: scylla-libstdc++-static scylla-boost-devel scylla-boost-static scylla-ninja-build scylla-ragel scylla-antlr3-tool scylla-antlr3-C++-devel python34 scylla-gcc-c++ >= 5.1.1, python34-pyparsing}
+BuildRequires:  libaio-devel libstdc++-devel cryptopp-devel hwloc-devel numactl-devel libpciaccess-devel libxml2-devel zlib-devel thrift-devel yaml-cpp-devel lz4-devel snappy-devel jsoncpp-devel systemd-devel xz-devel pcre-devel elfutils-libelf-devel bzip2-devel keyutils-libs-devel xfsprogs-devel make gnutls-devel systemd-devel lksctp-tools-devel protobuf-devel protobuf-compiler libunwind-devel systemtap-sdt-devel libtool automake ninja-build
+%{?fedora:BuildRequires: boost-devel ragel antlr3-tool antlr3-C++-devel python3 gcc-c++ libasan libubsan python3-pyparsing dnf-yum}
+%{?rhel:BuildRequires: scylla-libstdc++-static scylla-boost-devel scylla-boost-static scylla-ragel scylla-antlr3-tool scylla-antlr3-C++-devel python34 scylla-gcc-c++ >= 5.1.1, python34-pyparsing}
 Requires:       scylla-conf systemd-libs hwloc collectd PyYAML python-urwid pciutils pyparsing python-requests curl util-linux python-setuptools pciutils
 %{?rhel:Requires: python34 python34-PyYAML}
 Conflicts:      abrt
@@ -63,6 +63,9 @@ mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/security/limits.d/
 mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/collectd.d/
 mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
 mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/scylla.d/
+%if 0%{?rhel}
+mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/
+%endif
 mkdir -p $RPM_BUILD_ROOT%{_sysctldir}/
 mkdir -p $RPM_BUILD_ROOT%{_docdir}/scylla/
 mkdir -p $RPM_BUILD_ROOT%{_unitdir}
@@ -73,6 +76,9 @@ install -m644 dist/common/limits.d/scylla.conf $RPM_BUILD_ROOT%{_sysconfdir}/sec
 install -m644 dist/common/collectd.d/scylla.conf $RPM_BUILD_ROOT%{_sysconfdir}/collectd.d/
 install -m644 dist/common/scylla.d/*.conf $RPM_BUILD_ROOT%{_sysconfdir}/scylla.d/
 install -m644 dist/common/sysctl.d/*.conf $RPM_BUILD_ROOT%{_sysctldir}/
+%if 0%{?rhel}
+install -m644 dist/common/modprobe.d/*.conf $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/
+%endif
 install -d -m755 $RPM_BUILD_ROOT%{_sysconfdir}/scylla
 install -m644 conf/scylla.yaml $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
 install -m644 conf/cassandra-rackdc.properties $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
@@ -151,10 +157,8 @@ rm -rf $RPM_BUILD_ROOT
 %{_docdir}/scylla/NOTICE.txt
 %{_docdir}/scylla/ORIGIN
 %{_docdir}/scylla/licenses/
-%{_unitdir}/scylla-server.service
-%{_unitdir}/scylla-housekeeping.service
-%{_unitdir}/scylla-housekeeping.timer
-%{_unitdir}/node-exporter.service
+%{_unitdir}/*.service
+%{_unitdir}/*.timer
 %{_bindir}/scylla
 %{_bindir}/iotune
 %{_bindir}/scyllatop
@@ -228,6 +232,7 @@ Group:          Applications/Databases
 Summary:        Scylla configuration package for the Linux kernel
 License:        AGPLv3
 URL:            http://www.scylladb.com/
+Requires:       kmod

 %description kernel-conf
 This package contains Linux kernel configuration changes for the Scylla database.  Install this package
@@ -237,9 +242,18 @@ if Scylla is the main application on your server and you wish to optimize its la
 # We cannot use the sysctl_apply rpm macro because it is not present in 7.0
 # following is a "manual" expansion
 /usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
+# Write modprobe.d params when module already loaded
+%if 0%{?rhel}
+if [ -e /sys/module/raid0/parameters/devices_discard_performance ]; then
+    echo Y > /sys/module/raid0/parameters/devices_discard_performance
+fi
+%endif

 %files kernel-conf
 %defattr(-,root,root)
+%if 0%{?rhel}
+%config(noreplace) %{_sysconfdir}/modprobe.d/*.conf
+%endif
 %{_sysctldir}/*.conf

 %changelog
--- a/gc_clock.hh
+++ b/gc_clock.hh
@@ -50,6 +50,12 @@ public:
    // for real time waits.
 };

+// Returns the time point that is earlier than t by d, or the minimum time point if that result cannot be represented.
+template<typename Clock, typename Duration, typename Rep, typename Period>
+inline
+auto saturating_subtract(std::chrono::time_point<Clock, Duration> t, std::chrono::duration<Rep, Period> d) -> decltype(t) {
+    return std::max(t, decltype(t)::min() + d) - d; // raise t to at least min() + d first, so subtracting d cannot go below min()
+}

 using expiry_opt = std::experimental::optional<gc_clock::time_point>;
 using ttl_opt = std::experimental::optional<gc_clock::duration>;
--- a/memtable.cc
+++ b/memtable.cc
@@ -65,17 +65,15 @@ future<> memtable::clear_gently() noexcept {
        auto t = std::make_unique<seastar::thread>(attr, [this] {
            auto& alloc = allocator();

-            // entries can no longer be moved after unlink_leftmost_without_rebalance()
-            // so need to disable compaction.
-            logalloc::reclaim_lock rl(*this);
-
            auto p = std::move(partitions);
            while (!p.empty()) {
                auto batch_size = std::min<size_t>(p.size(), 32);
                auto dirty_before = dirty_size();
                with_allocator(alloc, [&] () noexcept {
                    while (batch_size--) {
-                        alloc.destroy(p.unlink_leftmost_without_rebalance());
+                        p.erase_and_dispose(p.begin(), [&] (auto e) {
+                            alloc.destroy(e);
+                        });
                    }
                });
                remove_flushed_memory(dirty_before - dirty_size());
@@ -205,19 +203,23 @@ protected:
        , _range(&range)
    { }

-    memtable_entry* fetch_next_entry() {
+    memtable_entry* fetch_entry() {
        update_iterators();
        if (_i == _end) {
            return nullptr;
        } else {
            memtable_entry& e = *_i;
-            ++_i;
-            _last = e.key();
            _memtable->upgrade_entry(e);
            return &e;
        }
    }

+    void advance() { // Steps the reader past the entry _i currently points at.
+        memtable_entry& e = *_i; // dereferences _i, so _i must not be at _end here
+        _last = e.key(); // remember the key of the entry being left behind
+        ++_i;
+    }
+
    logalloc::allocating_section& read_section() {
        return _memtable->_read_section;
    }
@@ -287,14 +289,18 @@ public:
            return _delegate();
        }

-        logalloc::reclaim_lock _(region());
-        managed_bytes::linearization_context_guard lcg;
-        memtable_entry* e = fetch_next_entry();
-        if (!e) {
-             return make_ready_future<streamed_mutation_opt>(stdx::nullopt);
-        } else {
-            return make_ready_future<streamed_mutation_opt>(e->read(mtbl(), schema(), _slice));
-        }
+        return read_section()(region(), [&] {
+            return with_linearized_managed_bytes([&] {
+                memtable_entry* e = fetch_entry();
+                if (!e) {
+                    return make_ready_future<streamed_mutation_opt>(stdx::nullopt);
+                } else {
+                    auto ret =  make_ready_future<streamed_mutation_opt>(e->read(mtbl(), schema(), _slice));
+                    advance();
+                    return ret;
+                }
+            });
+        });
    }
 };

@@ -391,19 +397,24 @@ public:
    flush_reader& operator=(const flush_reader&) = delete;

    virtual future<streamed_mutation_opt> operator()() override {
-        logalloc::reclaim_lock _(region());
-        managed_bytes::linearization_context_guard lcg;
-        memtable_entry* e = fetch_next_entry();
-        if (!e) {
-            return make_ready_future<streamed_mutation_opt>(stdx::nullopt);
-        } else {
-            auto cr = query::clustering_key_filter_ranges::get_ranges(*schema(), query::full_slice, e->key().key());
-            auto snp = e->partition().read(schema());
-            auto mpsr = make_partition_snapshot_reader<partition_snapshot_accounter>(schema(), e->key(), std::move(cr), snp, region(), read_section(), mtbl(), _flushed_memory);
-            _flushed_memory.account_component(*e);
-            _flushed_memory.account_component(*snp);
-            return make_ready_future<streamed_mutation_opt>(std::move(mpsr));
-        }
+        return read_section()(region(), [&] {
+            return with_linearized_managed_bytes([&] {
+                memtable_entry* e = fetch_entry();
+                if (!e) {
+                    return make_ready_future<streamed_mutation_opt>(stdx::nullopt);
+                } else {
+                    auto cr = query::clustering_key_filter_ranges::get_ranges(*schema(), query::full_slice, e->key().key());
+                    auto snp = e->partition().read(schema());
+                    auto mpsr = make_partition_snapshot_reader<partition_snapshot_accounter>(schema(), e->key(), std::move(cr),
+                            snp, region(), read_section(), mtbl(), _flushed_memory);
+                    _flushed_memory.account_component(*e);
+                    _flushed_memory.account_component(*snp);
+                    auto ret = make_ready_future<streamed_mutation_opt>(std::move(mpsr));
+                    advance();
+                    return ret;
+                }
+            });
+        });
    }
 };

--- a/message/messaging_service.cc
+++ b/message/messaging_service.cc
@@ -274,7 +274,13 @@ void messaging_service::start_listen() {
        if (listen_to_bc) {
            _server_tls[1] = listen(utils::fb_utilities::get_broadcast_address());
        }
-
+    }
+    // Do this on just cpu 0, to avoid duplicate logs.
+    if (engine().cpu_id() == 0) {
+        if (_server_tls[0]) {
+            logger.info("Starting Encrypted Messaging Service on SSL port {}", _ssl_port);
+        }
+        logger.info("Starting Messaging Service on port {}", _port);
    }
 }

@@ -308,14 +314,6 @@ messaging_service::messaging_service(gms::inet_address ip
    if (listen_now) {
        start_listen();
    }
-
-    // Do this on just cpu 0, to avoid duplicate logs.
-    if (engine().cpu_id() == 0) {
-        if (_server_tls[0]) {
-            logger.info("Starting Encrypted Messaging Service on SSL port {}", _ssl_port);
-        }
-        logger.info("Starting Messaging Service on port {}", _port);
-    }
 }

 msg_addr messaging_service::get_source(const rpc::client_info& cinfo) {
--- a/mutation_compactor.hh
+++ b/mutation_compactor.hh
@@ -123,7 +123,7 @@ public:
              uint32_t partition_limit, CompactedMutationsConsumer consumer)
        : _schema(s)
        , _query_time(query_time)
-        , _gc_before(query_time - s.gc_grace_seconds())
+        , _gc_before(saturating_subtract(query_time, s.gc_grace_seconds()))
        , _can_gc(always_gc)
        , _slice(slice)
        , _row_limit(limit)
@@ -139,7 +139,7 @@ public:
                     std::function<api::timestamp_type(const dht::decorated_key&)> get_max_purgeable)
        : _schema(s)
        , _query_time(compaction_time)
-        , _gc_before(_query_time - s.gc_grace_seconds())
+        , _gc_before(saturating_subtract(_query_time, s.gc_grace_seconds()))
        , _get_max_purgeable(std::move(get_max_purgeable))
        , _can_gc([this] (tombstone t) { return can_gc(t); })
        , _slice(query::full_slice)
--- a/mutation_partition.cc
+++ b/mutation_partition.cc
@@ -1183,7 +1183,7 @@ uint32_t mutation_partition::do_compact(const schema& s,
 {
    assert(row_limit > 0);

-    auto gc_before = query_time - s.gc_grace_seconds();
+    auto gc_before = saturating_subtract(query_time, s.gc_grace_seconds());

    auto should_purge_tombstone = [&] (const tombstone& t) {
        return t.deletion_time < gc_before && can_gc(t);
--- a/query-result.hh
+++ b/query-result.hh
@@ -345,7 +345,7 @@ public:
        : _w(std::move(w))
        , _row_count(c)
        , _short_read(sr)
-        , _memory_tracker(std::move(_memory_tracker))
+        , _memory_tracker(std::move(memory_tracker))
        , _partition_count(pc)
    {
        w.reduce_chunk_count();
--- a/repair/repair.cc
+++ b/repair/repair.cc
@@ -492,6 +492,13 @@ static void split_and_add(std::vector<::dht::token_range>& ranges,
    auto midpoint = dht::global_partitioner().midpoint(
            range.start() ? range.start()->value() : dht::minimum_token(),
            range.end() ? range.end()->value() : dht::minimum_token());
+    // This shouldn't happen, but if the range included just one token, we
+    // can't split further (split() may actually fail with assertion failure)
+    if ((range.start() && midpoint == range.start()->value()) ||
+        (range.end() && midpoint == range.end()->value())) {
+        ranges.push_back(range);
+        return;
+    }
    auto halves = range.split(midpoint, dht::token_comparator());
    ranges.push_back(halves.first);
    ranges.push_back(halves.second);
--- a/schema.cc
+++ b/schema.cc
@@ -145,6 +145,20 @@ void schema::rebuild() {

    thrift()._compound = is_compound();
    thrift()._is_dynamic = clustering_key_size() > 0;
+
+    if (default_validator()->is_counter()) {
+        for (auto&& cdef : boost::range::join(static_columns(), regular_columns())) {
+            if (!cdef.type->is_counter()) {
+                throw exceptions::configuration_exception(sprint("Cannot add a non counter column (%s) in a counter column family", cdef.name_as_text()));
+            }
+        }
+    } else {
+        for (auto&& cdef : all_columns()) {
+            if (cdef.second->type->is_counter()) {
+                throw exceptions::configuration_exception(sprint("Cannot add a counter column (%s) in a non counter column family", cdef.second->name_as_text()));
+            }
+        }
+    }
 }

 const column_mapping& schema::get_column_mapping() const {
--- a/2
+++ b/2
--- a/service/migration_manager.cc
+++ b/service/migration_manager.cc
@@ -481,8 +481,10 @@ future<> migration_manager::announce_new_column_family(schema_ptr cfm, bool anno
            throw exceptions::already_exists_exception(cfm->ks_name(), cfm->cf_name());
        }
        logger.info("Create new ColumnFamily: {}", cfm);
-        auto mutations = db::schema_tables::make_create_table_mutations(keyspace.metadata(), cfm, api::new_timestamp());
-        return announce(std::move(mutations), announce_locally);
+        return db::schema_tables::make_create_table_mutations(keyspace.metadata(), cfm, api::new_timestamp())
+            .then([announce_locally, this] (auto&& mutations) {
+                return announce(std::move(mutations), announce_locally);
+            });
    } catch (const no_such_keyspace& e) {
        throw exceptions::configuration_exception(sprint("Cannot add table '%s' to non existing keyspace '%s'.", cfm->cf_name(), cfm->ks_name()));
    }
@@ -501,8 +503,10 @@ future<> migration_manager::announce_column_family_update(schema_ptr cfm, bool f
 #endif
        logger.info("Update table '{}.{}' From {} To {}", cfm->ks_name(), cfm->cf_name(), *old_schema, *cfm);
        auto&& keyspace = db.find_keyspace(cfm->ks_name());
-        auto mutations = db::schema_tables::make_update_table_mutations(keyspace.metadata(), old_schema, cfm, api::new_timestamp(), from_thrift);
-        return announce(std::move(mutations), announce_locally);
+        return db::schema_tables::make_update_table_mutations(keyspace.metadata(), old_schema, cfm, api::new_timestamp(), from_thrift)
+            .then([announce_locally] (auto&& mutations) {
+                return announce(std::move(mutations), announce_locally);
+            });
    } catch (const no_such_column_family& e) {
        throw exceptions::configuration_exception(sprint("Cannot update non existing table '%s' in keyspace '%s'.",
                                                         cfm->cf_name(), cfm->ks_name()));
@@ -512,8 +516,10 @@ future<> migration_manager::announce_column_family_update(schema_ptr cfm, bool f
 static future<> do_announce_new_type(user_type new_type, bool announce_locally) {
    auto& db = get_local_storage_proxy().get_db().local();
    auto&& keyspace = db.find_keyspace(new_type->_keyspace);
-    auto mutations = db::schema_tables::make_create_type_mutations(keyspace.metadata(), new_type, api::new_timestamp());
-    return migration_manager::announce(std::move(mutations), announce_locally);
+    return db::schema_tables::make_create_type_mutations(keyspace.metadata(), new_type, api::new_timestamp())
+        .then([announce_locally] (auto&& mutations) {
+            return migration_manager::announce(std::move(mutations), announce_locally);
+        });
 }

 future<> migration_manager::announce_new_type(user_type new_type, bool announce_locally) {
@@ -609,8 +615,10 @@ future<> migration_manager::announce_column_family_drop(const sstring& ks_name,
                        ks_name, ::join(", ", views | boost::adaptors::transformed([](auto&& v) { return v->cf_name(); }))));
        }
        logger.info("Drop table '{}.{}'", schema->ks_name(), schema->cf_name());
-        auto mutations = db::schema_tables::make_drop_table_mutations(db.find_keyspace(ks_name).metadata(), schema, api::new_timestamp());
-        return announce(std::move(mutations), announce_locally);
+        return db::schema_tables::make_drop_table_mutations(db.find_keyspace(ks_name).metadata(), schema, api::new_timestamp())
+            .then([announce_locally] (auto&& mutations) {
+                return announce(std::move(mutations), announce_locally);
+            });
    } catch (const no_such_column_family& e) {
        throw exceptions::configuration_exception(sprint("Cannot drop non existing table '%s' in keyspace '%s'.", cf_name, ks_name));
    }
@@ -621,8 +629,10 @@ future<> migration_manager::announce_type_drop(user_type dropped_type, bool anno
    auto& db = get_local_storage_proxy().get_db().local();
    auto&& keyspace = db.find_keyspace(dropped_type->_keyspace);
    logger.info("Drop User Type: {}", dropped_type->get_name_as_string());
-    auto mutations = db::schema_tables::make_drop_type_mutations(keyspace.metadata(), dropped_type, api::new_timestamp());
-    return announce(std::move(mutations), announce_locally);
+    return db::schema_tables::make_drop_type_mutations(keyspace.metadata(), dropped_type, api::new_timestamp())
+        .then([announce_locally] (auto&& mutations) {
+            return announce(std::move(mutations), announce_locally);
+        });
 }

 future<> migration_manager::announce_new_view(view_ptr view, bool announce_locally)
@@ -637,8 +647,10 @@ future<> migration_manager::announce_new_view(view_ptr view, bool announce_local
            throw exceptions::already_exists_exception(view->ks_name(), view->cf_name());
        }
        logger.info("Create new view: {}", view);
-        auto mutations = db::schema_tables::make_create_view_mutations(keyspace, std::move(view), api::new_timestamp());
-        return announce(std::move(mutations), announce_locally);
+        return db::schema_tables::make_create_view_mutations(keyspace, std::move(view), api::new_timestamp())
+            .then([announce_locally] (auto&& mutations) {
+                return announce(std::move(mutations), announce_locally);
+            });
    } catch (const no_such_keyspace& e) {
        throw exceptions::configuration_exception(sprint("Cannot add view '%s' to non existing keyspace '%s'.", view->cf_name(), view->ks_name()));
    }
@@ -660,8 +672,10 @@ future<> migration_manager::announce_view_update(view_ptr view, bool announce_lo
        oldCfm.validateCompatility(cfm);
 #endif
        logger.info("Update view '{}.{}' From {} To {}", view->ks_name(), view->cf_name(), *old_view, *view);
-        auto mutations = db::schema_tables::make_update_view_mutations(std::move(keyspace), view_ptr(old_view), std::move(view), api::new_timestamp());
-        return announce(std::move(mutations), announce_locally);
+        return db::schema_tables::make_update_view_mutations(std::move(keyspace), view_ptr(old_view), std::move(view), api::new_timestamp())
+            .then([announce_locally] (auto&& mutations) {
+                return announce(std::move(mutations), announce_locally);
+            });
    } catch (const std::out_of_range& e) {
        throw exceptions::configuration_exception(sprint("Cannot update non existing materialized view '%s' in keyspace '%s'.",
                                                         view->cf_name(), view->ks_name()));
@@ -680,8 +694,10 @@ future<> migration_manager::announce_view_drop(const sstring& ks_name,
        }
        auto keyspace = db.find_keyspace(ks_name).metadata();
        logger.info("Drop view '{}.{}'", view->ks_name(), view->cf_name());
-        auto mutations = db::schema_tables::make_drop_view_mutations(std::move(keyspace), view_ptr(std::move(view)), api::new_timestamp());
-        return announce(std::move(mutations), announce_locally);
+        return db::schema_tables::make_drop_view_mutations(std::move(keyspace), view_ptr(std::move(view)), api::new_timestamp())
+            .then([announce_locally] (auto&& mutations) {
+                return announce(std::move(mutations), announce_locally);
+            });
    } catch (const no_such_column_family& e) {
        throw exceptions::configuration_exception(sprint("Cannot drop non existing materialized view '%s' in keyspace '%s'.",
                                                         cf_name, ks_name));
--- a/service/storage_proxy.cc
+++ b/service/storage_proxy.cc
@@ -478,7 +478,6 @@ inline uint64_t& storage_proxy::split_stats::get_ep_stat(gms::inet_address ep) {
 storage_proxy::~storage_proxy() {}
 storage_proxy::storage_proxy(distributed<database>& db) : _db(db) {
    namespace sm = seastar::metrics;
-
    _metrics.add_group(COORDINATOR_STATS_CATEGORY, {
        sm::make_queue_length("foreground_writes", [this] { return _stats.writes - _stats.background_writes; },
                       sm::description("number of currently pending foreground write requests")),
@@ -486,7 +485,7 @@ storage_proxy::storage_proxy(distributed<database>& db) : _db(db) {
        sm::make_queue_length("background_writes", [this] { return _stats.background_writes; },
                       sm::description("number of currently pending background write requests")),

-        sm::make_queue_length("throttled_writes", [this] { return _throttled_writes.size(); },
+        sm::make_queue_length("current_throttled_writes", [this] { return _throttled_writes.size(); },
                       sm::description("number of currently throttled write requests")),

        sm::make_total_operations("throttled_writes", [this] { return _stats.throttled_writes; },
@@ -1733,14 +1732,14 @@ protected:
    size_t _targets_count;
    promise<> _done_promise; // all target responded
    bool _timedout = false; // will be true if request timeouts
-    timer<lowres_clock> _timeout;
+    timer<storage_proxy::clock_type> _timeout;
    size_t _responses = 0;
    schema_ptr _schema;

    virtual void on_timeout() {}
    virtual size_t response_count() const = 0;
 public:
-    abstract_read_resolver(schema_ptr schema, db::consistency_level cl, size_t target_count, lowres_clock::time_point timeout)
+    abstract_read_resolver(schema_ptr schema, db::consistency_level cl, size_t target_count, storage_proxy::clock_type::time_point timeout)
        : _cl(cl)
        , _targets_count(target_count)
        , _schema(std::move(schema))
@@ -1796,7 +1795,7 @@ class digest_read_resolver : public abstract_read_resolver {
        return _digest_results.size();
    }
 public:
-    digest_read_resolver(schema_ptr schema, db::consistency_level cl, size_t block_for, lowres_clock::time_point timeout) : abstract_read_resolver(std::move(schema), cl, 0, timeout), _block_for(block_for) {}
+    digest_read_resolver(schema_ptr schema, db::consistency_level cl, size_t block_for, storage_proxy::clock_type::time_point timeout) : abstract_read_resolver(std::move(schema), cl, 0, timeout), _block_for(block_for) {}
    void add_data(gms::inet_address from, foreign_ptr<lw_shared_ptr<query::result>> result) {
        if (!_timedout) {
            // if only one target was queried digest_check() will be skipped so we can also skip digest calculation
@@ -2143,7 +2142,7 @@ private:
        return false;
    }
 public:
-    data_read_resolver(schema_ptr schema, db::consistency_level cl, size_t targets_count, lowres_clock::time_point timeout) : abstract_read_resolver(std::move(schema), cl, targets_count, timeout) {
+    data_read_resolver(schema_ptr schema, db::consistency_level cl, size_t targets_count, storage_proxy::clock_type::time_point timeout) : abstract_read_resolver(std::move(schema), cl, targets_count, timeout) {
        _data_results.reserve(targets_count);
    }
    void add_mutate_data(gms::inet_address from, foreign_ptr<lw_shared_ptr<reconcilable_result>> result) {
@@ -2330,7 +2329,7 @@ protected:
    using targets_iterator = std::vector<gms::inet_address>::iterator;
    using digest_resolver_ptr = ::shared_ptr<digest_read_resolver>;
    using data_resolver_ptr = ::shared_ptr<data_read_resolver>;
-    using clock_type = lowres_clock;
+    using clock_type = storage_proxy::clock_type;

    schema_ptr _schema;
    shared_ptr<storage_proxy> _proxy;
@@ -2454,7 +2453,7 @@ protected:
    uint32_t original_partition_limit() const {
        return _cmd->partition_limit;
    }
-    void reconcile(db::consistency_level cl, lowres_clock::time_point timeout, lw_shared_ptr<query::read_command> cmd) {
+    void reconcile(db::consistency_level cl, storage_proxy::clock_type::time_point timeout, lw_shared_ptr<query::read_command> cmd) {
        data_resolver_ptr data_resolver = ::make_shared<data_read_resolver>(_schema, cl, _targets.size(), timeout);
        auto exec = shared_from_this();

@@ -2529,12 +2528,12 @@ protected:
            }
        });
    }
-    void reconcile(db::consistency_level cl, lowres_clock::time_point timeout) {
+    void reconcile(db::consistency_level cl, storage_proxy::clock_type::time_point timeout) {
        reconcile(cl, timeout, _cmd);
    }

 public:
-    virtual future<foreign_ptr<lw_shared_ptr<query::result>>> execute(lowres_clock::time_point timeout) {
+    virtual future<foreign_ptr<lw_shared_ptr<query::result>>> execute(storage_proxy::clock_type::time_point timeout) {
        digest_resolver_ptr digest_resolver = ::make_shared<digest_read_resolver>(_schema, _cl, _block_for, timeout);
        auto exec = shared_from_this();

@@ -2604,7 +2603,7 @@ public:
 class always_speculating_read_executor : public abstract_read_executor {
 public:
    using abstract_read_executor::abstract_read_executor;
-    virtual future<> make_requests(digest_resolver_ptr resolver, lowres_clock::time_point timeout) {
+    virtual future<> make_requests(digest_resolver_ptr resolver, storage_proxy::clock_type::time_point timeout) {
        resolver->add_wait_targets(_targets.size());
        // FIXME: consider disabling for CL=*ONE
        bool want_digest = true;
@@ -2615,10 +2614,10 @@ public:

 // this executor sends request to an additional replica after some time below timeout
 class speculating_read_executor : public abstract_read_executor {
-    timer<> _speculate_timer;
+    timer<storage_proxy::clock_type> _speculate_timer;
 public:
    using abstract_read_executor::abstract_read_executor;
-    virtual future<> make_requests(digest_resolver_ptr resolver, lowres_clock::time_point timeout) {
+    virtual future<> make_requests(digest_resolver_ptr resolver, storage_proxy::clock_type::time_point timeout) {
        _speculate_timer.set_callback([this, resolver, timeout] {
            if (!resolver->is_completed()) { // at the time the callback runs request may be completed already
                resolver->add_wait_targets(1); // we send one more request so wait for it too
@@ -2664,7 +2663,7 @@ class range_slice_read_executor : public abstract_read_executor {
 public:
    range_slice_read_executor(schema_ptr s, shared_ptr<storage_proxy> proxy, lw_shared_ptr<query::read_command> cmd, dht::partition_range pr, db::consistency_level cl, std::vector<gms::inet_address> targets, tracing::trace_state_ptr trace_state) :
                                    abstract_read_executor(std::move(s), std::move(proxy), std::move(cmd), std::move(pr), cl, targets.size(), std::move(targets), std::move(trace_state)) {}
-    virtual future<foreign_ptr<lw_shared_ptr<query::result>>> execute(lowres_clock::time_point timeout) override {
+    virtual future<foreign_ptr<lw_shared_ptr<query::result>>> execute(storage_proxy::clock_type::time_point timeout) override {
        reconcile(_cl, timeout);
        return _result_promise.get_future();
    }
@@ -2795,7 +2794,7 @@ future<foreign_ptr<lw_shared_ptr<query::result>>>
 storage_proxy::query_singular(lw_shared_ptr<query::read_command> cmd, dht::partition_range_vector&& partition_ranges, db::consistency_level cl, tracing::trace_state_ptr trace_state) {
    std::vector<::shared_ptr<abstract_read_executor>> exec;
    exec.reserve(partition_ranges.size());
-    auto timeout = lowres_clock::now() + std::chrono::milliseconds(_db.local().get_config().read_request_timeout_in_ms());
+    auto timeout = storage_proxy::clock_type::now() + std::chrono::milliseconds(_db.local().get_config().read_request_timeout_in_ms());

    for (auto&& pr: partition_ranges) {
        if (!pr.is_singular()) {
@@ -2819,7 +2818,7 @@ storage_proxy::query_singular(lw_shared_ptr<query::read_command> cmd, dht::parti
 }

 future<std::vector<foreign_ptr<lw_shared_ptr<query::result>>>>
-storage_proxy::query_partition_key_range_concurrent(lowres_clock::time_point timeout, std::vector<foreign_ptr<lw_shared_ptr<query::result>>>&& results,
+storage_proxy::query_partition_key_range_concurrent(storage_proxy::clock_type::time_point timeout, std::vector<foreign_ptr<lw_shared_ptr<query::result>>>&& results,
        lw_shared_ptr<query::read_command> cmd, db::consistency_level cl, dht::partition_range_vector::iterator&& i,
        dht::partition_range_vector&& ranges, int concurrency_factor, tracing::trace_state_ptr trace_state,
        uint32_t remaining_row_count, uint32_t remaining_partition_count) {
@@ -2923,7 +2922,7 @@ storage_proxy::query_partition_key_range(lw_shared_ptr<query::read_command> cmd,
    schema_ptr schema = local_schema_registry().get(cmd->schema_version);
    keyspace& ks = _db.local().find_keyspace(schema->ks_name());
    dht::partition_range_vector ranges;
-    auto timeout = lowres_clock::now() + std::chrono::milliseconds(_db.local().get_config().read_request_timeout_in_ms());
+    auto timeout = storage_proxy::clock_type::now() + std::chrono::milliseconds(_db.local().get_config().read_request_timeout_in_ms());

    // when dealing with LocalStrategy keyspaces, we can skip the range splitting and merging (which can be
    // expensive in clusters with vnodes)
--- a/service/storage_proxy.hh
+++ b/service/storage_proxy.hh
@@ -71,7 +71,7 @@ public:
 private:
    struct rh_entry {
        ::shared_ptr<abstract_write_response_handler> handler;
-        timer<lowres_clock> expire_timer;
+        timer<clock_type> expire_timer;
        rh_entry(::shared_ptr<abstract_write_response_handler>&& h, std::function<void()>&& cb);
    };

@@ -253,7 +253,7 @@ private:
    dht::partition_range_vector get_restricted_ranges(keyspace& ks, const schema& s, dht::partition_range range);
    float estimate_result_rows_per_range(lw_shared_ptr<query::read_command> cmd, keyspace& ks);
    static std::vector<gms::inet_address> intersection(const std::vector<gms::inet_address>& l1, const std::vector<gms::inet_address>& l2);
-    future<std::vector<foreign_ptr<lw_shared_ptr<query::result>>>> query_partition_key_range_concurrent(lowres_clock::time_point timeout,
+    future<std::vector<foreign_ptr<lw_shared_ptr<query::result>>>> query_partition_key_range_concurrent(clock_type::time_point timeout,
            std::vector<foreign_ptr<lw_shared_ptr<query::result>>>&& results, lw_shared_ptr<query::read_command> cmd, db::consistency_level cl, dht::partition_range_vector::iterator&& i,
            dht::partition_range_vector&& ranges, int concurrency_factor, tracing::trace_state_ptr trace_state,
            uint32_t remaining_row_count, uint32_t remaining_partition_count);
--- a/sstables/compress.cc
+++ b/sstables/compress.cc
@@ -71,6 +71,12 @@ void compression::set_compressor(compressor c) {
     }
 }

+// locate() takes a byte position in the uncompressed stream, and finds
+// the location of the compressed chunk on disk which contains it, and the
+// offset in this chunk.
+// locate() may only be used for offsets of actual bytes, and in particular
+// the end-of-file position (one past the last byte) MUST not be used. If the
+// caller wants to read from the end of file, it should simply read nothing.
 compression::chunk_and_offset
 compression::locate(uint64_t position) const {
    auto ucl = uncompressed_chunk_length();
@@ -310,6 +316,9 @@ public:
    virtual future<temporary_buffer<char>> skip(uint64_t n) override {
        _pos += n;
        assert(_pos <= _end_pos);
+        if (_pos == _end_pos) {
+            return make_ready_future<temporary_buffer<char>>();
+        }
        auto addr = _compression_metadata->locate(_pos);
        auto underlying_n = addr.chunk_start - _underlying_pos;
        _underlying_pos = addr.chunk_start;
--- a/sstables/sstables.cc
+++ b/sstables/sstables.cc
@@ -1951,19 +1951,20 @@ void sstable_writer::prepare_file_writer()
    options.write_behind = 10;

    if (!_compression_enabled) {
-        _writer = make_shared<checksummed_file_writer>(std::move(_sst._data_file), std::move(options), true);
+        _writer = std::make_unique<checksummed_file_writer>(std::move(_sst._data_file), std::move(options), true);
    } else {
        prepare_compression(_sst._components->compression, _schema);
-        _writer = make_shared<file_writer>(make_compressed_file_output_stream(std::move(_sst._data_file), std::move(options), &_sst._components->compression));
+        _writer = std::make_unique<file_writer>(make_compressed_file_output_stream(std::move(_sst._data_file), std::move(options), &_sst._components->compression));
    }
 }

 void sstable_writer::finish_file_writer()
 {
-    _writer->close().get();
+    auto writer = std::move(_writer);
+    writer->close().get();

    if (!_compression_enabled) {
-        auto chksum_wr = static_pointer_cast<checksummed_file_writer>(_writer);
+        auto chksum_wr = static_cast<checksummed_file_writer*>(writer.get());
        write_digest(_sst._write_error_handler, _sst.filename(sstable::component_type::Digest), chksum_wr->full_checksum());
        write_crc(_sst._write_error_handler, _sst.filename(sstable::component_type::CRC), chksum_wr->finalize_checksum());
    } else {
@@ -1971,6 +1972,16 @@ void sstable_writer::finish_file_writer()
    }
 }

+sstable_writer::~sstable_writer() { // closes the data file writer if this object still owns one
+    if (_writer) { // finish_file_writer() and the move constructor both move _writer out
+        try {
+            _writer->close().get();
+        } catch (...) { // log and swallow so that nothing propagates out of the destructor
+            sstlog.error("sstable_writer failed to close file: {}", std::current_exception());
+        }
+    }
+}
+
 sstable_writer::sstable_writer(sstable& sst, const schema& s, uint64_t estimated_partitions,
                               uint64_t max_sstable_size, bool backup, bool leave_unsealed, const io_priority_class& pc)
    : _sst(sst)
--- a/sstables/sstables.hh
+++ b/sstables/sstables.hh
@@ -803,7 +803,7 @@ class sstable_writer {
    bool _backup;
    bool _leave_unsealed;
    bool _compression_enabled;
-    shared_ptr<file_writer> _writer;
+    std::unique_ptr<file_writer> _writer;
    stdx::optional<components_writer> _components_writer;
 private:
    void prepare_file_writer();
@@ -811,6 +811,10 @@ private:
 public:
    sstable_writer(sstable& sst, const schema& s, uint64_t estimated_partitions,
                   uint64_t max_sstable_size, bool backup, bool leave_unsealed, const io_priority_class& pc);
+    ~sstable_writer();
+    sstable_writer(sstable_writer&& o) : _sst(o._sst), _schema(o._schema), _pc(o._pc), _backup(o._backup),
+            _leave_unsealed(o._leave_unsealed), _compression_enabled(o._compression_enabled), _writer(std::move(o._writer)),
+            _components_writer(std::move(o._components_writer)) {} // o._writer is moved out, so only the new object's destructor can close it
    void consume_new_partition(const dht::decorated_key& dk) { return _components_writer->consume_new_partition(dk); }
    void consume(tombstone t) { _components_writer->consume(t); }
    stop_iteration consume(static_row&& sr) { return _components_writer->consume(std::move(sr)); }
--- a/test.py
+++ b/test.py
@@ -78,6 +78,7 @@ boost_tests = [
    'virtual_reader_test',
    'view_schema_test',
    'counter_test',
+    'cell_locker_test',
 ]

 other_tests = [
--- a/tests/canonical_mutation_test.cc
+++ b/tests/canonical_mutation_test.cc
@@ -55,13 +55,13 @@ SEASTAR_TEST_CASE(test_reading_with_different_schemas) {
            canonical_mutation cm1(m1);
            canonical_mutation cm2(m2);

-            {
+            if (can_upgrade_schema(m1.schema(), m2.schema())) {
                auto m = cm1.to_mutation(m1.schema());
                m.upgrade(m2.schema());
                assert_that(cm1.to_mutation(m2.schema())).is_equal_to(m);
            }

-            {
+            if (can_upgrade_schema(m2.schema(), m1.schema())) {
                auto m = cm2.to_mutation(m2.schema());
                m.upgrade(m1.schema());
                assert_that(cm2.to_mutation(m1.schema())).is_equal_to(m);
--- a/tests/cell_locker_test.cc
+++ b/tests/cell_locker_test.cc
@@ -0,0 +1,218 @@
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "tests/test-utils.hh"
+#include "disk-error-handler.hh"
+
+#include <seastar/core/thread.hh>
+
+#include "cell_locking.hh"
+#include "mutation.hh"
+#include "schema_builder.hh"
+
+thread_local disk_error_signal_type commit_error;
+thread_local disk_error_signal_type general_disk_error;
+
+static schema_ptr make_schema() // table ks.cf with static columns s1..s3 and regular columns r1..r3
+{
+    return schema_builder("ks", "cf")
+            .with_column("pk", bytes_type, column_kind::partition_key)
+            .with_column("ck", bytes_type, column_kind::clustering_key)
+            .with_column("s1", bytes_type, column_kind::static_column)
+            .with_column("s2", bytes_type, column_kind::static_column)
+            .with_column("s3", bytes_type, column_kind::static_column)
+            .with_column("r1", bytes_type)
+            .with_column("r2", bytes_type)
+            .with_column("r3", bytes_type)
+            .build();
+}
+
+static schema_ptr make_alternative_schema() // adds s0/r0 and has s2.5/r2.5 where make_schema() has s2/r2
+{
+    return schema_builder("ks", "cf")
+            .with_column("pk", bytes_type, column_kind::partition_key)
+            .with_column("ck", bytes_type, column_kind::clustering_key)
+            .with_column("s0", bytes_type, column_kind::static_column)
+            .with_column("s1", bytes_type, column_kind::static_column)
+            .with_column("s2.5", bytes_type, column_kind::static_column)
+            .with_column("s3", bytes_type, column_kind::static_column)
+            .with_column("r0", bytes_type)
+            .with_column("r1", bytes_type)
+            .with_column("r2.5", bytes_type)
+            .with_column("r3", bytes_type)
+            .build();
+}
+
+static schema_ptr make_schema_disjoint_with_others() // only s8/s9 and r8/r9: no static or regular column name shared with the other two schemas
+{
+    return schema_builder("ks", "cf")
+            .with_column("pk", bytes_type, column_kind::partition_key)
+            .with_column("ck", bytes_type, column_kind::clustering_key)
+            .with_column("s8", bytes_type, column_kind::static_column)
+            .with_column("s9", bytes_type, column_kind::static_column)
+            .with_column("r8", bytes_type)
+            .with_column("r9", bytes_type)
+            .build();
+}
+
+static data_value empty_value = data_value(to_bytes(""));
+
+static auto make_row(const sstring& key, std::initializer_list<sstring> cells) { // row description for make_mutation(): clustering key value + column names
+    return std::pair<sstring, std::initializer_list<sstring>>(key, cells);
+}
+
+static mutation make_mutation(schema_ptr s, const sstring& pk, std::initializer_list<sstring> static_cells,
+                              std::initializer_list<std::pair<sstring, std::initializer_list<sstring>>> clustering_cells)
+{ // builds a mutation for partition key `pk` in which every listed cell is set to empty_value
+    auto m = mutation(partition_key::from_single_value(*s, to_bytes(pk)), s);
+    for (auto&& c : static_cells) { // c is the name of a static column
+        m.set_static_cell(to_bytes(c), empty_value, api::new_timestamp());
+    }
+    for (auto&& r : clustering_cells) { // r.first: clustering key value, r.second: column names to set in that row
+        auto ck = clustering_key::from_single_value(*s, to_bytes(r.first));
+        for (auto&& c : r.second) {
+            m.set_clustered_cell(ck, to_bytes(c), empty_value, api::new_timestamp());
+        }
+    }
+    return m;
+}
+
+SEASTAR_TEST_CASE(test_simple_locking_cells) { // a second lock on the same cells stays pending while the first is held
+    return seastar::async([&] {
+        auto destroy = [] (auto) { }; // by-value parameter: whatever is moved in is destroyed by the call
+
+        auto s = make_schema();
+        cell_locker cl(s);
+
+        auto m = make_mutation(s, "0", { "s1", "s3" }, {
+            make_row("one", { "r1", "r2" }),
+            make_row("two", { "r2", "r3" }),
+        });
+
+        auto l1 = cl.lock_cells(m.decorated_key(), partition_cells_range(m.partition())).get0();
+        auto f2 = cl.lock_cells(m.decorated_key(), partition_cells_range(m.partition()));
+        BOOST_REQUIRE(!f2.available()); // same cells as l1, so f2 must not be resolved yet
+
+        destroy(std::move(l1)); // drop the first lock ...
+        destroy(f2.get0()); // ... after which waiting on the second one completes
+    });
+}
+
+SEASTAR_TEST_CASE(test_disjoint_mutations) { // locks over non-overlapping cells are all obtained while the others are held
+    return seastar::async([&] {
+        auto s = make_schema();
+        cell_locker cl(s);
+
+        auto m1 = make_mutation(s, "0", { "s1" }, {
+                make_row("one", { "r1", "r2" }),
+                make_row("two", { "r3" }),
+        });
+        auto m2 = make_mutation(s, "0", { "s2" }, { // same partition as m1, but no cell in common with it
+                make_row("two", { "r1", "r2" }),
+                make_row("one", { "r3" }),
+        });
+
+        auto m3 = mutation(partition_key::from_single_value(*s, to_bytes("1")), s); // different partition key ("1") ...
+        m3.partition() = m1.partition(); // ... carrying a copy of m1's cells
+
+        auto l1 = cl.lock_cells(m1.decorated_key(), partition_cells_range(m1.partition())).get0();
+        auto l2 = cl.lock_cells(m2.decorated_key(), partition_cells_range(m2.partition())).get0();
+        auto l3 = cl.lock_cells(m3.decorated_key(), partition_cells_range(m3.partition())).get0();
+    });
+}
+
+SEASTAR_TEST_CASE(test_single_cell_overlap) { // a partial overlap between two mutations is enough to make the later lock wait
+    return seastar::async([&] {
+        auto destroy = [] (auto) { }; // by-value parameter: whatever is moved in is destroyed by the call
+
+        auto s = make_schema();
+        cell_locker cl(s);
+
+        auto m1 = make_mutation(s, "0", { "s1" }, {
+                make_row("one", { "r1", "r2" }),
+                make_row("two", { "r3" }),
+        });
+        auto m2 = make_mutation(s, "0", { "s1" }, { // shares only static cell s1 with m1
+                make_row("two", { "r1", "r2" }),
+                make_row("one", { "r3" }),
+        });
+        auto m3 = make_mutation(s, "0", { "s2" }, { // overlaps m2 in clustered cells (e.g. row "two", r1)
+                make_row("two", { "r1" }),
+                make_row("one", { "r2", "r3" }),
+        });
+
+        auto l1 = cl.lock_cells(m1.decorated_key(), partition_cells_range(m1.partition())).get0();
+        auto f2 = cl.lock_cells(m2.decorated_key(), partition_cells_range(m2.partition()));
+        BOOST_REQUIRE(!f2.available()); // pending while l1 is held
+        destroy(std::move(l1));
+        auto l2 = f2.get0();
+        auto f3 = cl.lock_cells(m3.decorated_key(), partition_cells_range(m3.partition()));
+        BOOST_REQUIRE(!f3.available()); // pending while l2 is held
+        destroy(std::move(l2));
+        auto l3 = f3.get0();
+    });
+}
+
+SEASTAR_TEST_CASE(test_schema_change) { // locks taken under one schema are still honoured after cell_locker::set_schema()
+    return seastar::async([&] {
+        auto destroy = [] (auto) { }; // by-value parameter: whatever is moved in is destroyed by the call
+
+        auto s1 = make_schema();
+        auto s2 = make_alternative_schema();
+        cell_locker cl(s1);
+
+        auto m1 = make_mutation(s1, "0", { "s1", "s2", "s3"}, {
+            make_row("one", { "r1", "r2", "r3" }),
+        });
+
+        // disjoint with m1
+        auto m2 = make_mutation(s2, "0", { "s0", "s2.5"}, {
+                make_row("one", { "r0", "r2.5" }),
+                make_row("two", { "r1", "r3" }),
+        });
+
+        // overlaps with m1
+        auto m3 = make_mutation(s2, "0", { "s1" }, {
+                make_row("one", { "r1", "r3" }),
+        });
+
+        auto l1 = cl.lock_cells(m1.decorated_key(), partition_cells_range(m1.partition())).get0();
+
+        destroy(std::move(m1)); // the mutation and this reference to s1 are dropped while l1 is still held
+        destroy(std::move(s1));
+        cl.set_schema(s2);
+
+        auto l2 = cl.lock_cells(m2.decorated_key(), partition_cells_range(m2.partition())).get0();
+        auto f3 = cl.lock_cells(m3.decorated_key(), partition_cells_range(m3.partition()));
+        BOOST_REQUIRE(!f3.available()); // m3 overlaps cells locked by l1 under the previous schema
+        destroy(std::move(l1));
+        auto l3 = f3.get0();
+
+        auto s3 = make_schema_disjoint_with_others();
+        cl.set_schema(s3); // l2 and l3 are still held at this point
+
+        auto m4 = make_mutation(s3, "0", { "s8", "s9"}, {
+                make_row("one", { "r8", "r9" }),
+                make_row("two", { "r8", "r9" }),
+        });
+        auto l4 = cl.lock_cells(m4.decorated_key(), partition_cells_range(m4.partition())).get0();
+    });
+}
--- a/tests/cql_test_env.cc
+++ b/tests/cql_test_env.cc
@@ -279,7 +279,7 @@ public:
            auto stop_ms = defer([&ms] { ms.stop().get(); });

            auto& ss = service::get_storage_service();
-            ss.start(std::ref(*db));
+            ss.start(std::ref(*db)).get();
            auto stop_storage_service = defer([&ss] { ss.stop().get(); });

            db->start(std::move(*cfg)).get();
--- a/tests/logalloc_test.cc
+++ b/tests/logalloc_test.cc
@@ -29,7 +29,9 @@
 #include <seastar/core/timer.hh>
 #include <seastar/core/sleep.hh>
 #include <seastar/tests/test-utils.hh>
+#include <seastar/util/defer.hh>
 #include <deque>
+#include "utils/phased_barrier.hh"

 #include "utils/logalloc.hh"
 #include "utils/managed_ref.hh"
@@ -529,11 +531,7 @@ inline void quiesce(FutureType&& fut) {
    // a request may be broken into many continuations. While we could just yield many times, the
    // exact amount needed to guarantee execution would be dependent on the internals of the
    // implementation, we want to avoid that.
-    timer<> tmr;
-    tmr.set_callback([] { BOOST_FAIL("The future we were waiting for took too long to get ready"); });
-    tmr.arm(2s);
-    fut.get();
-    tmr.cancel();
+    with_timeout(lowres_clock::now() + 2s, std::move(fut)).get();
 }

 // Simple RAII structure that wraps around a region_group
@@ -859,15 +857,22 @@ class test_reclaimer: public region_group_reclaimer {
    region_group _rg;
    std::vector<size_t> _reclaim_sizes;
    bool _shutdown = false;
+    shared_promise<> _unleash_reclaimer;
+    seastar::gate _reclaimers_done;
 public:
-    virtual void start_reclaiming() override {
-        while (this->under_pressure()) {
-            size_t reclaimed = test_async_reclaim_region::from_region(_rg.get_largest_region()).evict();
-            _result_accumulator->_reclaim_sizes.push_back(reclaimed);
-        }
+    virtual void start_reclaiming() noexcept override {
+        with_gate(_reclaimers_done, [this] {
+            return _unleash_reclaimer.get_shared_future().then([this] {
+                while (this->under_pressure()) {
+                    size_t reclaimed = test_async_reclaim_region::from_region(_rg.get_largest_region()).evict();
+                    _result_accumulator->_reclaim_sizes.push_back(reclaimed);
+                }
+            });
+        });
    }

    ~test_reclaimer() {
+        _reclaimers_done.close().get();
        _rg.shutdown().get();
    }

@@ -881,6 +886,10 @@ public:

    test_reclaimer(size_t threshold) : region_group_reclaimer(threshold), _result_accumulator(this), _rg(*this) {}
    test_reclaimer(test_reclaimer& parent, size_t threshold) : region_group_reclaimer(threshold), _result_accumulator(&parent), _rg(&parent._rg, *this) {}
+
+    void unleash() { // resolves the shared future that start_reclaiming() waits on before evicting
+        _unleash_reclaimer.set_value();
+    }
 };

 SEASTAR_TEST_CASE(test_region_groups_basic_throttling_simple_active_reclaim) {
@@ -888,6 +897,7 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_simple_active_reclaim) {
        // allocate a single region to exhaustion, and make sure active reclaim is activated.
        test_reclaimer simple(logalloc::segment_size);
        test_async_reclaim_region simple_region(simple.rg(), logalloc::segment_size);
+        simple.unleash();

        // Can't run this function until we have reclaimed something
        auto fut = simple.rg().run_when_memory_available([] {});
@@ -912,6 +922,7 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_worst_offen
        test_async_reclaim_region small_region(simple.rg(), logalloc::segment_size);
        test_async_reclaim_region medium_region(simple.rg(), 2 * logalloc::segment_size);
        test_async_reclaim_region big_region(simple.rg(), 3 * logalloc::segment_size);
+        simple.unleash();

        // Can't run this function until we have reclaimed
        auto fut = simple.rg().run_when_memory_available([&simple] {
@@ -941,6 +952,9 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_leaf_offend
        test_async_reclaim_region small_region(small_leaf.rg(), logalloc::segment_size);
        test_async_reclaim_region medium_region(root.rg(), 2 * logalloc::segment_size);
        test_async_reclaim_region big_region(large_leaf.rg(), 3 * logalloc::segment_size);
+        root.unleash();
+        large_leaf.unleash();
+        small_leaf.unleash();

        // Can't run this function until we have reclaimed. Try at the root, and we'll make sure
        // that the leaves are forced correctly.
@@ -967,6 +981,8 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_ancestor_bl
        test_reclaimer leaf(root, logalloc::segment_size);

        test_async_reclaim_region root_region(root.rg(), logalloc::segment_size);
+        root.unleash();
+        leaf.unleash();

        // Can't run this function until we have reclaimed. Try at the leaf, and we'll make sure
        // that the root reclaims
@@ -992,6 +1008,8 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_big_region_
        test_async_reclaim_region root_region(root.rg(), 4 * logalloc::segment_size);
        test_async_reclaim_region big_leaf_region(leaf.rg(), 3 * logalloc::segment_size);
        test_async_reclaim_region small_leaf_region(leaf.rg(), 2 * logalloc::segment_size);
+        root.unleash();
+        leaf.unleash();

        auto fut = root.rg().run_when_memory_available([&root] {
            BOOST_REQUIRE_EQUAL(root.reclaim_sizes().size(), 3);
@@ -1018,6 +1036,8 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_no_double_r
        test_reclaimer leaf(root, logalloc::segment_size);

        test_async_reclaim_region leaf_region(leaf.rg(), logalloc::segment_size);
+        root.unleash();
+        leaf.unleash();

        auto fut_root = root.rg().run_when_memory_available([&root] {
            BOOST_REQUIRE_EQUAL(root.reclaim_sizes().size(), 1);
@@ -1037,3 +1057,117 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_no_double_r
        BOOST_REQUIRE_EQUAL(root.reclaim_sizes()[0], logalloc::segment_size);
    });
 }
+
+// Reproduces issue #2021
+SEASTAR_TEST_CASE(test_no_crash_when_a_lot_of_requests_released_which_change_region_group_size) {
+    return seastar::async([] {
+#ifndef DEFAULT_ALLOCATOR // Because we need memory::stats().free_memory();
+        logging::logger_registry().set_logger_level("lsa", seastar::log_level::debug);
+
+        auto free_space = memory::stats().free_memory();
+        size_t threshold = size_t(0.75 * free_space); // the same value is passed for both reclaimer thresholds
+        region_group_reclaimer recl(threshold, threshold);
+        region_group gr(recl);
+        auto close_gr = defer([&gr] { gr.shutdown().get(); });
+        region r(gr);
+
+        with_allocator(r.allocator(), [&] {
+            std::vector<managed_bytes> objs;
+
+            r.make_evictable([&] { // eviction callback: drop one object per call
+                if (objs.empty()) {
+                    return memory::reclaiming_result::reclaimed_nothing;
+                }
+                with_allocator(r.allocator(), [&] {
+                    objs.pop_back();
+                });
+                return memory::reclaiming_result::reclaimed_something;
+            });
+
+            auto fill_to_pressure = [&] { // allocate 1024-byte objects until the reclaimer reports pressure
+                while (!recl.under_pressure()) {
+                    objs.emplace_back(managed_bytes(managed_bytes::initialized_later(), 1024));
+                }
+            };
+
+            utils::phased_barrier request_barrier;
+            auto wait_for_requests = defer([&] { request_barrier.advance_and_await().get(); }); // on scope exit, wait for the operations started below
+
+            for (int i = 0; i < 1000000; ++i) {
+                fill_to_pressure();
+                future<> f = gr.run_when_memory_available([&, op = request_barrier.start()] {
+                    // Trigger group size change (Refs issue #2021)
+                    gr.update(-10);
+                    gr.update(+10);
+                });
+                BOOST_REQUIRE(!f.available()); // the group is under pressure, so the request must not have run yet
+            }
+
+            // Release
+            while (recl.under_pressure()) {
+                objs.pop_back();
+            }
+        });
+#endif
+    });
+}
+
+SEASTAR_TEST_CASE(test_reclaiming_runs_as_long_as_there_is_soft_pressure) { // reclaiming starts at the soft limit and stops only once back below it
+    return seastar::async([] {
+        size_t hard_threshold = logalloc::segment_size * 8;
+        size_t soft_threshold = hard_threshold / 2;
+
+        class reclaimer : public region_group_reclaimer { // records the most recent start/stop notification in _reclaim
+            bool _reclaim = false;
+        protected:
+            void start_reclaiming() noexcept override {
+                _reclaim = true;
+            }
+
+            void stop_reclaiming() noexcept override {
+                _reclaim = false;
+            }
+        public:
+            reclaimer(size_t hard_threshold, size_t soft_threshold)
+                : region_group_reclaimer(hard_threshold, soft_threshold)
+            { }
+            bool reclaiming() const { return _reclaim; };
+        };
+
+        reclaimer recl(hard_threshold, soft_threshold);
+        region_group gr(recl);
+        auto close_gr = defer([&gr] { gr.shutdown().get(); });
+        region r(gr);
+
+        with_allocator(r.allocator(), [&] {
+            std::vector<managed_bytes> objs;
+
+            BOOST_REQUIRE(!recl.reclaiming()); // nothing allocated yet
+
+            while (!recl.over_soft_limit()) { // allocate until the soft limit is crossed
+                objs.emplace_back(managed_bytes(managed_bytes::initialized_later(), logalloc::segment_size));
+            }
+
+            BOOST_REQUIRE(recl.reclaiming()); // start_reclaiming() must have been called
+
+            while (!recl.under_pressure()) { // keep allocating until under_pressure() reports true
+                objs.emplace_back(managed_bytes(managed_bytes::initialized_later(), logalloc::segment_size));
+            }
+
+            BOOST_REQUIRE(recl.reclaiming());
+
+            while (recl.under_pressure()) { // free until under_pressure() is false again
+                objs.pop_back();
+            }
+
+            BOOST_REQUIRE(recl.over_soft_limit()); // still above the soft limit ...
+            BOOST_REQUIRE(recl.reclaiming()); // ... so stop_reclaiming() must not have been called yet
+
+            while (recl.over_soft_limit()) { // free until back below the soft limit
+                objs.pop_back();
+            }
+
+            BOOST_REQUIRE(!recl.reclaiming()); // stop_reclaiming() must have been called
+        });
+    });
+}
--- a/tests/lsa_async_eviction_test.cc
+++ b/tests/lsa_async_eviction_test.cc
@@ -76,13 +76,16 @@ int main(int argc, char** argv) {
                });

                uint64_t counter = 0;
+                logalloc::allocating_section alloc_sect;
+                alloc_sect.set_lsa_reserve(0);
+                alloc_sect.set_std_reserve(0);

                while (counter < obj_count) {
-                    auto obj = managed_bytes(managed_bytes::initialized_later(), obj_size);
-                    {
+                    alloc_sect(r, [&] {
+                        auto obj = managed_bytes(managed_bytes::initialized_later(), obj_size);
                        logalloc::reclaim_lock l(r);
                        refs.push_back(std::move(obj));
-                    }
+                    });

                    ++counter;

--- a/tests/mutation_source_test.cc
+++ b/tests/mutation_source_test.cc
@@ -191,7 +191,6 @@ static mutation_sets generate_mutation_sets() {
                .with_column("ck_col_2", bytes_type, column_kind::clustering_key)
                .with_column("regular_col_1", bytes_type)
                .with_column("regular_col_2", bytes_type)
-                .with_column("regular_counter_col_1", counter_type)
                .with_column("static_col_1", bytes_type, column_kind::static_column)
                .with_column("static_col_2", bytes_type, column_kind::static_column);

@@ -300,9 +299,20 @@ static mutation_sets generate_mutation_sets() {
        }
    }

+    static constexpr auto rmg_iterations = 10;
+
    {
-        random_mutation_generator gen;
-        for (int i = 0; i < 10; ++i) {
+        random_mutation_generator gen(random_mutation_generator::generate_counters::no);
+        for (int i = 0; i < rmg_iterations; ++i) {
+            auto m = gen();
+            result.unequal.emplace_back(mutations{m, gen()}); // collision unlikely
+            result.equal.emplace_back(mutations{m, m});
+        }
+    }
+
+    {
+        random_mutation_generator gen(random_mutation_generator::generate_counters::yes);
+        for (int i = 0; i < rmg_iterations; ++i) {
            auto m = gen();
            result.unequal.emplace_back(mutations{m, gen()}); // collision unlikely
            result.equal.emplace_back(mutations{m, m});
@@ -364,6 +374,7 @@ bytes make_blob(size_t blob_size) {

 class random_mutation_generator::impl {
    friend class random_mutation_generator;
+    generate_counters _generate_counters;
    const size_t _external_blob_size = 128; // Should be enough to force use of external bytes storage
    const column_id column_count = row::max_vector_size * 2;
    std::mt19937 _gen;
@@ -375,30 +386,33 @@ class random_mutation_generator::impl {
        return gc_clock::time_point() + std::chrono::seconds(dist(gen));
    }

-public:
-    schema_ptr make_schema() {
+    schema_ptr do_make_schema(data_type type) {
        auto builder = schema_builder("ks", "cf")
                .with_column("pk", bytes_type, column_kind::partition_key)
                .with_column("ck1", bytes_type, column_kind::clustering_key)
-                .with_column("ck2", bytes_type, column_kind::clustering_key)
-                .with_column("c1", counter_type);
+                .with_column("ck2", bytes_type, column_kind::clustering_key);

        // Create enough columns so that row can overflow its vector storage
        for (column_id i = 0; i < column_count; ++i) {
            {
                auto column_name = sprint("v%d", i);
-                builder.with_column(to_bytes(column_name), bytes_type, column_kind::regular_column);
+                builder.with_column(to_bytes(column_name), type, column_kind::regular_column);
            }
            {
                auto column_name = sprint("s%d", i);
-                builder.with_column(to_bytes(column_name), bytes_type, column_kind::static_column);
+                builder.with_column(to_bytes(column_name), type, column_kind::static_column);
            }
        }

        return builder.build();
    }

-    impl() {
+    schema_ptr make_schema() { // picks the column type handed to do_make_schema() from _generate_counters
+        return _generate_counters ? do_make_schema(counter_type)
+                                  : do_make_schema(bytes_type);
+    }
+public:
+    explicit impl(generate_counters counters) : _generate_counters(counters) {
        _schema = make_schema();

        for (int i = 0; i < 1024; ++i) {
@@ -424,8 +438,6 @@ public:
        auto pkey = partition_key::from_single_value(*_schema, _blobs[0]);
        mutation m(pkey, _schema);

-        auto& counter_column = *_schema->get_column_definition(utf8_type->decompose(sstring("c1")));
-
        std::map<counter_id, std::set<int64_t>> counter_used_clock_values;
        std::vector<counter_id> counter_ids;
        std::generate_n(std::back_inserter(counter_ids), 8, counter_id::generate_random);
@@ -459,16 +471,16 @@ public:
            auto columns_to_set = column_count_dist(_gen);
            for (column_id i = 0; i < columns_to_set; ++i) {
                auto cid = column_id_dist(_gen);
-                if (kind == column_kind::regular_column && cid == counter_column.id) {
-                    auto cell = bool_dist(_gen)
-                                ? random_counter_cell()
-                                : atomic_cell::make_dead(timestamp_dist(_gen), expiry_dist(_gen));
-                    r.apply(_schema->column_at(kind, cid), std::move(cell));
-                    continue;
-                }
+                auto get_live_cell = [&] {
+                    if (_generate_counters) {
+                        return random_counter_cell();
+                    } else {
+                        return atomic_cell::make_live(timestamp_dist(_gen), _blobs[value_blob_index_dist(_gen)]);
+                    }
+                };
                // FIXME: generate expiring cells
                auto cell = bool_dist(_gen)
-                            ? atomic_cell::make_live(timestamp_dist(_gen), _blobs[value_blob_index_dist(_gen)])
+                            ? get_live_cell()
                            : atomic_cell::make_dead(timestamp_dist(_gen), expiry_dist(_gen));
                r.apply(_schema->column_at(kind, cid), std::move(cell));
            }
@@ -529,8 +541,8 @@ public:

 random_mutation_generator::~random_mutation_generator() {}

-random_mutation_generator::random_mutation_generator()
-    : _impl(std::make_unique<random_mutation_generator::impl>())
+random_mutation_generator::random_mutation_generator(generate_counters counters)
+    : _impl(std::make_unique<random_mutation_generator::impl>(counters))
 { }

 mutation random_mutation_generator::operator()() {
--- a/tests/mutation_source_test.hh
+++ b/tests/mutation_source_test.hh
@@ -37,11 +37,19 @@ void for_each_mutation_pair(std::function<void(const mutation&, const mutation&,
 // Calls the provided function on mutations. Is supposed to exercise as many differences as possible.
 void for_each_mutation(std::function<void(const mutation&)>);

+// Returns true if mutations in schema `from` can be upgraded to schema `to`; here that means both agree on is_counter().
+inline bool can_upgrade_schema(schema_ptr from, schema_ptr to) {
+    return from->is_counter() == to->is_counter();
+}
+
 class random_mutation_generator {
    class impl;
    std::unique_ptr<impl> _impl;
 public:
-    random_mutation_generator();
+    struct generate_counters_tag { };
+    using generate_counters = bool_class<generate_counters_tag>;
+
+    explicit random_mutation_generator(generate_counters);
    ~random_mutation_generator();
    mutation operator()();
    schema_ptr schema() const;
--- a/tests/mutation_test.cc
+++ b/tests/mutation_test.cc
@@ -795,8 +795,7 @@ public:
 };

 SEASTAR_TEST_CASE(test_apply_is_atomic_in_case_of_allocation_failures) {
-    random_mutation_generator gen;
-
+  auto do_test = [] (auto&& gen) {
    failure_injecting_allocation_strategy alloc(standard_allocator());
    with_allocator(alloc, [&] {
        auto target = gen();
@@ -857,7 +856,10 @@ SEASTAR_TEST_CASE(test_apply_is_atomic_in_case_of_allocation_failures) {
            }
        }
    });
+  };

+    do_test(random_mutation_generator(random_mutation_generator::generate_counters::no));
+    do_test(random_mutation_generator(random_mutation_generator::generate_counters::yes));
    return make_ready_future<>();
 }

--- a/tracing/trace_keyspace_helper.cc
+++ b/tracing/trace_keyspace_helper.cc
@@ -238,7 +238,8 @@ future<> trace_keyspace_helper::start() {
                std::map<sstring, sstring> opts;
                opts["replication_factor"] = "2";
                auto ksm = keyspace_metadata::new_keyspace(KEYSPACE_NAME, "org.apache.cassandra.locator.SimpleStrategy", std::move(opts), true);
-                service::get_local_migration_manager().announce_new_keyspace(ksm, false).get();
+                // We use min_timestamp so that default keyspace metadata will lose to any manual adjustments. See issue #2129.
+                service::get_local_migration_manager().announce_new_keyspace(ksm, api::min_timestamp, false).get();
            }

            // Create tables
--- a/transport/server.cc
+++ b/transport/server.cc
@@ -655,9 +655,9 @@ future<> cql_server::connection::process_request() {
                auto bv = bytes_view{reinterpret_cast<const int8_t*>(buf.begin()), buf.size()};
                auto cpu = pick_request_cpu();
                return smp::submit_to(cpu, [this, bv = std::move(bv), op, stream, client_state = _client_state, tracing_requested] () mutable {
-                    return this->process_request_one(bv, op, stream, std::move(client_state), tracing_requested).then([](auto&& response) {
+                    return this->process_request_one(bv, op, stream, std::move(client_state), tracing_requested).then([tracing_requested] (auto&& response) {
                        auto& tracing_session_id_ptr = response.second.tracing_session_id_ptr();
-                        if (tracing_session_id_ptr) {
+                        if (tracing_requested == tracing_request_type::write_on_close && tracing_session_id_ptr) {
                            response.first->set_tracing_id(*tracing_session_id_ptr);
                        }
                        return std::make_pair(make_foreign(response.first), response.second);
--- a/types.hh
+++ b/types.hh
@@ -1166,6 +1166,18 @@ shared_ptr<const abstract_type> data_type_for<bool>() {
    return boolean_type;
 }

+template <>
+inline
+shared_ptr<const abstract_type> data_type_for<float>() {
+    return float_type;
+}
+
+template <>
+inline
+shared_ptr<const abstract_type> data_type_for<double>() {
+    return double_type;
+}
+
 namespace std {

 template <>
--- a/utils/logalloc.cc
+++ b/utils/logalloc.cc
@@ -2065,56 +2065,6 @@ uint64_t region_group::top_region_evictable_space() const {
    return _regions.empty() ? 0 : _regions.top()->evictable_occupancy().total_space();
 }

-void region_group::release_requests() noexcept {
-    // The later() statement is here  to avoid executing the function in update() context. But
-    // also guarantees that we won't dominate the CPU if we have many requests to release.
-    //
-    // However, both with_gate() and later() can ultimately call to schedule() and consequently
-    // allocate memory, which (if that allocation triggers a compaction - that frees memory) would
-    // defeat the very purpose of not executing this on update() context. Allocations should be rare
-    // on those but can happen, so we need to at least make sure they will not reclaim.
-    //
-    // Whatever comes after later() is already in a safe context, so we don't need to keep the lock
-    // alive until we are done with the whole execution - only until later is successfully executed.
-    tracker_reclaimer_lock rl;
-
-    _reclaimer.notify_relief();
-    if (_descendant_blocked_requests) {
-        _descendant_blocked_requests->set_value();
-    }
-    _descendant_blocked_requests = {};
-
-    if (_blocked_requests.empty()) {
-        return;
-    }
-
-    with_gate(_asynchronous_gate, [this, rl = std::move(rl)] () mutable {
-        return later().then([this] {
-            // Check again, we may have executed release_requests() in this mean time from another entry
-            // point (for instance, a descendant notification)
-            if (_blocked_requests.empty()) {
-                return;
-            }
-
-            auto blocked_at = do_for_each_parent(this, [] (auto rg) {
-                return rg->execution_permitted() ? stop_iteration::no : stop_iteration::yes;
-            });
-
-            if (!blocked_at) {
-                auto req = std::move(_blocked_requests.front());
-                _blocked_requests.pop_front();
-                req->allocate();
-                release_requests();
-            } else {
-                // If someone blocked us in the mean time then we can't execute. We need to make
-                // sure that we are listening to notifications, though. It could be that we used to
-                // be blocked on ourselves and now we are blocking on an ancestor
-                subscribe_for_ancestor_available_memory_notification(blocked_at);
-            }
-        });
-    });
-}
-
 region* region_group::get_largest_region() {
    if (!_maximal_rg || _maximal_rg->_regions.empty()) {
        return nullptr;
@@ -2148,6 +2098,88 @@ region_group::del(region_impl* child) {
    update(-child->occupancy().total_space());
 }

+bool
+region_group::execution_permitted() noexcept {
+    return do_for_each_parent(this, [] (auto rg) {
+        return rg->under_pressure() ? stop_iteration::yes : stop_iteration::no;
+    }) == nullptr;
+}
+
+future<>
+region_group::start_releaser() {
+    return later().then([this] {
+        return repeat([this] () noexcept {
+            if (_shutdown_requested) {
+                return make_ready_future<stop_iteration>(stop_iteration::yes);
+            }
+
+            if (!_blocked_requests.empty() && execution_permitted()) {
+                auto req = std::move(_blocked_requests.front());
+                _blocked_requests.pop_front();
+                req->allocate();
+                return make_ready_future<stop_iteration>(stop_iteration::no);
+            } else {
+                // Block reclaiming to prevent signal() from being called by reclaimer inside wait()
+                // FIXME: handle allocation failures (not very likely) like allocating_section does
+                tracker_reclaimer_lock rl;
+                return _relief.wait().then([] {
+                    return stop_iteration::no;
+                });
+            }
+        });
+    });
+}
+
+region_group::region_group(region_group *parent, region_group_reclaimer& reclaimer)
+    : _parent(parent)
+    , _reclaimer(reclaimer)
+    , _releaser(reclaimer_can_block() ? start_releaser() : make_ready_future<>())
+{
+    if (_parent) {
+        _parent->add(this);
+    }
+}
+
+bool region_group::reclaimer_can_block() const {
+    return _reclaimer.throttle_threshold() != std::numeric_limits<size_t>::max();
+}
+
+void region_group::notify_relief() {
+    _relief.signal();
+    for (region_group* child : _subgroups) {
+        child->notify_relief();
+    }
+}
+
+void region_group::update(ssize_t delta) {
+    // Most-enclosing group which was relieved.
+    region_group* top_relief = nullptr;
+
+    do_for_each_parent(this, [&top_relief, delta] (region_group* rg) mutable {
+        rg->update_maximal_rg();
+        rg->_total_memory += delta;
+
+        if (rg->_total_memory >= rg->_reclaimer.soft_limit_threshold()) {
+            rg->_reclaimer.notify_soft_pressure();
+        } else {
+            rg->_reclaimer.notify_soft_relief();
+        }
+
+        if (rg->_total_memory > rg->_reclaimer.throttle_threshold()) {
+            rg->_reclaimer.notify_pressure();
+        } else if (rg->_reclaimer.under_pressure()) {
+            rg->_reclaimer.notify_relief();
+            top_relief = rg;
+        }
+
+        return stop_iteration::no;
+    });
+
+    if (top_relief) {
+        top_relief->notify_relief();
+    }
+}
+
 allocating_section::guard::guard()
    : _prev(shard_segment_pool.emergency_reserve_max())
 { }
@@ -2196,6 +2228,14 @@ void allocating_section::on_alloc_failure() {

 #endif

+void allocating_section::set_lsa_reserve(size_t reserve) {
+    _lsa_reserve = reserve;
+}
+
+void allocating_section::set_std_reserve(size_t reserve) {
+    _std_reserve = reserve;
+}
+
 void region_group::on_request_expiry::operator()(std::unique_ptr<allocating_function>& func) noexcept {
    func->fail(std::make_exception_ptr(timed_out_error()));
 }
--- a/utils/logalloc.hh
+++ b/utils/logalloc.hh
@@ -64,8 +64,20 @@ protected:
    size_t _soft_limit;
    bool _under_pressure = false;
    bool _under_soft_pressure = false;
-    virtual void start_reclaiming() {}
-    virtual void stop_reclaiming() {}
+    // The following restrictions apply to implementations of start_reclaiming() and stop_reclaiming():
+    //
+    //  - must not use any region or region_group objects, because they're invoked synchronously
+    //    with operations on those.
+    //
+    //  - must be noexcept, because they're called on the free path.
+    //
+    //  - the implementation may be called synchronously with any operation
+    //    which allocates memory, because these are called by memory reclaimer.
+    //    In particular, the implementation should not depend on memory allocation
+    //    because that may fail when in reclaiming context.
+    //
+    virtual void start_reclaiming() noexcept {}
+    virtual void stop_reclaiming() noexcept {}
 public:
    bool under_pressure() const {
        return _under_pressure;
@@ -75,32 +87,26 @@ public:
        return _under_soft_pressure;
    }

-    void notify_soft_pressure() {
+    void notify_soft_pressure() noexcept {
        if (!_under_soft_pressure) {
            _under_soft_pressure = true;
            start_reclaiming();
        }
    }

-    void notify_soft_relief() {
+    void notify_soft_relief() noexcept {
        if (_under_soft_pressure) {
            _under_soft_pressure = false;
            stop_reclaiming();
        }
    }

-    void notify_pressure() {
-        if (!_under_pressure) {
-            _under_pressure = true;
-            start_reclaiming();
-        }
+    void notify_pressure() noexcept {
+        _under_pressure = true;
    }

-    void notify_relief() {
-        if (_under_pressure) {
-            _under_pressure = false;
-            stop_reclaiming();
-        }
+    void notify_relief() noexcept {
+        _under_pressure = false;
    }

    region_group_reclaimer()
@@ -108,7 +114,9 @@ public:
    region_group_reclaimer(size_t threshold)
        : _threshold(threshold), _soft_limit(threshold) {}
    region_group_reclaimer(size_t threshold, size_t soft)
-        : _threshold(threshold), _soft_limit(soft) {}
+        : _threshold(threshold), _soft_limit(soft) {
+        assert(_soft_limit <= _threshold);
+    }

    virtual ~region_group_reclaimer() {}

@@ -229,9 +237,13 @@ class region_group {
    // a different ancestor)
    std::experimental::optional<shared_promise<>> _descendant_blocked_requests = {};

-    region_group* _waiting_on_ancestor = nullptr;
-    seastar::gate _asynchronous_gate;
+    condition_variable _relief;
+    future<> _releaser;
    bool _shutdown_requested = false;
+
+    bool reclaimer_can_block() const;
+    future<> start_releaser();
+    void notify_relief();
 public:
    // When creating a region_group, one can specify an optional throttle_threshold parameter. This
    // parameter won't affect normal allocations, but an API is provided, through the region_group's
@@ -239,17 +251,13 @@ public:
    // the total memory for the region group (and all of its parents) is lower or equal to the
    // region_group's throttle_treshold (and respectively for its parents).
    region_group(region_group_reclaimer& reclaimer = no_reclaimer) : region_group(nullptr, reclaimer) {}
-    region_group(region_group* parent, region_group_reclaimer& reclaimer = no_reclaimer) : _parent(parent), _reclaimer(reclaimer) {
-        if (_parent) {
-            _parent->add(this);
-        }
-    }
+    region_group(region_group* parent, region_group_reclaimer& reclaimer = no_reclaimer);
    region_group(region_group&& o) = delete;
    region_group(const region_group&) = delete;
    ~region_group() {
        // If we set a throttle threshold, we'd be postponing many operations. So shutdown must be
        // called.
-        if (_reclaimer.throttle_threshold() != std::numeric_limits<size_t>::max()) {
+        if (reclaimer_can_block()) {
            assert(_shutdown_requested);
        }
        if (_parent) {
@@ -261,24 +269,7 @@ public:
    size_t memory_used() const {
        return _total_memory;
    }
-    void update(ssize_t delta) {
-        do_for_each_parent(this, [delta] (auto rg) mutable {
-            rg->update_maximal_rg();
-            rg->_total_memory += delta;
-            // It is okay to call release_requests for a region_group that can't allow execution.
-            // But that can generate various spurious messages to groups waiting on us that will be
-            // then woken up just so they can go to wait again. So let's filter that.
-            if (rg->execution_permitted()) {
-                rg->release_requests();
-            }
-            if (rg->_total_memory >= rg->_reclaimer.soft_limit_threshold()) {
-                rg->_reclaimer.notify_soft_pressure();
-            } else if (rg->_total_memory < rg->_reclaimer.soft_limit_threshold()) {
-                rg->_reclaimer.notify_soft_relief();
-            }
-            return stop_iteration::no;
-        });
-    }
+    void update(ssize_t delta);

    // It would be easier to call update, but it is unfortunately broken in boost versions up to at
    // least 1.59.
@@ -324,36 +315,18 @@ public:
        using futurator = futurize<std::result_of_t<Func()>>;

        auto blocked_at = do_for_each_parent(this, [] (auto rg) {
-            return (rg->_blocked_requests.empty() && rg->execution_permitted()) ? stop_iteration::no : stop_iteration::yes;
+            return (rg->_blocked_requests.empty() && !rg->under_pressure()) ? stop_iteration::no : stop_iteration::yes;
        });

        if (!blocked_at) {
            return futurator::apply(func);
        }
-        subscribe_for_ancestor_available_memory_notification(blocked_at);

        auto fn = std::make_unique<concrete_allocating_function<Func>>(std::forward<Func>(func));
        auto fut = fn->get_future();
        _blocked_requests.push_back(std::move(fn), timeout);
        ++_blocked_requests_counter;

-        // This is called here, and not at update(), for two reasons: the first, is that things that
-        // are done during the free() path should be done carefuly, in the sense that they can
-        // trigger another update call and put us in a loop. Not to mention we would like to keep
-        // those from having exceptions. We solve that for release_requests by using later(), but in
-        // here we can do away with that need altogether.
-        //
-        // Second and most important, until we actually block a request, the pressure condition may
-        // very well be transient. There are opportunities for compactions, the condition can go
-        // away on its own, etc.
-        //
-        // The reason we check execution permitted(), is that we'll still block requests if we have
-        // free memory but existing requests in the queue. That is so we can keep our FIFO ordering
-        // guarantee. So we need to distinguish here the case in which we're blocking merely to
-        // serialize requests, so that the caller does not evict more than it should.
-        if (!blocked_at->execution_permitted()) {
-            blocked_at->_reclaimer.notify_pressure();
-        }
        return fut;
    }

@@ -363,9 +336,11 @@ public:
    region* get_largest_region();

    // Shutdown is mandatory for every user who has set a threshold
+    // Can be called at most once.
    future<> shutdown() {
        _shutdown_requested = true;
-        return _asynchronous_gate.close();
+        _relief.signal();
+        return std::move(_releaser);
    }

    size_t blocked_requests() {
@@ -376,43 +351,9 @@ public:
        return _blocked_requests_counter;
    }
 private:
-    // Make sure we get a notification and can call release_requests when one of our ancestors that
-    // used to block us is no longer under memory pressure.
-    void subscribe_for_ancestor_available_memory_notification(region_group *ancestor) {
-        if ((this == ancestor) || (_waiting_on_ancestor)) {
-            return; // already subscribed, or no need to
-        }
-
-        _waiting_on_ancestor = ancestor;
-
-        with_gate(_asynchronous_gate, [this] {
-            // We reevaluate _waiting_on_ancestor here so we make sure there is no deferring point
-            // between determining the ancestor and registering with it for a notification. We start
-            // with _waiting_on_ancestor set to the initial value, and after we are notified, we
-            // will set _waiting_on_ancestor to nullptr to force this lambda to reevaluate it.
-            auto evaluate_ancestor_and_stop = [this] {
-                if (!_waiting_on_ancestor) {
-                    auto new_blocking_point = do_for_each_parent(this, [] (auto rg) {
-                        return (rg->execution_permitted()) ? stop_iteration::no : stop_iteration::yes;
-                    });
-                    if (!new_blocking_point) {
-                        release_requests();
-                    }
-                    _waiting_on_ancestor = (new_blocking_point == this) ? nullptr : new_blocking_point;
-                }
-                return _waiting_on_ancestor == nullptr;
-            };
-
-            return do_until(evaluate_ancestor_and_stop, [this] {
-                if (!_waiting_on_ancestor->_descendant_blocked_requests) {
-                    _waiting_on_ancestor->_descendant_blocked_requests = shared_promise<>();
-                }
-                return _waiting_on_ancestor->_descendant_blocked_requests->get_shared_future().then([this] {
-                    _waiting_on_ancestor = nullptr;
-                });
-            });
-        });
-    }
+    // Returns true if and only if constraints of this group are not violated.
+    // That's taking into account any constraints imposed by enclosing (parent) groups.
+    bool execution_permitted() noexcept;

    // Executes the function func for each region_group upwards in the hierarchy, starting with the
    // parameter node. The function func may return stop_iteration::no, in which case it proceeds to
@@ -432,11 +373,10 @@ private:
        }
        return nullptr;
    }
-    inline bool execution_permitted() const {
-        return _total_memory <= _reclaimer.throttle_threshold();
-    }

-    void release_requests() noexcept;
+    inline bool under_pressure() const {
+        return _reclaimer.under_pressure();
+    }

    uint64_t top_region_evictable_space() const;

@@ -687,6 +627,9 @@ private:
    };
    void on_alloc_failure();
 public:
+    void set_lsa_reserve(size_t);
+    void set_std_reserve(size_t);
+
    //
    // Invokes func with reclaim_lock on region r. If LSA allocation fails
    // inside func it is retried after increasing LSA segment reserve. The
				`@@ -0,0 +1 @@`
				`options raid0 devices_discard_performance=Y`