Compare commits

92 Commits

Author SHA1 Message Date
Pekka Enberg
2200cea895 release: prepare for 1.5.2 2017-03-09 21:47:51 +02:00
Asias He
c024dfe093 repair: Fix midpoint is not contained in the split range assertion in split_and_add
We have:

  auto halves = range.split(midpoint, dht::token_comparator());

We saw a case where midpoint == range.start. As a result, range.split
asserts: because range.start is marked non-inclusive, the
midpoint doesn't appear to be contain()ed in the range - hence the
assertion failure.
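The guard this implies can be sketched with toy types (`toy_range` and `split_or_keep` are invented for illustration, not Scylla's actual classes):

```cpp
#include <cassert>
#include <utility>
#include <vector>

// Toy stand-in for a range with an exclusive start bound (hypothetical names).
struct toy_range {
    long start;            // exclusive lower bound
    long end;              // inclusive upper bound
    bool contains(long p) const { return p > start && p <= end; }
    std::pair<toy_range, toy_range> split(long mid) const {
        assert(contains(mid)); // the assertion that fired in the bug report
        return {toy_range{start, mid}, toy_range{mid + 1, end}};
    }
};

// Sketch of the guarded caller: skip the split when the midpoint
// falls on the exclusive start, instead of tripping the assertion.
std::vector<toy_range> split_or_keep(const toy_range& r, long mid) {
    if (!r.contains(mid)) {
        return {r};        // midpoint == r.start: keep the range whole
    }
    auto halves = r.split(mid);
    return {halves.first, halves.second};
}
```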

Fixes #2148

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Signed-off-by: Asias He <asias@scylladb.com>
Message-Id: <93af2697637c28fbca261ddfb8375a790824df65.1489023933.git.asias@scylladb.com>
(cherry picked from commit 39d2e59e7e)
2017-03-09 10:39:19 +02:00
Nadav Har'El
0fff3b60b1 sstable decompression: fix skip() to end of file
The skip() implementation for the compressed file input stream incorrectly
handled the case of skipping to the end of file: In that case we just need
to update the file pointer, but not skip anywhere in the compressed disk
file; in particular, we must NOT call locate() to find the relevant on-disk
compressed chunk, because there is none - locate() can only be called on
actual positions of bytes, not on the one-past-end-of-file position.
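A minimal sketch of the corrected skip() logic, assuming a simplified reader (`toy_compressed_stream`, `locate`, and `skip` here are illustrative stand-ins, not the real sstable code):

```cpp
#include <cassert>
#include <cstdint>
#include <stdexcept>

// A compressed-stream reader must special-case a skip that lands exactly
// at end-of-file, because locate() can only resolve positions of real bytes.
struct toy_compressed_stream {
    uint64_t file_len;     // uncompressed length
    uint64_t pos = 0;
    int locate_calls = 0;

    void locate(uint64_t p) {
        ++locate_calls;
        if (p >= file_len) {
            throw std::out_of_range("no chunk contains one-past-end position");
        }
        // ... find the on-disk compressed chunk covering p ...
    }

    void skip(uint64_t n) {
        uint64_t target = pos + n;
        if (target == file_len) {
            pos = target;  // just move the file pointer; no chunk to locate
            return;
        }
        locate(target);
        pos = target;
    }
};
```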

Fixes #2143

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20170308100057.23316-1-nyh@scylladb.com>
(cherry picked from commit 506e074ba4)
2017-03-08 12:36:08 +02:00
Gleb Natapov
b9fb4442cc memtable: do not open code logalloc::reclaim_lock use
logalloc::reclaim_lock prevents reclaim from running, which may cause a
regular allocation to fail even though there is enough free memory.
To solve that there is allocation_section, which acquires reclaim_lock
and, if the allocation fails, runs the reclaimer outside of the lock and
retries the allocation. This patch uses allocation_section instead of
using reclaim_lock directly in the memtable code.
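The allocation_section pattern described above can be sketched like this (all names are invented stand-ins, not the logalloc API):

```cpp
#include <cassert>

// Toy allocator: reclaim must not run while the lock is held.
struct toy_allocator {
    bool reclaim_locked = false;
    int free_blocks = 0;
    int reclaimable_blocks = 0;

    bool try_alloc() {
        if (free_blocks > 0) { --free_blocks; return true; }
        return false;
    }
    void reclaim() {
        assert(!reclaim_locked);   // reclaimer must run outside the lock
        free_blocks += reclaimable_blocks;
        reclaimable_blocks = 0;
    }
};

// The retry loop that replaces a bare reclaim_lock: allocate under the
// lock; on failure, drop the lock, run the reclaimer, and retry once.
bool with_allocation_section(toy_allocator& a) {
    for (int attempt = 0; attempt < 2; ++attempt) {
        a.reclaim_locked = true;   // reclaim is paused here
        bool ok = a.try_alloc();
        a.reclaim_locked = false;
        if (ok) return true;
        a.reclaim();               // run reclaimer with the lock released
    }
    return false;
}
```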

Fixes #2138.

Message-Id: <20170306160050.GC5902@scylladb.com>
(cherry picked from commit d7bdf16a16)
2017-03-07 11:17:06 +02:00
Tomasz Grabiec
dc20d19f52 db: Fix overflow of gc_clock time point
If query_time is time_point::min(), which is used by
to_data_query_result(), the result of subtraction of
gc_grace_seconds() from query_time will overflow.

I don't think this bug would currently have user-perceivable
effects. This affects which tombstones are dropped, but in the case of
to_data_query_result() uses, tombstones are not present in the final
data query result, and mutation_partition::do_compact() takes
tombstones into consideration while compacting before expiring them.

Fixes the following UBSAN report:

  /usr/include/c++/5.3.1/chrono:399:55: runtime error: signed integer overflow: -2147483648 - 604800 cannot be represented in type 'int'
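One way to avoid the reported wrap is a saturating subtraction; a minimal sketch with a 32-bit-seconds clock modeled on the description above (the actual Scylla fix may differ):

```cpp
#include <cassert>
#include <chrono>
#include <cstdint>

// gc_clock-like types: a signed 32-bit seconds representation, so
// time_point::min() - gc_grace_seconds wraps around (the UBSAN report).
using gc_duration = std::chrono::duration<int32_t>;
using gc_time_point =
    std::chrono::time_point<std::chrono::system_clock, gc_duration>;

// Subtract the grace period without underflowing the 32-bit rep:
// clamp to time_point::min() when the subtraction would overflow.
gc_time_point saturating_subtract(gc_time_point t, gc_duration grace) {
    if (t < gc_time_point::min() + grace) {  // min() + grace cannot overflow
        return gc_time_point::min();
    }
    return t - grace;                        // guarded: no underflow here
}
```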

Message-Id: <1488385429-14276-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 4b6e77e97e)
2017-03-01 18:50:55 +02:00
Avi Kivity
ae830d9c51 Update seastar submodule
* seastar bc44be9...548d67d (1):
  > fix append_challenged_posix_file_impl::process_queue() to handle recursion

Fixes #2121.
2017-02-28 11:26:21 +02:00
Gleb Natapov
3336c1cf97 sstable: close sstable_writer's file if writing of sstable fails.
Failing to close a file properly before destroying file's object causes
crashes.

[tgrabiec: fixed typo]

Message-Id: <20170221144858.GG11471@scylladb.com>
(cherry picked from commit 0977f4fdf8)

Fixes #2122.
2017-02-28 11:24:34 +02:00
Tomasz Grabiec
181162b326 sstables: Fix double close on index and data files when writing fails
File output streams take responsibility for closing the file; they
close the file as part of closing the stream.

During sstable writing we create the sstable object and keep file
references there as well. The sstable object also has responsibility for
closing the files, and does so from sstable::~sstable().

Double close was supposed to be avoided by a construct like this:

  writer.close().get();
  _file = {};

However if close() failed, which can happen when write-ahead failed,
_file would not be cleared, and both the writer and sstable would
close the file. This will result in a crash in
append_challenged_posix_file_impl::close(), which is not prepared to
be closed twice.

Another problem is that if an exception happens before we reach that
construct, we should still close the writer. Currently we don't, so
there's no double close on the file, but that's a bug which needs to
be fixed, and once it is fixed a double close on _file will be even more
likely.

The fix employed here is to not keep files inside sstable object when
writing. As soon as the writer is constructed, it's the only owner of
the file.
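The ownership change can be sketched with toy types (`toy_file` and `toy_writer` are illustrative, not the real classes): once the writer holds the only reference, there is no second owner left to close the file.

```cpp
#include <cassert>
#include <memory>
#include <utility>

// Toy file that counts closes through an external counter; closing
// twice is the crash being fixed.
struct toy_file {
    int* close_count;
    void close() {
        ++*close_count;
        assert(*close_count == 1);
    }
};

// The writer takes exclusive ownership of the file at construction,
// and closes it exactly once, even on the error path (via the dtor).
struct toy_writer {
    std::unique_ptr<toy_file> file;
    explicit toy_writer(std::unique_ptr<toy_file> f) : file(std::move(f)) {}
    void close() {
        if (file) {
            file->close();
            file.reset();
        }
    }
    ~toy_writer() { close(); }
};
```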

Fixes #1764.

Message-Id: <1482428648-22553-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit f2a63270d1)
2017-02-28 11:16:56 +02:00
Shlomi Livne
a00c273b7d dist/redhat : fix backport of scylla.spec.in
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-02-15 23:55:57 +02:00
Takuya ASADA
2fa4aad847 dist/redhat: stop backporting ninja-build from Fedora, install it from EPEL instead
ninja-build-1.6.0-2.fc23.src.rpm was deleted from the Fedora web site
for some reason, but there is ninja-build-1.7.2-2 on EPEL, so we don't
need to backport from Fedora anymore.

Fixes #2087

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1487155729-13257-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 9c8515eeed)
(cherry picked from commit a04722aeb2)
2017-02-15 13:07:40 +02:00
Avi Kivity
a7daa0655c Update seastar submodule
* seastar f4b5be5...bc44be9 (1):
  > prometheus: send one MetricFamily per unique metric name

Fixes #2077.
Fixes #2078.
2017-02-13 16:28:07 +02:00
Shlomi Livne
19c4353607 release: prepare for 1.5.1
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-02-06 17:19:09 +02:00
Avi Kivity
12428fd29e Merge "Avoid avalanche of tasks after memtable flush" from Tomasz
"Before, the logic for releasing writes blocked on dirty worked like this:

  1) When the region group size changes and it is not under pressure and
     there are some requests blocked, schedule a request releasing task

  2) The request releasing task, if there is no pressure, runs one request
     and, if there are still blocked requests, schedules the next request
     releasing task

If requests don't change the size of the region group, then either some request
executes or there is a request releasing task scheduled. The number of scheduled
tasks is at most 1; there is a single releasing thread.

However, if requests themselves change the size of the group, then each
such change would schedule yet another request releasing thread, growing the task
queue size by one.

The group size can also change when memory is reclaimed from the groups (e.g.
when a group contains sparse segments). Compaction may start many request
releasing threads due to group size updates.

Such behavior is detrimental to performance and stability if there are a lot
of blocked requests. This can happen on 1.5 even with modest concurrency
because timed out requests stay in the queue. This is less likely on 1.6, where
they are dropped from the queue.

The releasing of tasks may start to dominate over other processes in the
system. When the number of scheduled tasks reaches 1000, polling stops and the
server becomes unresponsive until all of the released requests are done, which
happens either when they start to block on dirty memory again or when we run
out of blocked requests. It may take a while to reach the pressure condition
after a memtable flush if the flush brings virtual dirty much below the
threshold, which is currently the case for workloads with overwrites producing
sparse regions.

I saw this happening in a write workload from issue #2021 where the number of
request releasing threads grew into thousands.

Fix by ensuring there is at most one request releasing thread at a time. There
will be one releasing fiber per region group, which is woken up when pressure is
lifted. It executes blocked requests until pressure occurs."
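The one-task-at-a-time rule can be sketched with a toy flag (names invented; the real code uses a per-group fiber):

```cpp
#include <cassert>

// Toy region group: size-change notifications only schedule a release
// task if one is not already scheduled or running, so the task queue
// cannot grow by one per size change.
struct toy_region_group {
    bool release_task_active = false;
    int tasks_scheduled = 0;

    void on_size_change_under_no_pressure() {
        if (release_task_active) {
            return;            // the running fiber will pick up the work
        }
        release_task_active = true;
        ++tasks_scheduled;     // schedule exactly one releasing task
    }
    void release_task_done() {
        release_task_active = false;
    }
};
```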

* tag 'tgrabiec/lsa-single-threaded-releasing-v2' of github.com:cloudius-systems/seastar-dev:
  tests: lsa: Add test for reclaimer starting and stopping
  tests: lsa: Add request releasing stress test
  lsa: Avoid avalanche releasing of requests
  lsa: Move definitions to .cc
  lsa: Simplify hard pressure notification management
  lsa: Do not start or stop reclaiming on hard pressure
  tests: lsa: Adjust to take into account that reclaimers are run synchronously
  lsa: Document and annotate reclaimer notification callbacks
  tests: lsa: Use with_timeout() in quiesce()

(cherry picked from commit 7a00dd6985)
2017-02-03 10:16:00 +01:00
Tomasz Grabiec
41b7482ec9 Update seastar submodule
* seastar bd9eda1...f4b5be5 (1):
  > future-util: Introduce with_timeout()
2017-02-03 10:16:00 +01:00
Takuya ASADA
cfeb5c62ba dist/redhat: add python-setuptools as a dependency since scylla-housekeeping requires it
scylla-housekeeping breaks when python-setuptools isn't installed, so
add it as a dependency.

Fixes #1884

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1483525828-7507-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 43655512e1)
2017-01-04 14:34:38 +02:00
Benoît Canet
0529766e00 scylla_setup: Use blkid or ls to list potential block devices
blkid does not list the root raw device.

Revert to lsblk, taking care to have a fallback
path in case the -p option is not supported.

Fixes #1963.

Suggested-by: Avi Kivity <avi@scylladb.com>
Signed-off-by: Benoît Canet <benoit@scylladb.com>
Message-Id: <20161225100204.13297-1-benoit@scylladb.com>
(cherry picked from commit a24ff47c63)
2016-12-27 15:21:18 +02:00
Raphael S. Carvalho
00a25f1698 db: avoid excessive disk usage during sstable resharding
Shared sstables will now be resharded in the same order to guarantee
that all shards owning an sstable will agree on its deletion at nearly
the same time, thereby reducing the disk space requirement.
That's done by picking which column family to reshard in UUID order,
and having each individual column family reshard its shared sstables
in generation order.

Fixes #1952.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <87ff649ed24590c55c00cbb32bffd8fa2743e36e.1482342754.git.raphaelsc@scylladb.com>
(cherry picked from commit 27fb8ec512)
2016-12-27 12:19:37 +02:00
Takuya ASADA
17c6fe8b77 dist/redhat: don't try to adduser when the user already exists
Currently we get "failed adding user 'scylla'" on .rpm installation when the user already exists; we can skip adduser in that case to prevent the error.

Fixes #1958

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1482550075-27939-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit f3e45bc9ef)
2016-12-27 09:47:39 +02:00
Vlad Zolotarov
2228c2edf7 tracing: don't start tracing until a Tracing service is fully initialized
RPC messaging service is initialized before the Tracing service, so
we should prevent creation of tracing spans before the service is
fully initialized.

We will use an already existing "_down" state and extend it in a way
that !_down equals "started", where "started" is TRUE when the local
service is fully initialized.

We will also split the Tracing service initialization into two parts:
   1) Initialize the sharded object.
   2) Start the tracing service:
      - Create the I/O backend service.
      - Enable tracing.

Fixes issue #1939

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1482424317-6665-1-git-send-email-vladz@scylladb.com>
2016-12-22 19:40:04 +02:00
Glauber Costa
78f2f50f09 track streaming and system virtual dirty memory
A case could be made that we should have counters for them no matter
what, since it can help us reason about the distribution of memory among
the groups. But with the hierarchy being broken in 1.5 it becomes even
more important: as things stand, by looking solely at dirty, we have no
idea how much memory we are using in those groups.

After this patch, the dirty_memory_manager will register its metrics
for the 3 groups that we have, and the legacy names will be used to show
totals.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <0d04ca4c7e8472097f16a5dc950b77c73766049e.1481831644.git.glauber@scylladb.com>
(cherry picked from commit 7133583797)
2016-12-22 13:51:22 +02:00
Tomasz Grabiec
eaaabcb5d6 tests: Remove unintentional enablement of trace-level logging
Sneaked in by mistake.

(cherry picked from commit c9344826e9)
2016-12-21 15:38:51 +01:00
Pekka Enberg
654919cbf1 release: prepare for 1.5.0 2016-12-21 12:12:11 +02:00
Tomasz Grabiec
0d0e53c524 tests: commitlog: Fix assumption about write visibility
The test assumed that mutations added to the commitlog are visible to
reads as soon as a new segment is opened. That's not true, because
buffers are written back in the background, and a new segment may be
active while the previous one is still being written or not yet
synced.

Fix the test so that it expects the number of mutations read this way
to be <= the number of mutations added, and that after all segments
are synced the numbers are equal.

Message-Id: <1481630481-19395-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit fe6a70dba1)
2016-12-20 20:08:48 +01:00
Glauber Costa
99d9b4e727 commitlog: correctly report requests blocked
The semaphore future may be unavailable for many reasons. Specifically,
if the task quota is depleted right between sem.wait() and the .then()
clause in get_units(), the resulting future won't be available.

That is particularly visible if we decrease the task quota, since those
events will be more frequent: in those cases we can clearly see this
counter going up, even though there aren't more requests pending than
usual.

This patch improves the situation by replacing that check. We now verify
whether or not there are waiters in the semaphore.
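The replaced check can be sketched with a toy semaphore (invented names; the real code uses seastar's semaphore): a request counts as blocked only when it actually waits, not when its future merely isn't ready yet.

```cpp
#include <cassert>

// Toy semaphore that tracks how many callers are genuinely waiting.
struct toy_semaphore {
    int units;
    int nr_waiters = 0;

    bool try_wait() {
        if (units > 0) { --units; return true; }
        ++nr_waiters;          // the caller is genuinely blocked
        return false;
    }
    int waiters() const { return nr_waiters; }
};

// Old check: "acquisition future not ready" => blocked, which gives
// false positives when the task quota merely delayed the continuation.
// New check: ask the semaphore whether anyone is actually waiting.
bool request_blocked(const toy_semaphore& sem) {
    return sem.waiters() > 0;
}
```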

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <113c0d6b43cd6653ce972541baf6920e5765546b.1481222621.git.glauber@scylladb.com>
(cherry picked from commit 9b5e6d6bd8)
2016-12-19 15:26:35 +01:00
Pekka Enberg
e2790748e6 release: prepare for 1.5.rc3 2016-12-18 11:14:09 +02:00
Tomasz Grabiec
e82324fb82 Merge branch 'virtual-dirty-fixes-1.5-backport' from git@github.com:glommer/scylla.git into branch-1.5
Rework dirty memory hierarchy from Glauber.
2016-12-16 19:48:08 +01:00
Glauber Costa
1ae62678e9 config: get rid of memtable_total_space
Those values are now statically set.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit 2aa6514667)
Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-12-16 10:56:34 -05:00
Glauber Costa
09a463fd31 database: rework dirty memory hierarchy
Issue #1918 describes a problem, in which we are generating smaller
memtables than we could, and therefore not respecting the flush
criteria.

That happens because group sizes (and limits) are hierarchical for
pressure purposes, and the soft threshold is currently at 40%. This causes
the system group's soft threshold to be way below regular's virtual dirty
limit and close to the regular group's soft threshold. The system group was
very likely to come under soft pressure when regular was, because writes to
the regular group are not yet throttled when they cross both soft
thresholds.

This is a direct consequence of the linear hierarchy between the regions,
and to guarantee that it won't happen we would have to acquire the semaphore
of all ancestor regions when flushing from a child region. While that
works, it can lead to problems of its own, like priority inversion if
the regions have different priorities - like streaming and regular - and
groups lower in the hierarchy, like user, blocking explicit flushes
from their ancestors.

To fix that, this patch reorganizes the dirty memory region groups so
that groups are now completely independent. As a disadvantage, when
streaming happens we will draw some memory from the cache, but we will
live with it for the time being.

Fixes #1918

[ glauber: fix conflicts in memtable.cc due to lack of graceful clear ]

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit 80440c0d79)
Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-12-16 10:55:57 -05:00
Glauber Costa
347136380d system keyspace: write batchlog mutation in user memory
Batchlog is a potentially memory-intensive table whose workload is
driven by user needs, not system's. Move it to the user dirty memory
manager.

[ glauber: fix conflict with virtual readers ]

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit db7cc3cba8)
Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-12-16 10:54:47 -05:00
Glauber Costa
8680174f37 database: remove friendship declaration
Not needed anymore since memtable started having a direct pointer to the
memtable list.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit 2e8c7d2c62)
Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-12-16 10:53:40 -05:00
Glauber Costa
261b67f4f5 database: simplify flush_one
flush_one has to make sure that we're using the correct
dirty_memory_manager object, because we could be flushing from a region
group different from the one the flush request originated from.

It's simpler to just assume flush_one will be dealing with the right
object, and use a different object instead of "this" when calling it.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit bb1509c21e)
Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-12-16 10:53:33 -05:00
Glauber Costa
bb173e3e2a database: make memtable_list aware of cases in which it can't flush
Some of our CFs can't be flushed: those are the ones that are not marked
as having durable writes. We treat them just the same from the point of
view of the flush logic, but they provide a function that doesn't do
anything and just returns right away.

We already had troubles with that in the past, and that also poses a
problem for an upcoming patch reworking the flush memtable pick
criteria.

It's easier, simpler, and cleaner to just make the memtable_list aware
that it can't flush. Achieving that is also not very complicated: we just
need a special constructor that doesn't take a seal function, and then we
make sure that it is initialized to an empty std::function.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit 8ab7c04caa)
Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-12-16 10:53:24 -05:00
Glauber Costa
9688dca861 database: move reversion of virtual dirty state closer to update_cache.
When we finish writing a memtable, we revert the dirty memory charges
immediately. When we do that, dirty memory will grow back to what it
was, and soon (we hope) will go down again when we release the requests
for real.

During that time, we may not accept new requests. Sealing can take a
long time, especially in the face of Linux issues like the ones we have
seen in the past. It will also take proportionally more time if the
SSTables end up being small, which is a possibility in some scenarios.

This patch changes the dirty_memory_manager so that the charges won't be
reverted right after we finish the flush. Rather, we will hold on to it,
and revert it right before we update the cache. We don't need to do it
for all classes of memtable writes, because after we finish flushing,
flush_one() will destroy the hashed element anyway.

[tgrabiec: conflicts]

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <2d5a8f6ca57d5036f4850ac163557bca59b8063d.1480004384.git.glauber@scylladb.com>
(cherry picked from commit c32803f2f0)
2016-12-12 19:05:34 +01:00
Duarte Nunes
549c979035 lz4: Conditionally use LZ4_compress_default()
Since not all distributions have a version of LZ4 with
LZ4_compress_default(), we use it conditionally.

This is especially important beginning with version 1.7.3 of LZ4,
which deprecates the LZ4_compress() function in favour of
LZ4_compress_default() and thus prevents Scylla from compiling
due to the deprecation warning.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20161124092339.23017-1-duarte@scylladb.com>
(cherry picked from commit cc3f26c993)
2016-12-11 19:33:26 +02:00
Avi Kivity
631d921767 Update seastar submodule
* seastar 386ccd9...bd9eda1 (1):
  > rpc: Conditionally use LZ4_compress_default()
2016-12-11 19:24:06 +02:00
Glauber Costa
0a341b403b database: try to acquire semaphore before we start flush
As Tomek pointed out, since we were starting the flush before acquiring
the semaphore, we were not really limiting parallelism, only delaying
the end of the flush.

Fixes #1919

[tgrabiec: conflicts]

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <6cbf9ec2f3a341c76becf94f794cfa16539c5192.1481120410.git.glauber@scylladb.com>
(cherry picked from commit 733d87fcc6)
2016-12-09 10:56:36 +01:00
Avi Kivity
182f67cf23 sstables: fix probe with Unknown component
Commit 53b7b7def3 ("sstables: handle unrecognized sstable component")
ignores unrecognized components, but misses one code path during probe_file().

Ignore unrecognized components there too.

Fixes #1922.
Message-Id: <20161208131027.28939-1-avi@scylladb.com>

(cherry picked from commit 872b5ef5f0)
2016-12-08 17:23:30 +02:00
Tomasz Grabiec
dc08cb46bb commitlog: Fix replay to not delete dirty segments
The problem is that replay will unlink any segments which were on disk
at the time the replay starts. However, some of those segments may
have been created by the current node since boot. If a segment is part
of the reserve, for example, it will be unlinked by replay, but we will
still use that segment to log mutations. Those mutations will not be
visible to replay after a crash, though.

The fix is to record the preexisting segments before any new segments
have a chance to be created, and use that as the replay list.

Introduced in abe7358767.

dtest failure:

 commitlog_test.py:TestCommitLog.test_commitlog_replay_on_startup

Message-Id: <1481117436-6243-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit f7197dabf8)
2016-12-07 15:56:08 +02:00
Amos Kong
06db918d1e systemd: reset housekeeping timer at each start
Currently the housekeeping timer isn't reset when we restart scylla-server.
We expect the service to run at each start; this is consistent with the
upstart script on Ubuntu 14.04.

So that the housekeeping timer is also restarted when we restart
scylla-server, let's replace "OnBootSec" with "OnActiveSec".
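The change amounts to swapping the trigger directive in the timer unit; a minimal sketch of such a unit (values, description, and intervals are illustrative, not the shipped file):

```ini
# scylla-housekeeping.timer (illustrative sketch)
[Unit]
Description=Run scylla-housekeeping periodically

[Timer]
# OnBootSec counts from machine boot, so restarting the service does not
# re-arm the timer. OnActiveSec counts from the moment the timer unit
# becomes active, which happens again on each restart.
OnActiveSec=5min
OnUnitActiveSec=1d

[Install]
WantedBy=timers.target
```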

Fixes: #1601

Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <a22943cc11a3de23db266c52fd476c08014098c4.1480607401.git.amos@scylladb.com>
2016-12-06 18:33:56 +02:00
Takuya ASADA
edbd25ea0c dist/common/systemd/scylla-housekeeping.timer: workaround to avoid crash of systemd on RHEL 7.3
RHEL 7.3's systemd contains a known bug in timer.c:
https://github.com/systemd/systemd/issues/2632

This is a workaround to avoid hitting that bug.

Fixes #1846

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1480452194-11683-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 8464903021)
2016-12-06 10:48:52 +02:00
Pekka Enberg
c7f7a3aaa1 release: prepare for 1.5.rc2 2016-12-05 09:50:37 +02:00
Paweł Dziepak
c014e7385d row_cache: dummy entry does not count as partition
Since the introduction of the continuity flag, the row cache contains a
single dummy entry. cache_tracker knows nothing about it, so it doesn't
appear in any of the metrics. However, the cache destructor calls
cache_tracker::on_erase() for every entry in the cache, including the
dummy one. This is incorrect since the tracker wasn't informed when the
dummy entry was created.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1478608776-10363-1-git-send-email-pdziepak@scylladb.com>
2016-12-01 14:26:20 +01:00
Glauber Costa
abe7358767 prevent commitlog replay position reordering during reserve refill
When requests hit the commitlog, each of them will be assigned a replay
position, which we expect to be ordered. If reorders happen, the request
will be discarded and re-applied. Although this is supposed to be rare,
it does increase our latencies, especially when big requests are
involved. Processing a big request is expensive, and if we have to do it
twice that adds to the cost.

The commitlog is supposed to issue replay positions in order, and it
could be that the code that adds them to the memtables reorders
them. However, there is one instance in which the commitlog will not
keep its side of the bargain.

That happens when the reserve is exhausted and we are allocating a
segment directly at the same time the reserve is being replenished. The
following sequence of events, with its deferring points, will illustrate
it:

on_timer:

    return this->allocate_segment(false). // defer here // then([this](sseg_ptr s) {

At this point, the segment id is already allocated.

new_segment():

    if (_reserve_segments.empty()) {
	[ ... ]
        return allocate_segment(true).then ...

At this point, we have a new segment that has an id that is higher than
the previous id allocated.

Then we resume the execution from the deferring point in on_timer():

    i = _reserve_segments.emplace(i, std::move(s));

The next time we need to allocate a segment, we'll pick it from the
reserve. But the segment in the reserve has an id that is lower than the
id that we have already used.

Reorders are bad, but this one is particularly bad: because the reorder
happens with the segment id side of the replay position, that means that
every request that falls into that segment will have to be reinserted.

This bug can be a bit tricky to reproduce. To make it more common, we
can artificially add a sleep() fiber after the allocate_segment(false)
in on_timer(). If we do that, we'll see a sea of reinsertions going on
in the logs (if dblog is set to debug).

Applying this patch (keeping the sleep) will make them all disappear.
We do this by rewriting the reserve logic so that segments always
come from the reserve. If we draw from a single pool all the time, there
is no chance of reordering. To make that possible, we'll have
the reserve filler always running in the background, and take it out
of the timer code.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <49eb7edfcafaef7f1fdceb270639a9a8b50cfce7.1480531446.git.glauber@scylladb.com>
(cherry picked from commit 99a5a77234)
2016-12-01 13:33:35 +01:00
Glauber Costa
0bce019781 commitlog: sync segments before acquiring semaphore on shutdown.
Sync all segments before acquiring the semaphore; otherwise we may
have to wait for the timer to kick in and push them down.
Note that we can't guarantee that no other requests were executed in the
meantime, so we have to sync again.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <aea019fe49820acce5d2b55dd5ec31e975b3436c.1480388674.git.glauber@scylladb.com>
(cherry picked from commit 353a4cd2d4)
2016-12-01 13:33:35 +01:00
Tomasz Grabiec
ae3b1667e3 tests: Fix use-after-free on commitlog
Only shutdown() ensures all internal processes are complete. Call it before calling clear().

Message-Id: <1480495534-2253-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit c35e18ba12)
2016-12-01 13:33:35 +01:00
Tomasz Grabiec
2aa73ac176 Update seastar submodule
* seastar 6fd4534...386ccd9 (1):
  > queue: allow queue to change its maximum size
2016-12-01 13:33:35 +01:00
Avi Kivity
261fcc1e12 Update scylla-ami submodule
* dist/ami/files/scylla-ami e1e3919...d5a4397 (3):
  > scylla_install_ami: allow specifying different repositories for Scylla installation and updates
  > scylla_install_ami: delete unneeded authorized_keys from AMI image
  > scylla_ami_setup: run posix_net_conf.sh when NCPUS < 8
2016-12-01 10:46:21 +02:00
Takuya ASADA
3a7b9d55da dist/ami: allow specifying different repositories for Scylla installation and updates
This fix splits build_ami.sh's --repo into three different options:
 --repo-for-install is for Scylla package installation, only valid
 during AMI construction.

 --repo-for-update will be stored at /etc/yum.repos.d/scylla.repo, to
 receive package updates on the AMI.

 --repo is both, for installation and update.

Fixes #1872

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1480438858-6007-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 17ef5e638e)
2016-12-01 10:44:47 +02:00
Glauber Costa
60d5b21e28 database: do not call seal directly from the streaming timer
Streaming memtables have a delayed mode where many flushes are coalesced
into one, with the actual flush happening later and propagated
to all the previous waiters.

However, the timer that triggers the actual flush was not using the
newly introduced flush infrastructure. This was a minor problem because
those flushes wouldn't try to take the semaphore, and so we could have
many flushes going on at the same time.

What was a potential performance issue became a correctness issue when
we moved the reversal of the dirty memory accounting out of
revert_potentially_cleaned_up_memory() into remove_from_flush_manager().

Since the latter is only called through the flush infrastructure, it
simply wasn't called. So the deferral of the reversal exposed this bug.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <0d5755375bc27524b8cfb9970c76d492b14d9eea.1480522742.git.glauber@scylladb.com>
(cherry picked from commit d7256e7b21)
2016-11-30 18:01:52 +01:00
Glauber Costa
903a323ba2 commitlog: use read ahead for replay requests
Aside from putting the requests in the commitlog class, read ahead
will help us go through the file faster.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit 59a41cf7f1)
2016-11-30 13:03:33 +02:00
Glauber Costa
0174b9ad18 commitlog: use commitlog priority for replay
Right now replay is issued with the standard seastar priority.
The rationale at the time was that replay is an early event that
doesn't really share the disk with anybody.

That is largely untrue now that we start compactions on boot.
Compactions may fight for bandwidth with the commitlog, and with such
low priority the commitlog is guaranteed to lose.

Fixes #1856

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit aa375cd33d)
2016-11-30 13:03:27 +02:00
Glauber Costa
3b7f646f88 commitlog: close file after read, and not at stop
There are other code paths that may interrupt the read in the middle
and bypass stop. It's safer this way.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <8c32ca2777ce2f44462d141fd582848ac7cf832d.1479477360.git.glauber@scylladb.com>
(cherry picked from commit 60b7d35f15)
2016-11-30 13:01:49 +02:00
Glauber Costa
127152e0a7 commitlog: close replay file
The replay file is opened, so it should be closed. We're not seeing any
problems arising from this, but they may happen; enabling read ahead in
this stream makes them happen immediately. Fix it.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit 4d3d774757)
2016-11-30 12:59:04 +02:00
Takuya ASADA
80811d3891 dist/common/scripts/scylla_kernel_check: fix incorrect document URL
Fixes #1871

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1480327243-18177-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 1042e40188)
2016-11-29 11:12:09 +02:00
Avi Kivity
c6ffda7abe Update seastar submodule
* seastar df471a8...6fd4534 (1):
  > Collectd get_value_map safe scan the map

Fixes #1835.
2016-11-27 18:19:43 +02:00
Takuya ASADA
be9f62bd60 dist/ubuntu: increase number of open files on Ubuntu 14.04 (upstart)
Follow the change of NOFILE for non-systemd environments.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1479975050-14907-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit ce80fb3a39)
2016-11-24 17:17:41 +02:00
Glauber Costa
d6ab5ff179 dist: increase number of open files
This limit was found to be too low for production environments. It would
be hit at boot, when we're touching a lot of files from multiple shards
before deciding that we don't need them.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <87bbf43da1a67f5fa6174017205c6ef8bdb0dc3d.1479829232.git.glauber@scylladb.com>
(cherry picked from commit 18b9fa3d43)
2016-11-24 17:17:13 +02:00
Duarte Nunes
8a83819f1d thrift: Don't apply cell limit across rows
In Thrift, SliceRange defines a count that limits the number of cells
to return from that row (in CQL3 terms, it limits the number of rows
in that partition). While this limit is honored in the engine, the
Thrift layer also applies the same limit, which, while redundant in
most cases, is used to support the get_paged_slice verb.

Currently, the limit is not being reset per Thrift row (CQL3
partition), so in practice, instead of limiting the cells in a row,
we're limiting the rows we return as well. This patch fixes that by
ensuring the limit applies only within a row/partition.

Fixes #1882

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20161123220001.15496-1-duarte@scylladb.com>
(cherry picked from commit a527ba285f)
2016-11-24 10:38:55 +02:00
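The per-row limit fix above can be sketched as follows; this is a hedged illustration with made-up types (`row` as a vector of cells), not Scylla's Thrift code:

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Cells represented as ints for illustration only.
using row = std::vector<int>;

// Buggy shape: one shared budget across all rows, so once the budget is
// exhausted, whole rows are dropped (rows get limited, not just cells).
std::vector<row> slice_shared_budget(const std::vector<row>& rows, std::size_t count) {
    std::vector<row> out;
    std::size_t budget = count;
    for (const auto& r : rows) {
        if (budget == 0) {
            break;  // later rows are lost entirely
        }
        auto n = std::min(budget, r.size());
        out.emplace_back(r.begin(), r.begin() + n);
        budget -= n;
    }
    return out;
}

// Fixed shape: the budget is reset for every row, limiting only the
// number of cells *within* each row.
std::vector<row> slice_per_row(const std::vector<row>& rows, std::size_t count) {
    std::vector<row> out;
    for (const auto& r : rows) {
        auto n = std::min(count, r.size());
        out.emplace_back(r.begin(), r.begin() + n);
    }
    return out;
}
```

With two rows of three cells each and a count of 3, the shared-budget variant returns a single row while the per-row variant returns both.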
Pekka Enberg
44249e4b09 dist/docker: Actually use 1.5...
Fix typo in the RPM repository URL to actually use 1.5.
2016-11-24 07:57:16 +02:00
Pekka Enberg
33c3a7e722 dist/docker: Use Scylla 1.5 RPM repository 2016-11-24 07:47:44 +02:00
Tomasz Grabiec
de5327a4fb Update seastar submodule
* seastar 25137c2...df471a8 (1):
  > semaphore_units: add missing return statement
2016-11-23 19:55:36 +01:00
Glauber Costa
6c7f055955 keep background work semaphore alive during sstable flush
We have a semaphore controlling the amount of background work generated
by the memtable flush process. However, because we are not moving it
inside the memtable post-flush continuation, the units are being
released when we start the flush and not when we finish it.

That's not the intended behavior, and it can cause flushes to
accumulate.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <b7dc1866ed3473b9b1862c433d59c5ebd8575dbc.1479839600.git.glauber@scylladb.com>
(cherry picked from commit 13973e7f3b)
2016-11-22 19:54:38 +01:00
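The unit-lifetime bug above can be sketched with a toy semaphore. The `semaphore` and `units` types below are simplified stand-ins (assumptions, not Seastar's real types), and the pending continuation is modeled as a plain `std::function`:

```cpp
#include <cassert>
#include <functional>
#include <memory>

struct semaphore {
    int available;
    explicit semaphore(int n) : available(n) {}
};

// RAII units: taken on construction, returned on destruction.
struct units {
    semaphore* sem;
    int n;
    units(semaphore& s, int count) : sem(&s), n(count) { sem->available -= n; }
    units(const units&) = delete;
    ~units() { sem->available += n; }
};

// Buggy shape: the units are NOT moved into the post-flush continuation,
// so they are released as soon as the flush *starts*.
void flush_releasing_early(semaphore& sem, std::function<void()>& continuation) {
    units u(sem, 1);
    continuation = [] { /* flush finished; units already long gone */ };
    // `u` is destroyed here, releasing the semaphore while the flush
    // is still in flight.
}

// Fixed shape: the units live inside the continuation, so they are
// released only when the flush completes and the continuation is dropped.
void flush_releasing_on_completion(semaphore& sem, std::function<void()>& continuation) {
    continuation = [held = std::make_shared<units>(sem, 1)] { /* flush finished */ };
}
```

In the fixed variant the semaphore count stays decremented for as long as the continuation is alive, which is exactly the backpressure the commit restores.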
Glauber Costa
d58af7ded5 commitlog: acquire semaphore earlier
Recently we have changed our shutdown strategy to wait for the
_request_controller semaphore to make sure no other allocations are
in-flight. That was done to fix an actual issue.

The problem is that this wasn't done early enough. We acquire the
semaphore after we have already marked ourselves as _shutdown and
released the timer.

That means that if there is an allocation in flight that needs to use a
new segment, it will never finish - and we'll therefore never acquire
the semaphore.

Fix it by acquiring it first. At this point the allocations will all be
done and gone, and then we can shutdown everything else.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <5c2a2f20e3832b6ea37d6541897519a9307294ed.1479765782.git.glauber@scylladb.com>
(cherry picked from commit 0b8b5abf16)
2016-11-21 22:23:15 +00:00
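The ordering problem can be modeled deterministically; the `commitlog_model` below is an illustrative single-threaded reduction of the deadlock, not the real commitlog:

```cpp
#include <cassert>

struct commitlog_model {
    int total_units = 4;
    int available = 3;     // one allocation currently in flight
    bool is_shutdown = false;

    // The in-flight allocation needs a new segment, which is refused
    // once we are marked shut down.
    bool try_finish_allocation() {
        if (is_shutdown) {
            return false;  // would wait for a segment forever
        }
        ++available;       // allocation finished, unit returned
        return true;
    }
};

// Buggy order: mark shutdown before draining the semaphore. The pending
// allocation can no longer finish, so acquiring all units waits forever.
bool shutdown_buggy(commitlog_model& c) {
    c.is_shutdown = true;
    if (!c.try_finish_allocation()) {
        return false;      // deadlock: available never reaches total
    }
    return c.available == c.total_units;
}

// Fixed order: drain in-flight allocations first, then mark shutdown.
bool shutdown_fixed(commitlog_model& c) {
    if (!c.try_finish_allocation()) {
        return false;
    }
    c.is_shutdown = true;
    return c.available == c.total_units;
}
```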
Avi Kivity
d9700a2826 storage_proxy: don't query concurrently needlessly during range queries
storage_proxy has an optimization where it tries to query multiple token
ranges concurrently to satisfy very large requests (an optimization which is
likely meaningless when paging is enabled, as it always should be).  However,
the rows-per-range code severely underestimates the number of rows per range,
resulting in a large number of "read-ahead" internal queries being performed,
the results of most of which are discarded.

Fix by disabling this code. We should likely remove it completely, but let's
start with a band-aid that can be backported.

Fixes #1863.

Message-Id: <20161120165741.2488-1-avi@scylladb.com>
(cherry picked from commit 6bdb8ba31d)
2016-11-21 18:19:59 +02:00
Glauber Costa
d2438059a7 database: keep a pointer to the memtable list in a memtable
We currently pass a region group to the memtable, but after so many recent
changes, that is a bit too low level. This patch changes that so we pass
a memtable list instead.

Doing that also has a couple of advantages. Mainly, during flush we must
get from a memtable to a memtable_list. Currently we do that by going
from the memtable to a column family through the schema, and from there
to the memtable_list.

That, however, involves calling virtual functions in a derived class,
because a single column family could have both streaming and normal
memtables. If we pass a memtable_list to the memtable, we can keep a
pointer, and when needed get the memtable_list directly.

Not only does that get rid of the inheritance for aesthetic reasons, but
that inheritance is not even correct anymore. Since the introduction of
the big streaming memtables, we now have a plethora of lists per column
family and this traversal is totally wrong. We hadn't noticed before
because we were flushing the memtables based on their individual sizes,
but it has been wrong all along for edge cases in which we would have to
resort to size-based flush. This could be the case, for instance, with
various plan_ids in flight at the same time.

At this point, there is no more reason to keep the derived classes for
the dirty_memory_manager. I'm only keeping them around to reduce
clutter, although they are useful for the specialized constructors and
to communicate to the reader exactly what they are. But those can be
removed in a follow up patch if we want.

The old memtable constructor signature is kept around for the benefit of
two tests in memtable_tests which have their own flush logic. In the
future we could do something like we do for the SSTable tests, and have
a proxy class that is friends with the memtable class. That too, is left
for the future.

Fixes #1870

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <811ec9e8e123dc5fc26eadbda82b0bae906657a9.1479743266.git.glauber@scylladb.com>
(cherry picked from commit 0ca8c3f162)
2016-11-21 18:18:56 +02:00
Glauber Costa
4098831ebc commitlog: wait for pending allocations to finish before closing gate.
Allocations may enter the gate, so it would be wise for us to wait for them.

Fixes #1860

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <53cd6996c1cbd8b38bab3b03604bd11e5c20beda.1479650012.git.glauber@scylladb.com>
(cherry picked from commit 21c1e2b48c)
2016-11-20 20:00:32 +02:00
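A minimal sketch of the gate pattern involved, loosely modeled on seastar::gate but single-threaded and synchronous for illustration:

```cpp
#include <cassert>
#include <stdexcept>

// A gate tracks in-flight operations. enter() fails once the gate is
// closed; close() must not complete while holders remain inside.
class gate {
    int _count = 0;
    bool _closed = false;
public:
    void enter() {
        if (_closed) {
            throw std::runtime_error("gate closed");
        }
        ++_count;
    }
    void leave() { --_count; }
    // Returns true only once every holder has left - modeling the
    // future that close() resolves after pending work drains.
    bool try_close() {
        _closed = true;
        return _count == 0;
    }
    int holders() const { return _count; }
};
```

The commit's point maps onto this sketch: operations that entered the gate before close() must be waited for, and new entrants are rejected once closing begins.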
Glauber Costa
4539b8403a database: fix direct flushes of non-durable column families.
If a Column Family is non-durable, then its flushes will never create a
memtable flush reader. Our current flush logic depends on that being
created and destroyed to release the semaphore permits on the flush.

We will remove the permits ourselves if there is an exception, but not
under normal circumstances. Given this issue, however, it would be more
adequate to always try to remove the permits after we flush. If the
permits were already removed by the flush reader, then this check will
just see that the permit is not in the map and return. But if it is
still there, then it is removed.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <049334c3b4bef620af2c7c045e6c84347dcf9013.1479498026.git.glauber@scylladb.com>
(cherry picked from commit 1933349654)
2016-11-18 21:33:22 +01:00
Raphael S. Carvalho
558f535fcb db: do not leak deleted sstable when deletion triggers an exception
The leak results in deleted sstables remaining open until shutdown, so disk
space isn't released. That's because column_family::rebuild_sstable_list()
will not remove the reference to deleted sstables if an exception was triggered
in sstables::delete_atomically(). An sstable only has its files closed when its
object is destructed.

The exception happens when a major compaction is issued in parallel to a
regular one, and one of them will be unable to delete an sstable already
deleted by the other. That results in remove_by_toc_name() triggering
boost::filesystem::filesystem_error because the TOC and temporary TOC don't exist.

We wouldn't have seen this problem if major compaction were going through
compaction manager, but remove_by_toc_name() and rebuild_sstable_list() should
be made resilient.

Fixes #1840.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <d43b2e78f9658e2c3c5bbb7f813756f18874bf92.1479390842.git.raphaelsc@scylladb.com>
(cherry picked from commit 3dc9294023)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <760f96d81de0bab7507bb4f52c06b30f21e82577.1479420770.git.raphaelsc@scylladb.com>
2016-11-18 13:10:46 +02:00
Glauber Costa
3d45d0d339 fix shutdown and exception conditions for flush logic
This patch addresses post-merge follow up comments by Tomek.
Basically, what we do is:
- we don't need to signal() from remove_from_flush_manager(), because
  the explicit flushes no longer wait on the condition variable. So we
  don't.
- We now wait on the stop() flushes (regardless of their return status)
  so we can make sure that the _flush_queue will indeed be drained.
- we acquire the semaphore before shutting down the dirty_memory_manager
  to make sure that there are no pending flushes
- the flush manager that holds the semaphore has to match in the exception
  handler

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <a23ab5098934546c660a08de64cd9294bb3a2008.1479400239.git.glauber@scylladb.com>
(cherry picked from commit 461778918b)
2016-11-18 11:53:21 +02:00
Avi Kivity
affc0d9138 Merge "get rid of memtable size parameter and rework flush logic" from Glauber
"This patchset allows Scylla to determine the size of a memtable instead
of relying on the user-provided memtable_cleanup_threshold. It does that
by allowing the region_group to specify a soft limit which will trigger
the allocation as early as it is reached.

Given that, we'll keep the memtables in memory for as long as it takes
to reach that limit, regardless of the individual size of any single one
of them. That limit is set to 1/4 of dirty memory. That's the same as
last submission, except this time I have run some experiments to gauge
behavior of that versus 1/2 of dirty memory, which was a preferred
theoretical value.

After that is done, the flush logic is reworked to guarantee that
flushes are not initiated if we already have one memtable under flush.
That allows us to better take advantage of coalescing opportunities with
new requests and prevents the pending memtable explosion that is
ultimately responsible for Issue 1817.

I have run mainly two workloads with this. The first one a local RF=1
workload with large partitions, sized 128kB and 100 threads. The results
are:

Before:

op rate                   : 632 [WRITE:632]
partition rate            : 632 [WRITE:632]
row rate                  : 632 [WRITE:632]
latency mean              : 157.8 [WRITE:157.8]
latency median            : 115.5 [WRITE:115.5]
latency 95th percentile   : 486.7 [WRITE:486.7]
latency 99th percentile   : 534.8 [WRITE:534.8]
latency 99.9th percentile : 599.0 [WRITE:599.0]
latency max               : 722.6 [WRITE:722.6]
Total partitions          : 189667 [WRITE:189667]
Total errors              : 0 [WRITE:0]
total gc count            : 0
total gc mb               : 0
total gc time (s)         : 0
avg gc time(ms)           : NaN
stdev gc time(ms)         : 0
Total operation time      : 00:05:00
END

After:

op rate                   : 951 [WRITE:951]
partition rate            : 951 [WRITE:951]
row rate                  : 951 [WRITE:951]
latency mean              : 104.8 [WRITE:104.8]
latency median            : 102.5 [WRITE:102.5]
latency 95th percentile   : 155.8 [WRITE:155.8]
latency 99th percentile   : 177.8 [WRITE:177.8]
latency 99.9th percentile : 686.4 [WRITE:686.4]
latency max               : 1081.4 [WRITE:1081.4]
Total partitions          : 285324 [WRITE:285324]
Total errors              : 0 [WRITE:0]
total gc count            : 0
total gc mb               : 0
total gc time (s)         : 0
avg gc time(ms)           : NaN
stdev gc time(ms)         : 0
Total operation time      : 00:05:00
END

The other workload was the one described in #1817. The result is that we
now have a load that is very stable around 100k ops/s with hardly any
timeouts, instead of the 1.4 baseline's wild variations around 100k
ops/s with lots of timeouts, or the deep throughput reduction of
1.5-rc1."

* 'issue-1817-v4' of github.com:glommer/scylla:
  database: rework memtable flush logic
  get rid of max_memtable_size
  pass a region to dirty_memory_manager accounting API
  memtable: add a method to expose the region_group
  logalloc: allow region group reclaimer to specify a soft limit
  database: remove outdated comment
  database: uphold virtual dirty for system tables.

(cherry picked from commit 5d067eebf2)
2016-11-17 14:41:23 +02:00
Gleb Natapov
3c68504e54 sstables: fix ad-hoc summary creation
If the sstable Summary is not present, Scylla does not refuse to boot but
instead creates the summary information on the fly. There is a bug in this
code, though: the Summary file is a map between keys and offsets into the
Index file, but the code creates a map between keys and Data file offsets
instead. Fix it by keeping the offset of an index entry in the index_entry
structure and using it during Summary file creation.

Fixes #1857.

Reviewed-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20161116165421.GA22296@scylladb.com>
(cherry picked from commit ae0a2935b4)
2016-11-17 11:45:29 +02:00
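The fix above amounts to recording each entry's own position in the Index file and using that, not the Data-file offset, when building the summary. A hedged sketch with illustrative types:

```cpp
#include <cassert>
#include <cstdint>
#include <map>
#include <string>
#include <vector>

// Hypothetical index entry: the key, the Data-file position it points
// at, and the position of this entry inside the Index file itself.
struct index_entry {
    std::string key;
    uint64_t data_offset;    // where the partition lives in the Data file
    uint64_t index_offset;   // where this entry lives in the Index file
};

// Ad-hoc summary: keys must map to *Index* file offsets, so that a
// summary lookup lands on an index entry rather than somewhere in the
// middle of the Data file.
std::map<std::string, uint64_t> build_summary(const std::vector<index_entry>& index) {
    std::map<std::string, uint64_t> summary;
    for (const auto& e : index) {
        summary[e.key] = e.index_offset;   // the bug used e.data_offset here
    }
    return summary;
}
```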
Raphael S. Carvalho
e9b26d547d main: fix exception handling when initializing data or commitlog dirs
Exception handling was broken because, after the I/O checker, system
error exceptions are wrapped in a storage_io_error exception. Also, the
message when handling the exception wasn't precise enough for all cases.
For example, lack of permission to write to an existing data directory.

Fixes #883.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <b2dc75010a06f16ab1b676ce905ae12e930a700a.1478542388.git.raphaelsc@scylladb.com>
(cherry picked from commit 9a9f0d3a0f)
2016-11-16 15:12:48 +02:00
Raphael S. Carvalho
8510389188 sstables: handle unrecognized sstable component
As in C*, unrecognized sstable components should be ignored when
loading an sstable. At the moment, Scylla fails to do so and will
not boot as a result. In addition, unknown components should be
remembered when moving an sstable or changing its generation.

Fixes #1780.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <b7af0c28e5b574fd577a7a1d28fb006ac197aa0a.1478025930.git.raphaelsc@scylladb.com>
(cherry picked from commit 53b7b7def3)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <e30115e089a4c3c3fb4aad956645c9d006c2ee55.1479141101.git.raphaelsc@scylladb.com>
2016-11-16 15:11:05 +02:00
Amnon Heiman
ea61a8b410 API: cache_capacity should use uint for summing
Using `int` as the type for the map_reduce causes numeric overflow.

Fixes #1801

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <1479299425-782-1-git-send-email-amnon@scylladb.com>
(cherry picked from commit a4be7afbb0)
2016-11-16 15:03:15 +02:00
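The overflow comes from the accumulator type being deduced from the initial value of the reduction. A small sketch of both shapes using std::accumulate (illustrative helpers, not the actual API):

```cpp
#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

// Correct: seeding with uint64_t(0) keeps the whole reduction in 64 bits.
uint64_t sum_capacities_64(const std::vector<uint64_t>& per_shard) {
    return std::accumulate(per_shard.begin(), per_shard.end(), uint64_t(0));
}

// Buggy shape: the 32-bit seed makes std::accumulate carry a 32-bit
// accumulator, so totals beyond 4 GiB wrap around.
uint32_t sum_capacities_32(const std::vector<uint64_t>& per_shard) {
    return std::accumulate(per_shard.begin(), per_shard.end(), uint32_t(0));
}
```

Two shards of 3 GiB each already demonstrate the wraparound: the 64-bit sum is 6 GiB, while the 32-bit sum silently drops 4 GiB.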
Paweł Dziepak
bd694d845e partition_version: make sure that snapshot is destroyed under LSA
The snapshot destructor may free some objects managed by the LSA. That's
why the partition_snapshot_reader destructor explicitly destroys the
snapshot it uses. However, it was possible that an exception thrown by
_read_section prevented that from happening, leaving the snapshot to be
destroyed implicitly without the current allocator set to the LSA.

Refs #1831.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1478778570-2795-1-git-send-email-pdziepak@scylladb.com>
(cherry picked from commit f16d6f9c40)
2016-11-16 14:34:11 +02:00
Paweł Dziepak
01c01d9ac4 query_pagers: distinct queries do not have clustering keys
Query pager needs to handle results that contain partitions with
possibly multiple clustering rows quite differently than results with
just one row per partition (for example a page may end in a middle of
partition). However, the logic dealing with partitions with clustering
rows doesn't work correctly for SELECT DISTINCT queries, which are
much more similar to the ones for schemas without clustering key.

The solution is to set _has_clustering_keys to false in case of SELECT
DISTINCT queries regardless of the schema, which will make the pager
correctly expect each partition to return at most one row.

Fixes #1822.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1478612486-13421-1-git-send-email-pdziepak@scylladb.com>
(cherry picked from commit 055d78ee4c)
2016-11-16 10:17:34 +01:00
Paweł Dziepak
ed39e8c235 row_cache: touch entries read during range queries
Fixes #1847.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1479230809-27547-1-git-send-email-pdziepak@scylladb.com>
(cherry picked from commit 999dafbe57)
2016-11-15 20:34:40 +00:00
Avi Kivity
c57835e7b5 Merge "Fixes for histogram and moving average calculations" from Glauber
"JMX metrics were found to be either not showing, or showing absurd
values.  Turns out there were multiple things wrong with them. The
patches were sent separately but conflict with one another. This series
is a collection of the patches needed to fix the issues we saw.

Fixes #1832, #1836, #1837"

(cherry picked from commit bf20aa722b)
2016-11-13 11:42:53 +02:00
Amnon Heiman
13baa04056 API: fix a type in storage_proxy
This patch fixes a typo in the URL definition, which caused the JMX side
not to find the metric.

Fixes #1821

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <1478563869-20504-1-git-send-email-amnon@scylladb.com>
(cherry picked from commit c8082ccadb)
2016-11-13 09:25:14 +02:00
Glauber Costa
298de37cef histogram: moving averages: fix inverted parameters
moving_averages constructor is defined like this:

    moving_average(latency_counter::duration interval, latency_counter::duration tick_interval)

But when it is time to initialize them, we do this:

	... {tick_interval(), std::chrono::minutes(1)} ...

As can be seen, the interval and the tick interval are inverted. This
leads to the metrics being assigned bogus values.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <d83f09eed20ea2ea007d120544a003b2e0099732.1478798595.git.glauber@scylladb.com>
(cherry picked from commit d3f11fbabf)
2016-11-11 10:15:32 +02:00
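One way to make such a swap impossible to compile is to wrap the two durations in distinct types; this is a hedged sketch of that remedy, not the actual fix (the real patch simply reorders the arguments):

```cpp
#include <cassert>
#include <chrono>

// Two parameters of the same type can be swapped silently; distinct
// wrapper types make the swap a compile-time error.
struct window { std::chrono::seconds value; };  // the averaging interval
struct tick   { std::chrono::seconds value; };  // the tick interval

struct moving_average {
    std::chrono::seconds interval;
    std::chrono::seconds tick_interval;
    moving_average(window w, tick t) : interval(w.value), tick_interval(t.value) {}
};
```

Calling `moving_average(tick{...}, window{...})` now fails to compile instead of silently producing bogus metrics.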
Paweł Dziepak
91e5e50647 Merge "Remove quadratic behavior from atomic sstable deletion" from Avi
"The atomic sstable deletion provides exception safety at the cost of
quadratic behavior in the number of sstables awaiting deletion.  This
causes high cpu utilization during startup.

Change the code to avoid quadratic complexity, and add some unit tests.

See #1812."

(cherry picked from commit 985d2f6d4a)
2016-11-08 22:46:01 +02:00
Pekka Enberg
08b1ff53dd release: prepare for 1.5.rc1 2016-11-02 13:39:53 +02:00
Pekka Enberg
0485289741 cql3: Fix selecting same column multiple times
Under the hood, the selectable::add_and_get_index() function
deliberately filters out duplicate columns. This causes
simple_selector::get_output_row() to return a row with all duplicate
columns filtered out, which triggers an assertion because of a row
mismatch with the metadata (which contains the duplicate columns).

The fix is rather simple: just make selection::from_selectors() use
selection_with_processing if the number of selectors and column
definitions doesn't match -- like Apache Cassandra does.

Fixes #1367
Message-Id: <1477989740-6485-1-git-send-email-penberg@scylladb.com>

(cherry picked from commit e1e8ca2788)
2016-11-01 09:33:19 +00:00
Avi Kivity
b3504e5482 Update seastar submodule
* seastar 57a17ca...25137c2 (2):
  > reactor: improve task quota timer resolution
  > future: prioritise continuations that can run immediately

Fixes #1794.
2016-10-28 14:17:26 +03:00
Avi Kivity
6cdb1256bb Update seastar submodule
* seastar e2c2bbc...57a17ca (1):
  > rpc: Avoid using zero-copy interface of output_stream (Fixes #1786)
2016-10-28 14:11:47 +03:00
Pekka Enberg
39b0da51a3 auth: Fix resource level handling
We use the `data_resource` class in the CQL parser, which lets users refer
to a table resource without specifying a keyspace. This asserts out in
get_level() for no good reason, as we already know the intended level
based on the constructor. Therefore, change `data_resource` to track the
level like upstream Cassandra does and use that.

Fixes #1790

Message-Id: <1477599169-2945-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit b54870764f)
2016-10-27 23:37:50 +03:00
Glauber Costa
0656e66f5f auth: always convert string to upper case before comparing
We store all auth permission strings in upper case, but the user might
very well pass them in lower case.

We could use a case-insensitive key comparator / hash here, but since the
strings tend to be small, the new sstring will likely be allocated on
the stack and this approach yields significantly less code.

Fixes #1791.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <51df92451e6e0a6325a005c19c95eaa55270da61.1477594199.git.glauber@scylladb.com>
(cherry picked from commit ef3c7ab38e)
2016-10-27 22:11:02 +03:00
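The fix amounts to normalizing case before the map lookup. A sketch with illustrative permission names (the real code uses boost::to_upper on an sstring):

```cpp
#include <algorithm>
#include <cassert>
#include <cctype>
#include <stdexcept>
#include <string>
#include <unordered_map>

// Upper-case a copy of the lookup key so stored upper-case names match.
std::string to_upper_copy(std::string s) {
    std::transform(s.begin(), s.end(), s.begin(),
                   [](unsigned char c) { return static_cast<char>(std::toupper(c)); });
    return s;
}

// Illustrative permission table; names and values are made up.
int permission_from_string(const std::string& s) {
    static const std::unordered_map<std::string, int> names{
        {"SELECT", 1}, {"MODIFY", 2}, {"ALTER", 3},
    };
    return names.at(to_upper_copy(s));   // "select" and "SELECT" both match
}
```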
Avi Kivity
185fbb8abc Merge "Cache fixes" from Paweł
"5ff699e09fcbd62611e78b9de601f6c8636ab2f0 ("row_cache: rework cache to
use fast forwarding reader") brought some significant changes to the
row cache implementation. Unfortunately, "significant changes" often
translates to "more bugs" and this time was no different.

This series contains fixes for the problems introduced in that rework
and makes failing dtest
bootstrap_test.py:TestBootstrap.local_quorum_bootstrap_test
pass again."

* 'pdziepak/cache-fixes/v1' of github.com:cloudius-systems/seastar-dev:
  row_cache: avoid dereferencing invalid iterator
  row_cache: set _first_element flag correctly
  row_cache: fix clearing continuity flag at eviction

(cherry picked from commit 72d78ffa7e)
2016-10-27 11:45:20 +03:00
Tomasz Grabiec
4ed3d350cc Update seastar submodule
* seastar ab1531e...e2c2bbc (3):
  > rpc: do not assume underling semaphore type
  > rpc: fix default resource limit
  > rpc: Move _connected flag to protocol::connection
2016-10-26 10:00:52 +02:00
Tomasz Grabiec
72d4a26c43 Update seastar submodule
* seastar f8e4e93...ab1531e (1):
  > rpc: Fix crash during connection teardown
2016-10-26 09:49:41 +02:00
Tomasz Grabiec
b582525ad8 Merge seastar upstream
(This time for real)

* seastar 69acec1...f8e4e93 (1):
  > rpc: Do not close client connection on error response for a timed out request

Refs #1778
2016-10-25 13:53:01 +02:00
Tomasz Grabiec
5ca372e852 Merge seastar upstream
* seastar 69acec1...f8e4e93 (1):
  > rpc: Do not close client connection on error response for a timed out request

Refs #1778
2016-10-25 13:45:58 +02:00
71 changed files with 1598 additions and 808 deletions

.gitmodules vendored

@@ -1,6 +1,6 @@
[submodule "seastar"]
path = seastar
url = ../seastar
url = ../scylla-seastar
ignore = dirty
[submodule "swagger-ui"]
path = swagger-ui


@@ -1,6 +1,6 @@
#!/bin/sh
VERSION=666.development
VERSION=1.5.2
if test -f version
then


@@ -777,7 +777,7 @@
]
},
{
"path": "/storage_proxy/metrics/read/moving_avrage_histogram",
"path": "/storage_proxy/metrics/read/moving_average_histogram",
"operations": [
{
"method": "GET",
@@ -792,7 +792,7 @@
]
},
{
"path": "/storage_proxy/metrics/range/moving_avrage_histogram",
"path": "/storage_proxy/metrics/range/moving_average_histogram",
"operations": [
{
"method": "GET",
@@ -942,7 +942,7 @@
]
},
{
"path": "/storage_proxy/metrics/write/moving_avrage_histogram",
"path": "/storage_proxy/metrics/write/moving_average_histogram",
"operations": [
{
"method": "GET",


@@ -194,7 +194,7 @@ void set_cache_service(http_context& ctx, routes& r) {
});
cs::get_row_capacity.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, 0, [](const column_family& cf) {
return map_reduce_cf(ctx, uint64_t(0), [](const column_family& cf) {
return cf.get_row_cache().get_cache_tracker().region().occupancy().used_space();
}, std::plus<uint64_t>());
});


@@ -47,11 +47,8 @@
const sstring auth::data_resource::ROOT_NAME("data");
auth::data_resource::data_resource(level l, const sstring& ks, const sstring& cf)
: _ks(ks), _cf(cf)
: _level(l), _ks(ks), _cf(cf)
{
if (l != get_level()) {
throw std::invalid_argument("level/keyspace/column mismatch");
}
}
auth::data_resource::data_resource()
@@ -67,14 +64,7 @@ auth::data_resource::data_resource(const sstring& ks, const sstring& cf)
{}
auth::data_resource::level auth::data_resource::get_level() const {
if (!_cf.empty()) {
assert(!_ks.empty());
return level::COLUMN_FAMILY;
}
if (!_ks.empty()) {
return level::KEYSPACE;
}
return level::ROOT;
return _level;
}
auth::data_resource auth::data_resource::from_name(


@@ -56,6 +56,7 @@ private:
static const sstring ROOT_NAME;
level _level;
sstring _ks;
sstring _cf;


@@ -40,6 +40,7 @@
*/
#include <unordered_map>
#include <boost/algorithm/string.hpp>
#include "permission.hh"
const auth::permission_set auth::permissions::ALL_DATA =
@@ -75,7 +76,9 @@ const sstring& auth::permissions::to_string(permission p) {
}
auth::permission auth::permissions::from_string(const sstring& s) {
return permission_names.at(s);
sstring upper(s);
boost::to_upper(upper);
return permission_names.at(upper);
}
std::unordered_set<sstring> auth::permissions::to_strings(const permission_set& set) {


@@ -409,29 +409,6 @@ partitioner: org.apache.cassandra.dht.Murmur3Partitioner
# the smaller of 1/4 of heap or 512MB.
# file_cache_size_in_mb: 512
# Total permitted memory to use for memtables. Scylla will stop
# accepting writes when the limit is exceeded until a flush completes,
# and will trigger a flush based on memtable_cleanup_threshold
# If omitted, Scylla will set both to 1/4 the size of the heap.
# memtable_heap_space_in_mb: 2048
# memtable_offheap_space_in_mb: 2048
# Ratio of occupied non-flushing memtable size to total permitted size
# that will trigger a flush of the largest memtable. Lager mct will
# mean larger flushes and hence less compaction, but also less concurrent
# flush activity which can make it difficult to keep your disks fed
# under heavy write load.
#
# memtable_cleanup_threshold defaults to 1 / (memtable_flush_writers + 1)
# memtable_cleanup_threshold: 0.11
# Specify the way Scylla allocates and manages memtable memory.
# Options are:
# heap_buffers: on heap nio buffers
# offheap_buffers: off heap (direct) nio buffers
# offheap_objects: native memory, eliminating nio buffer heap overhead
# memtable_allocation_type: heap_buffers
# Total space to use for commitlogs.
#
# If space gets above this value (it will round up to the next nearest
@@ -443,17 +420,6 @@ partitioner: org.apache.cassandra.dht.Murmur3Partitioner
# available for Scylla.
commitlog_total_space_in_mb: -1
# This sets the amount of memtable flush writer threads. These will
# be blocked by disk io, and each one will hold a memtable in memory
# while blocked.
#
# memtable_flush_writers defaults to the smaller of (number of disks,
# number of cores), with a minimum of 2 and a maximum of 8.
#
# If your data directories are backed by SSD, you should increase this
# to the number of cores.
#memtable_flush_writers: 8
# A fixed memory pool size in MB for for SSTable index summaries. If left
# empty, this will default to 5% of the heap size. If the memory usage of
# all index summaries exceeds this limit, SSTables with low read rates will


@@ -221,6 +221,7 @@ scylla_tests = [
'tests/database_test',
'tests/nonwrapping_range_test',
'tests/input_stream_test',
'tests/sstable_atomic_deletion_test',
]
apps = [
@@ -307,6 +308,7 @@ scylla_core = (['database.cc',
'sstables/compaction.cc',
'sstables/compaction_strategy.cc',
'sstables/compaction_manager.cc',
'sstables/atomic_deletion.cc',
'transport/event.cc',
'transport/event_notifier.cc',
'transport/server.cc',

View File

@@ -232,7 +232,7 @@ uint32_t selection::add_column_for_ordering(const column_definition& c) {
raw_selector::to_selectables(raw_selectors, schema), db, schema, defs);
auto metadata = collect_metadata(schema, raw_selectors, *factories);
if (processes_selection(raw_selectors)) {
if (processes_selection(raw_selectors) || raw_selectors.size() != defs.size()) {
return ::make_shared<selection_with_processing>(schema, std::move(defs), std::move(metadata), std::move(factories));
} else {
return ::make_shared<simple_selection>(schema, std::move(defs), std::move(metadata), false);


@@ -91,34 +91,33 @@ public:
// Used for tests where the CF exists without a database object. We need to pass a valid
// dirty_memory manager in that case.
thread_local memtable_dirty_memory_manager default_dirty_memory_manager;
thread_local dirty_memory_manager default_dirty_memory_manager;
lw_shared_ptr<memtable_list>
column_family::make_memory_only_memtable_list() {
auto seal = [this] (memtable_list::flush_behavior ignored) { return make_ready_future<>(); };
auto get_schema = [this] { return schema(); };
return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.max_memtable_size, _config.dirty_memory_manager);
return make_lw_shared<memtable_list>(std::move(get_schema), _config.dirty_memory_manager);
}
lw_shared_ptr<memtable_list>
column_family::make_memtable_list() {
auto seal = [this] (memtable_list::flush_behavior behavior) { return seal_active_memtable(behavior); };
auto get_schema = [this] { return schema(); };
return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.max_memtable_size, _config.dirty_memory_manager);
return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.dirty_memory_manager);
}
lw_shared_ptr<memtable_list>
column_family::make_streaming_memtable_list() {
auto seal = [this] (memtable_list::flush_behavior behavior) { return seal_active_streaming_memtable(behavior); };
auto get_schema = [this] { return schema(); };
return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.max_streaming_memtable_size, _config.streaming_dirty_memory_manager);
return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.streaming_dirty_memory_manager);
}
lw_shared_ptr<memtable_list>
column_family::make_streaming_memtable_big_list(streaming_memtable_big& smb) {
auto seal = [this, &smb] (memtable_list::flush_behavior) { return seal_active_streaming_memtable_big(smb); };
auto get_schema = [this] { return schema(); };
return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.max_streaming_memtable_size, _config.streaming_dirty_memory_manager);
return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.streaming_dirty_memory_manager);
}
column_family::column_family(schema_ptr schema, config config, db::commitlog* cl, compaction_manager& compaction_manager)
@@ -823,6 +822,12 @@ future<> column_family::load_sstable(sstables::sstable&& sstab, bool reset_level
// several shards, but we can't start any compaction before all the sstables
// of this CF were loaded. So call this function to start rewrites, if any.
void column_family::start_rewrite() {
// submit shared sstables in generation order to guarantee that all shards
// owning a sstable will agree on its deletion nearly the same time,
// therefore, reducing disk space requirements.
boost::sort(_sstables_need_rewrite, [] (const sstables::shared_sstable& x, const sstables::shared_sstable& y) {
return x->generation() < y->generation();
});
for (auto sst : _sstables_need_rewrite) {
dblog.info("Splitting {} for shard", sst->get_filename());
_compaction_manager.submit_sstable_rewrite(this, sst);
@@ -912,10 +917,6 @@ column_family::seal_active_streaming_memtable_delayed() {
return make_ready_future<>();
}
if (_streaming_memtables->should_flush()) {
return seal_active_streaming_memtable_immediate();
}
if (!_delayed_streaming_flush.armed()) {
// We don't want to wait for too long, because the incoming mutations will not be available
// until we flush them to SSTables. On top of that, if the sender ran out of messages, it won't
@@ -946,8 +947,7 @@ column_family::seal_active_streaming_memtable_immediate() {
auto current_waiters = std::exchange(_waiting_streaming_flushes, shared_promise<>());
auto f = current_waiters.get_shared_future(); // for this seal
_config.streaming_dirty_memory_manager->serialize_flush([this, old] {
return with_lock(_sstables_lock.for_read(), [this, old] {
with_lock(_sstables_lock.for_read(), [this, old] {
auto newtab = make_lw_shared<sstables::sstable>(_schema,
_config.datadir, calculate_generation_for_new_table(),
sstables::sstable::version_types::ka,
@@ -980,7 +980,6 @@ column_family::seal_active_streaming_memtable_immediate() {
});
// We will also not have any retry logic. If we fail here, we'll fail the streaming and let
// the upper layers know. They can then apply any logic they want here.
});
}).then_wrapped([this, current_waiters = std::move(current_waiters)] (future <> f) mutable {
if (f.failed()) {
current_waiters.set_exception(f.get_exception());
@@ -1044,12 +1043,10 @@ column_family::seal_active_memtable(memtable_list::flush_behavior ignored) {
_config.cf_stats->pending_memtables_flushes_count++;
_config.cf_stats->pending_memtables_flushes_bytes += memtable_size;
return _config.dirty_memory_manager->serialize_flush([this, old] {
return repeat([this, old] {
return with_lock(_sstables_lock.for_read(), [this, old] {
_flush_queue->check_open_gate();
return try_flush_memtable_to_sstable(old);
});
}).then([this, memtable_size] {
_config.cf_stats->pending_memtables_flushes_count--;
@@ -1091,6 +1088,24 @@ column_family::try_flush_memtable_to_sstable(lw_shared_ptr<memtable> old) {
return newtab->open_data();
}).then_wrapped([this, old, newtab] (future<> ret) {
dblog.debug("Flushing to {} done", newtab->get_filename());
// Could pass the iterator to the seal functions, and avoid the need to search the
// unordered_map here. But this is supposed to be cheap and it is a lot less clutter in the
// method signatures. Also makes it optional and streaming memtables don't have to do it.
// Note that the number of entries in this hash is limited by the background flushes
// semaphore, so it'll always be small.
//
// In terms of releasing dirty memory, this is almost as far as we should go. We could do
// this right before updating the cache, but from this point on to update_cache we have no
// deferring points, so that's fine. We do it in here because if we fail this write it will
// try the write again and that will create a new flush reader that will decrease dirty
// memory again. So we need to get rid of the charges here anyway for correctness.
//
// After the cache update starts, the region is transferred over. We assume there is no
// deferring point between this and update_cache transferring ownership. It's not that bad
// if there is one, so we don't really protect against it; but without a deferring point we
// can guarantee that no request will see a spike in dirty memory between the release of our
// memory and the execution of a request.
dirty_memory_manager::from_region_group(old->region_group()).remove_from_flush_manager(&(old->region()));
try {
ret.get();
@@ -1131,15 +1146,15 @@ column_family::start() {
future<>
column_family::stop() {
_memtables->seal_active_memtable(memtable_list::flush_behavior::immediate);
_streaming_memtables->seal_active_memtable(memtable_list::flush_behavior::immediate);
return _compaction_manager.remove(this).then([this] {
// Nest, instead of using when_all, so we don't lose any exceptions.
return _flush_queue->close().then([this] {
return _streaming_flush_gate.close();
return when_all(_memtables->request_flush(), _streaming_memtables->request_flush()).discard_result().finally([this] {
return _compaction_manager.remove(this).then([this] {
// Nest, instead of using when_all, so we don't lose any exceptions.
return _flush_queue->close().then([this] {
return _streaming_flush_gate.close();
});
}).then([this] {
return _sstable_deletion_gate.close();
});
}).then([this] {
return _sstable_deletion_gate.close();
});
}
@@ -1304,7 +1319,17 @@ column_family::rebuild_sstable_list(const std::vector<sstables::shared_sstable>&
// Second, delete the old sstables. This is done in the background, so we can
// consider this compaction completed.
seastar::with_gate(_sstable_deletion_gate, [this, sstables_to_remove] {
return sstables::delete_atomically(sstables_to_remove).then([this, sstables_to_remove] {
return sstables::delete_atomically(sstables_to_remove).then_wrapped([this, sstables_to_remove] (future<> f) {
std::exception_ptr eptr;
try {
f.get();
} catch(...) {
eptr = std::current_exception();
}
// unconditionally remove compacted sstables from _sstables_compacted_but_not_deleted,
// or they could stay forever in the set, resulting in deleted files remaining
// opened and disk space not being released until shutdown.
std::unordered_set<sstables::shared_sstable> s(
sstables_to_remove.begin(), sstables_to_remove.end());
auto e = boost::range::remove_if(_sstables_compacted_but_not_deleted, [&] (sstables::shared_sstable sst) -> bool {
@@ -1312,6 +1337,11 @@ column_family::rebuild_sstable_list(const std::vector<sstables::shared_sstable>&
});
_sstables_compacted_but_not_deleted.erase(e, _sstables_compacted_but_not_deleted.end());
rebuild_statistics();
if (eptr) {
return make_exception_future<>(eptr);
}
return make_ready_future<>();
}).handle_exception([] (std::exception_ptr e) {
try {
std::rethrow_exception(e);
@@ -1626,41 +1656,12 @@ database::database() : database(db::config())
{}
database::database(const db::config& cfg)
: _cfg(std::make_unique<db::config>(cfg))
, _memtable_total_space([this] {
_stats = make_lw_shared<db_stats>();
auto memtable_total_space = size_t(_cfg->memtable_total_space_in_mb()) << 20;
if (!memtable_total_space) {
return memory::stats().total_memory() / 2;
}
return memtable_total_space;
}())
, _streaming_memtable_total_space(_memtable_total_space / 4)
// Allow system tables a pool of 10 MB extra memory to write over the threshold. Under normal
// circumstances it won't matter, but when we throttle, some system requests will be able to
// keep being serviced even if user requests are not.
//
// Note that even if we didn't allow extra memory, we would still want to keep system requests
// in a different region group. This is because throttled requests are serviced in FIFO order,
// and we don't want system requests to be waiting for a long time behind user requests.
, _system_dirty_memory_manager(*this, _memtable_total_space + (10 << 20))
// The total space that can be used by memtables is _memtable_total_space, but we will only
// allow the region_group to grow to half of that. This is because of virtual_dirty: memtables
// can take a long time to flush, and if we are using the maximum amount of memory possible,
// then requests will block until we finish flushing at least one memtable.
//
// We can free memory until the whole memtable is flushed because we need to keep it in memory
// until the end, but we can fake freeing memory. When we are done with an element of the
// memtable, we will update the region group pretending memory just went down by that amount.
//
// Because the amount of memory that we pretend to free should be close enough to the actual
// memory used by the memtables, that effectively creates two sub-regions inside the dirty
// region group, of equal size. In the worst case, we will have _memtable_total_space dirty
// bytes used, and half of that already virtually freed.
, _dirty_memory_manager(*this, &_system_dirty_memory_manager, _memtable_total_space / 2)
// The same goes for streaming in respect to virtual dirty.
, _streaming_dirty_memory_manager(*this, &_dirty_memory_manager, _streaming_memtable_total_space / 2)
: _stats(make_lw_shared<db_stats>())
, _cfg(std::make_unique<db::config>(cfg))
// Allow system tables a pool of 10 MB memory to write, but never block on other regions.
, _system_dirty_memory_manager(*this, 10 << 20)
, _dirty_memory_manager(*this, memory::stats().total_memory() * 0.45)
, _streaming_dirty_memory_manager(*this, memory::stats().total_memory() * 0.10)
, _version(empty_version)
, _enable_incremental_backups(cfg.incremental_backups())
{
@@ -1670,14 +1671,40 @@ database::database(const db::config& cfg)
dblog.info("Row: max_vector_size: {}, internal_count: {}", size_t(row::max_vector_size), size_t(row::internal_count));
}
void
dirty_memory_manager::setup_collectd(sstring namestr) {
_collectd.push_back(
scollectd::add_polled_metric(scollectd::type_instance_id("memory"
, scollectd::per_cpu_plugin_instance
, "bytes", namestr + "_dirty")
, scollectd::make_typed(scollectd::data_type::GAUGE, [this] {
return real_dirty_memory();
})));
_collectd.push_back(
scollectd::add_polled_metric(scollectd::type_instance_id("memory"
, scollectd::per_cpu_plugin_instance
, "bytes", namestr +"_virtual_dirty")
, scollectd::make_typed(scollectd::data_type::GAUGE, [this] {
return virtual_dirty_memory();
})));
}
void
database::setup_collectd() {
_dirty_memory_manager.setup_collectd("regular");
_system_dirty_memory_manager.setup_collectd("system");
_streaming_dirty_memory_manager.setup_collectd("streaming");
_collectd.push_back(
scollectd::add_polled_metric(scollectd::type_instance_id("memory"
, scollectd::per_cpu_plugin_instance
, "bytes", "dirty")
, scollectd::make_typed(scollectd::data_type::GAUGE, [this] {
return _dirty_memory_manager.real_dirty_memory();
return _dirty_memory_manager.real_dirty_memory() +
_system_dirty_memory_manager.real_dirty_memory() +
_streaming_dirty_memory_manager.real_dirty_memory();
})));
_collectd.push_back(
@@ -1685,7 +1712,9 @@ database::setup_collectd() {
, scollectd::per_cpu_plugin_instance
, "bytes", "virtual_dirty")
, scollectd::make_typed(scollectd::data_type::GAUGE, [this] {
return _dirty_memory_manager.virtual_dirty_memory();
return _dirty_memory_manager.virtual_dirty_memory() +
_system_dirty_memory_manager.virtual_dirty_memory() +
_streaming_dirty_memory_manager.virtual_dirty_memory();
})));
_collectd.push_back(
@@ -2167,8 +2196,6 @@ keyspace::make_column_family_config(const schema& s, const db::config& db_config
cfg.enable_disk_writes = _config.enable_disk_writes;
cfg.enable_commitlog = _config.enable_commitlog;
cfg.enable_cache = _config.enable_cache;
cfg.max_memtable_size = _config.max_memtable_size;
cfg.max_streaming_memtable_size = _config.max_streaming_memtable_size;
cfg.dirty_memory_manager = _config.dirty_memory_manager;
cfg.streaming_dirty_memory_manager = _config.streaming_dirty_memory_manager;
cfg.read_concurrency_config = _config.read_concurrency_config;
@@ -2468,7 +2495,6 @@ column_family::apply(const mutation& m, const db::replay_position& rp) {
utils::latency_counter lc;
_stats.writes.set_latency(lc);
_memtables->active_memtable().apply(m, rp);
_memtables->seal_on_overflow();
_stats.writes.mark(lc);
if (lc.is_start()) {
_stats.estimated_write.add(lc.latency(), _stats.writes.hist.count);
@@ -2481,7 +2507,6 @@ column_family::apply(const frozen_mutation& m, const schema_ptr& m_schema, const
_stats.writes.set_latency(lc);
check_valid_rp(rp);
_memtables->active_memtable().apply(m, m_schema, rp);
_memtables->seal_on_overflow();
_stats.writes.mark(lc);
if (lc.is_start()) {
_stats.estimated_write.add(lc.latency(), _stats.writes.hist.count);
@@ -2494,7 +2519,6 @@ void column_family::apply_streaming_mutation(schema_ptr m_schema, utils::UUID pl
return;
}
_streaming_memtables->active_memtable().apply(m, m_schema);
_streaming_memtables->seal_on_overflow();
}
void column_family::apply_streaming_big_mutation(schema_ptr m_schema, utils::UUID plan_id, const frozen_mutation& m) {
@@ -2505,7 +2529,6 @@ void column_family::apply_streaming_big_mutation(schema_ptr m_schema, utils::UUI
}
auto entry = it->second;
entry->memtables->active_memtable().apply(m, m_schema);
entry->memtables->seal_on_overflow();
}
void
@@ -2517,51 +2540,107 @@ column_family::check_valid_rp(const db::replay_position& rp) const {
future<> dirty_memory_manager::shutdown() {
_db_shutdown_requested = true;
return _waiting_flush_gate.close().then([this] {
_should_flush.signal();
return std::move(_waiting_flush).then([this] {
return _region_group.shutdown();
});
}
void dirty_memory_manager::maybe_do_active_flush() {
if (!_db || !under_pressure() || _db_shutdown_requested) {
return;
future<> memtable_list::request_flush() {
if (!may_flush()) {
return make_ready_future<>();
} else if (!_flush_coalescing) {
_flush_coalescing = shared_promise<>();
return _dirty_memory_manager->get_flush_permit().then([this] (auto permit) {
auto current_flush = std::move(*_flush_coalescing);
_flush_coalescing = {};
return _dirty_memory_manager->flush_one(*this, std::move(permit)).then_wrapped([this, current_flush = std::move(current_flush)] (auto f) mutable {
if (f.failed()) {
current_flush.set_exception(f.get_exception());
} else {
current_flush.set_value();
}
});
});
} else {
return _flush_coalescing->get_shared_future();
}
}
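The coalescing in `request_flush()` can be reduced to a small synchronous sketch: while one flush is pending, later requesters attach to the same completion instead of starting another. `flush_coalescer` and its members are illustrative names, not the real seastar-based implementation:

```cpp
#include <functional>
#include <optional>
#include <utility>
#include <vector>

// While a flush is pending, every request() joins it; only the first
// request of a round actually starts a flush.
class flush_coalescer {
    // Waiters of the currently coalesced flush; empty optional = no flush pending.
    std::optional<std::vector<std::function<void()>>> pending_;
    int flushes_started_ = 0;
public:
    // Ask for a flush; `done` runs when the coalesced flush completes.
    void request(std::function<void()> done) {
        if (!pending_) {            // no flush in flight: start a new round
            pending_.emplace();
            ++flushes_started_;
        }
        pending_->push_back(std::move(done));
    }
    // Called by the flush machinery when the flush finishes.
    void complete() {
        auto waiters = std::move(*pending_);
        pending_.reset();           // the next request() starts a fresh round
        for (auto& w : waiters) {
            w();
        }
    }
    int flushes_started() const { return flushes_started_; }
};
```

The real code plays the same trick with a `shared_promise<>`: the first caller installs it and kicks off the flush, later callers just take `get_shared_future()`.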
future<> dirty_memory_manager::flush_one(memtable_list& mtlist, semaphore_units<> permit) {
if (mtlist.back()->empty()) {
return make_ready_future<>();
}
// Flush already ongoing. We don't need to initiate an active flush at this moment.
if (_flush_serializer.current() == 0) {
return;
auto* region = &(mtlist.back()->region());
auto schema = mtlist.back()->schema();
add_to_flush_manager(region, std::move(permit));
return get_units(_background_work_flush_serializer, 1).then([this, &mtlist, region, schema] (auto permit) mutable {
return mtlist.seal_active_memtable(memtable_list::flush_behavior::immediate).then_wrapped([this, region, schema, permit = std::move(permit)] (auto f) {
// There are two cases in which we may still need to remove the permits from here.
//
// 1) Some exception happened, and we can't know at which point. It could be that because
// of that, the permits are still dangling. We have to remove it.
// 2) We are using a memory-only column family, which will never create a memtable
// flush object, so we would never get rid of the permits. We have to remove them
// here.
this->remove_from_flush_manager(region);
if (f.failed()) {
dblog.error("Failed to flush memtable, {}:{}", schema->ks_name(), schema->cf_name());
}
return std::move(f);
});
});
}
future<> dirty_memory_manager::flush_when_needed() {
if (!_db) {
return make_ready_future<>();
}
// There are many criteria that can be used to select what is the best memtable to
// flush. Most of the time we want some coordination with the commitlog to allow us to
// release commitlog segments as early as we can.
//
// But during pressure condition, we'll just pick the CF that holds the largest
// memtable. The advantage of doing this is that this is objectively the one that will
// release the biggest amount of memory and is less likely to be generating tiny
// SSTables. The disadvantage is that right now, because we only release memory when the
// SSTable is fully written, that may take a bit of time to happen.
//
// However, since we'll very soon have a mechanism in place to account for the memory
// that was already written in one form or another, that disadvantage is mitigated.
memtable& biggest_memtable = memtable::from_region(*_region_group.get_largest_region());
auto& biggest_cf = _db->find_column_family(biggest_memtable.schema());
memtable_list& mtlist = get_memtable_list(biggest_cf);
// Please note that this will eventually take the semaphore and prevent two concurrent flushes.
// We don't need any other extra protection.
mtlist.seal_active_memtable(memtable_list::flush_behavior::immediate);
// If there are explicit flushes requested, we must wait for them to finish before we stop.
return do_until([this] { return _db_shutdown_requested && !_flush_serializer.waiters(); }, [this] {
auto has_work = [this] { return _flush_serializer.waiters() || over_soft_limit() || _db_shutdown_requested; };
return _should_flush.wait(std::move(has_work)).then([this] {
return get_flush_permit().then([this] (auto permit) {
// We give priority to explicit flushes. They are mainly user-initiated flushes,
// flushes coming from a DROP statement, or commitlog flushes.
if (_flush_serializer.waiters()) {
return make_ready_future<>();
}
// condition abated while we waited for the semaphore
if (!this->over_soft_limit() || _db_shutdown_requested) {
return make_ready_future<>();
}
// There are many criteria that can be used to select what is the best memtable to
// flush. Most of the time we want some coordination with the commitlog to allow us to
// release commitlog segments as early as we can.
//
// But during pressure condition, we'll just pick the CF that holds the largest
// memtable. The advantage of doing this is that this is objectively the one that will
// release the biggest amount of memory and is less likely to be generating tiny
// SSTables.
memtable& candidate_memtable = memtable::from_region(*(this->_region_group.get_largest_region()));
dirty_memory_manager* candidate_dirty_manager = &(dirty_memory_manager::from_region_group(candidate_memtable.region_group()));
// Do not wait. The semaphore will protect us against a concurrent flush. But we
// want to start a new one as soon as the permits are destroyed and the semaphore is
// made ready again, not when we are done with the current one.
candidate_dirty_manager->flush_one(*(candidate_memtable.get_memtable_list()), std::move(permit));
return make_ready_future<>();
});
});
}).finally([this] {
// We'll try to acquire the permit here to make sure we only really stop when there are no
// in-flight flushes. Our stop condition checks for the presence of waiters, but it could be
// that we have no waiters while a flush is still in flight. We wait for all background work to
// stop. When that stops, we know that the foreground work in the _flush_serializer has
// stopped as well.
return get_units(_background_work_flush_serializer, _max_background_work);
});
}
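Under pressure the loop above flushes the column family owning the largest memtable, since that objectively releases the most memory per flush. A minimal stand-in for `region_group::get_largest_region()` (the `region` struct here is hypothetical):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Pick the region with the highest occupancy; its memtable is the
// flush candidate under memory pressure.
struct region { std::size_t occupancy; };

const region* largest_region(const std::vector<region>& rs) {
    return &*std::max_element(rs.begin(), rs.end(),
        [] (const region& a, const region& b) {
            return a.occupancy < b.occupancy;
        });
}
```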
memtable_list& memtable_dirty_memory_manager::get_memtable_list(column_family& cf) {
return *(cf._memtables);
}
memtable_list& streaming_dirty_memory_manager::get_memtable_list(column_family& cf) {
return *(cf._streaming_memtables);
}
void dirty_memory_manager::start_reclaiming() {
maybe_do_active_flush();
}
void dirty_memory_manager::start_reclaiming() noexcept {
_should_flush.signal();
}
future<> database::apply_in_memory(const frozen_mutation& m, schema_ptr m_schema, db::replay_position rp) {
@@ -2637,10 +2716,6 @@ database::make_keyspace_config(const keyspace_metadata& ksm) {
cfg.enable_disk_reads = true; // we always read from disk
cfg.enable_commitlog = ksm.durable_writes() && _cfg->enable_commitlog() && !_cfg->enable_in_memory_data_store();
cfg.enable_cache = _cfg->enable_cache();
cfg.max_memtable_size = _memtable_total_space * _cfg->memtable_cleanup_threshold();
// We should guarantee that at least two memtables are available; otherwise, after a flush, adding another memtable would
// easily take us into throttling until the first one is flushed.
cfg.max_streaming_memtable_size = std::min(cfg.max_memtable_size, _streaming_memtable_total_space / 2);
} else {
cfg.datadir = "";
@@ -2648,9 +2723,6 @@ database::make_keyspace_config(const keyspace_metadata& ksm) {
cfg.enable_disk_reads = false;
cfg.enable_commitlog = false;
cfg.enable_cache = false;
cfg.max_memtable_size = std::numeric_limits<size_t>::max();
// All writes should go to the main memtable list if we're not durable
cfg.max_streaming_memtable_size = 0;
}
cfg.dirty_memory_manager = &_dirty_memory_manager;
cfg.streaming_dirty_memory_manager = &_streaming_dirty_memory_manager;
@@ -3097,21 +3169,17 @@ future<std::unordered_map<sstring, column_family::snapshot_details>> column_fami
future<> column_family::flush() {
_stats.pending_flushes++;
auto fut = _memtables->seal_active_memtable(memtable_list::flush_behavior::immediate);
// this rp is either:
// a.) Done - no-op
// b.) Ours
// c.) The last active flush not finished. If our latest memtable is
// empty it still makes sense for this api call to wait for this.
auto high_rp = _highest_flushed_rp;
return fut.finally([this, high_rp] {
// highest_flushed_rp is only updated when we flush. If the memtable is currently alive, then
// the most up-to-date replay position is the one that's in there now. Otherwise, if the memtable
// hasn't received any writes yet, that's the one from the last flush we made.
auto desired_rp = _memtables->back()->empty() ? _highest_flushed_rp : _memtables->back()->replay_position();
return _memtables->request_flush().finally([this, desired_rp] {
_stats.pending_flushes--;
// In Origin, memtable_switch_count is incremented inside
// ColumnFamilyMetrics' Flush.run
_stats.memtable_switch_count++;
// wait for all up until us.
return _flush_queue->wait_for_pending(high_rp);
return _flush_queue->wait_for_pending(desired_rp);
});
}
@@ -3128,7 +3196,7 @@ future<> column_family::flush(const db::replay_position& pos) {
// We ignore this for now and just say that if we're asked for
// a CF and it exists, we pretty much have to have data that needs
// flushing. Let's do it.
return _memtables->seal_active_memtable(memtable_list::flush_behavior::immediate);
return _memtables->request_flush();
}
// FIXME: We can do much better than this in terms of cache management. Right
@@ -3169,7 +3237,7 @@ future<> column_family::flush_streaming_big_mutations(utils::UUID plan_id) {
}
auto entry = it->second;
_streaming_memtables_big.erase(it);
return entry->memtables->seal_active_memtable(memtable_list::flush_behavior::immediate).then([entry] {
return entry->memtables->request_flush().then([entry] {
return entry->flush_in_progress.close();
}).then([this, entry] {
return parallel_for_each(entry->sstables, [this] (auto& sst) {


@@ -119,28 +119,93 @@ class dirty_memory_manager: public logalloc::region_group_reclaimer {
// throttled for a long time. Even when we have virtual dirty, that only provides a rough
// estimate, and we can't release requests that early.
semaphore _flush_serializer;
// We will accept a new flush before another one ends, once it is done with the data write.
// That is so we can keep the disk always busy. But there is still some background work that is
// left to be done. Mostly, update the caches and seal the auxiliary components of the SSTable.
// This semaphore will cap the amount of background work that we have. Note that we're not
// overly concerned about memtable memory, because dirty memory will put a limit on that. This
// is mostly about dangling continuations, so it doesn't have to be a small number.
static constexpr unsigned _max_background_work = 20;
semaphore _background_work_flush_serializer = { _max_background_work };
condition_variable _should_flush;
int64_t _dirty_bytes_released_pre_accounted = 0;
seastar::gate _waiting_flush_gate;
std::vector<shared_memtable> _pending_flushes;
void maybe_do_active_flush();
protected:
virtual memtable_list& get_memtable_list(column_family& cf) = 0;
virtual void start_reclaiming() override;
future<> flush_when_needed();
// We need to start a flush before the current one finishes, otherwise
// we'll have a period without significant disk activity when the current
// SSTable is being sealed, the caches are being updated, etc. To do that
// we need to keep track of whom we are flushing this memory from.
struct flush_token {
dirty_memory_manager* _dirty_memory_manager;
size_t _freed_memory = 0;
semaphore_units<> _sem;
public:
flush_token(dirty_memory_manager *dm, semaphore_units<>&& s) : _dirty_memory_manager(dm), _sem(std::move(s)) {}
void mark_end_flush(size_t freed) {
auto destroy = std::move(_sem);
_freed_memory = freed;
}
~flush_token() {
_dirty_memory_manager->_region_group.update(_freed_memory);
_dirty_memory_manager->_dirty_bytes_released_pre_accounted -= _freed_memory;
}
};
friend class flush_token;
std::unordered_map<const logalloc::region*, flush_token> _flush_manager;
future<> _waiting_flush;
virtual void start_reclaiming() noexcept override;
std::vector<scollectd::registration> _collectd;
public:
void setup_collectd(sstring namestr);
future<> shutdown();
dirty_memory_manager(database* db, size_t threshold)
: logalloc::region_group_reclaimer(threshold)
, _db(db)
, _region_group(*this)
, _flush_serializer(1) {}
// Limits and pressure conditions:
// ===============================
//
// Virtual Dirty
// -------------
// We can't free memory until the whole memtable is flushed because we need to keep it in memory
// until the end, but we can fake freeing memory. When we are done with an element of the
// memtable, we will update the region group pretending memory just went down by that amount.
//
// Because the amount of memory that we pretend to free should be close enough to the actual
// memory used by the memtables, that effectively creates two sub-regions inside the dirty
// region group, of equal size. In the worst case, we will have <memtable_total_space> dirty
// bytes used, and half of that already virtually freed.
//
// Hard Limit
// ----------
// The total space that can be used by memtables in each group is defined by the threshold, but
// we will only allow the region_group to grow to half of that. This is because of virtual_dirty
// as explained above. Because virtual dirty is implemented by reducing the usage in the
// region_group directly on partition written, we want to throttle every time half of the memory
// as seen by the region_group. To achieve that we need to set the hard limit (first parameter
// of the region_group_reclaimer) to 1/2 of the user-supplied threshold
//
// Soft Limit
// ----------
// When the soft limit is hit, no throttle happens. The soft limit exists because we don't want
// to start flushing only when the limit is hit, but a bit earlier instead. If we were to start
// flushing only when the hard limit is hit, workloads in which the disk is fast enough to cope
// would see latency added to some requests unnecessarily.
//
// We then set the soft limit to 80 % of the virtual dirty hard limit, which is equal to 40 % of
// the user-supplied threshold.
dirty_memory_manager(database& db, size_t threshold)
: logalloc::region_group_reclaimer(threshold / 2, threshold * 0.40)
, _db(&db)
, _region_group(*this)
, _flush_serializer(1)
, _waiting_flush(flush_when_needed()) {}
dirty_memory_manager(database* db, dirty_memory_manager *parent, size_t threshold)
: logalloc::region_group_reclaimer(threshold)
, _db(db)
, _region_group(&parent->_region_group, *this)
, _flush_serializer(1) {}
dirty_memory_manager() : logalloc::region_group_reclaimer()
, _db(nullptr)
, _region_group(*this)
, _flush_serializer(1)
, _waiting_flush(make_ready_future<>()) {}
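The limit arithmetic from the comment above can be checked in isolation: the hard limit handed to the reclaimer is half the user-supplied threshold, and the soft limit is 40% of the threshold, i.e. 80% of the hard limit. `dirty_limits` is an illustrative helper, not part of the codebase:

```cpp
#include <cstddef>

// Hard limit: half the threshold, because virtual dirty effectively splits
// the dirty region group into two equal halves. Soft limit: 80% of the hard
// limit, so flushing starts before throttling would.
struct dirty_limits {
    std::size_t hard;
    std::size_t soft;
    explicit dirty_limits(std::size_t threshold)
        : hard(threshold / 2)
        , soft(static_cast<std::size_t>(threshold * 0.40)) {}
};
```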
static dirty_memory_manager& from_region_group(logalloc::region_group *rg) {
return *(boost::intrusive::get_parent_from_member(rg, &dirty_memory_manager::_region_group));
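`get_parent_from_member` relies on the classic intrusive back-pointer trick: subtract the member's offset from the member's address to recover the enclosing object, so the region_group needs no explicit pointer back to its manager. A hand-rolled sketch with hypothetical stub types:

```cpp
#include <cstddef>

// Stand-ins for logalloc::region_group and dirty_memory_manager.
struct region_group_stub {};
struct manager_stub {
    int id;
    region_group_stub rg; // embedded member, like _region_group
};

// Recover the enclosing manager from a pointer to its embedded member.
manager_stub* manager_from_region_group(region_group_stub* rg) {
    return reinterpret_cast<manager_stub*>(
        reinterpret_cast<char*>(rg) - offsetof(manager_stub, rg));
}
```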
@@ -154,16 +219,44 @@ public:
return _region_group;
}
void revert_potentially_cleaned_up_memory(int64_t delta) {
_region_group.update(delta);
_dirty_bytes_released_pre_accounted -= delta;
void revert_potentially_cleaned_up_memory(logalloc::region* from, int64_t delta) {
// Flushed the current memtable. There is still some work to do, like finish sealing the
// SSTable and updating the cache, but we can already allow the next one to start.
//
// By erasing this memtable from the flush_manager we'll destroy the semaphore_units
// associated with this flush and will allow another one to start. We'll signal the
// condition variable to let them know we might be ready early.
auto it = _flush_manager.find(from);
if (it != _flush_manager.end()) {
it->second.mark_end_flush(delta);
}
}
void account_potentially_cleaned_up_memory(int64_t delta) {
void account_potentially_cleaned_up_memory(logalloc::region* from, int64_t delta) {
_region_group.update(-delta);
_dirty_bytes_released_pre_accounted += delta;
}
// This can be called multiple times during the lifetime of the region, and should always
// ultimately be called after the flush ends. However, some flushers may decide to call it
// earlier. For instance, the normal memtables sealing function will call this before updating
// the cache.
//
// Also, for sealing methods like the normal memtable sealing method - which may retry after a
// failed write - calling this method after the attempt completes with success or failure is
// mandatory. That's because the new attempt will create a new flush reader for the same
// SSTable, so we need to make sure that we revert the old charges.
void remove_from_flush_manager(const logalloc::region *region) {
auto it = _flush_manager.find(region);
if (it != _flush_manager.end()) {
_flush_manager.erase(it);
}
}
void add_to_flush_manager(const logalloc::region *region, semaphore_units<>&& permit) {
_flush_manager.emplace(std::piecewise_construct, std::make_tuple(region), std::make_tuple(this, std::move(permit)));
}
size_t real_dirty_memory() const {
return _region_group.memory_used() + _dirty_bytes_released_pre_accounted;
}
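The interplay of `account_potentially_cleaned_up_memory`, `revert_potentially_cleaned_up_memory`, `real_dirty_memory`, and `virtual_dirty_memory` boils down to two counters. A simplified model (names are illustrative; the real code updates a `region_group`):

```cpp
#include <cstdint>

// Virtual dirty drops as soon as bytes are pre-accounted; real dirty
// keeps charging them until the flush is truly over.
class dirty_accounting {
    int64_t region_used_ = 0;            // bytes the region_group currently charges
    int64_t released_pre_accounted_ = 0; // bytes virtually freed before the flush ends
public:
    void allocate(int64_t bytes) { region_used_ += bytes; }
    // account_potentially_cleaned_up_memory(): virtual dirty drops right away.
    void account(int64_t bytes) {
        region_used_ -= bytes;
        released_pre_accounted_ += bytes;
    }
    // revert_potentially_cleaned_up_memory(): undo the pre-accounting, e.g.
    // before a failed flush is retried with a new flush reader.
    void revert(int64_t bytes) {
        region_used_ += bytes;
        released_pre_accounted_ -= bytes;
    }
    int64_t virtual_dirty() const { return region_used_; }
    int64_t real_dirty() const { return region_used_ + released_pre_accounted_; }
};
```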
@@ -172,33 +265,14 @@ public:
return _region_group.memory_used();
}
template <typename Func>
future<> serialize_flush(Func&& func) {
return seastar::with_gate(_waiting_flush_gate, [this, func] () mutable {
return with_semaphore(_flush_serializer, 1, func).finally([this] {
maybe_do_active_flush();
});
});
future<> flush_one(memtable_list& cf, semaphore_units<> permit);
future<semaphore_units<>> get_flush_permit() {
return get_units(_flush_serializer, 1);
}
};
class streaming_dirty_memory_manager: public dirty_memory_manager {
virtual memtable_list& get_memtable_list(column_family& cf) override;
public:
streaming_dirty_memory_manager(database& db, dirty_memory_manager *parent, size_t threshold) : dirty_memory_manager(&db, parent, threshold) {}
};
class memtable_dirty_memory_manager: public dirty_memory_manager {
virtual memtable_list& get_memtable_list(column_family& cf) override;
public:
memtable_dirty_memory_manager(database& db, dirty_memory_manager* parent, size_t threshold) : dirty_memory_manager(&db, parent, threshold) {}
// This constructor will be called for the system tables (no parent). Its flushes are usually driven by us
// and not the user, and tend to be small in size. So we'll allow only two slots.
memtable_dirty_memory_manager(database& db, size_t threshold) : dirty_memory_manager(&db, threshold) {}
memtable_dirty_memory_manager() : dirty_memory_manager(nullptr, std::numeric_limits<size_t>::max()) {}
};
extern thread_local memtable_dirty_memory_manager default_dirty_memory_manager;
extern thread_local dirty_memory_manager default_dirty_memory_manager;
// We could just add all memtables, regardless of types, to a single list, and
// then filter them out when we read them. Here's why I have chosen not to do
@@ -225,18 +299,29 @@ private:
std::vector<shared_memtable> _memtables;
std::function<future<> (flush_behavior)> _seal_fn;
std::function<schema_ptr()> _current_schema;
size_t _max_memtable_size;
dirty_memory_manager* _dirty_memory_manager;
std::experimental::optional<shared_promise<>> _flush_coalescing;
public:
memtable_list(std::function<future<> (flush_behavior)> seal_fn, std::function<schema_ptr()> cs, size_t max_memtable_size, dirty_memory_manager* dirty_memory_manager)
memtable_list(std::function<future<> (flush_behavior)> seal_fn, std::function<schema_ptr()> cs, dirty_memory_manager* dirty_memory_manager)
: _memtables({})
, _seal_fn(seal_fn)
, _current_schema(cs)
, _max_memtable_size(max_memtable_size)
, _dirty_memory_manager(dirty_memory_manager) {
add_memtable();
}
memtable_list(std::function<schema_ptr()> cs, dirty_memory_manager* dirty_memory_manager)
: _memtables({})
, _seal_fn()
, _current_schema(cs)
, _dirty_memory_manager(dirty_memory_manager) {
add_memtable();
}
bool may_flush() const {
return bool(_seal_fn);
}
shared_memtable back() {
return _memtables.back();
}
@@ -281,20 +366,17 @@ public:
_memtables.emplace_back(new_memtable());
}
bool should_flush() {
return active_memtable().occupancy().total_space() >= _max_memtable_size;
}
void seal_on_overflow() {
if (should_flush()) {
// FIXME: if sparse, do some in-memory compaction first
// FIXME: maybe merge with other in-memory memtables
seal_active_memtable(flush_behavior::immediate);
}
logalloc::region_group& region_group() {
return _dirty_memory_manager->region_group();
}
// This is used for explicit flushes. Will queue the memtable for flushing and proceed when the
// dirty_memory_manager allows us to. We will not seal at this time since the flush itself
// wouldn't happen anyway. Keeping the memtable in memory will potentially increase the time it
// spends in memory allowing for more coalescing opportunities.
future<> request_flush();
private:
lw_shared_ptr<memtable> new_memtable() {
return make_lw_shared<memtable>(_current_schema(), &(_dirty_memory_manager->region_group()));
return make_lw_shared<memtable>(_current_schema(), this);
}
};
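The flush coalescing described in the comment above can be modelled outside Seastar with a plain shared future: callers that ask for a flush while one is already queued all wait on the same promise, so only one flush is started. This is an illustrative, single-threaded sketch, not ScyllaDB's implementation; `flush_coalescer`, `flush_done()`, and `flushes_started()` are invented names, and only the coalescing idea comes from `request_flush()`/`_flush_coalescing` in the source.

```cpp
#include <cassert>
#include <future>
#include <optional>

// Hypothetical stand-in for memtable_list's flush coalescing: many callers
// requesting a flush while one is pending share a single promise.
class flush_coalescer {
    std::optional<std::promise<void>> _pending;  // analogous to _flush_coalescing
    std::shared_future<void> _fut;
    int _flushes_started = 0;
public:
    // Returns a future that resolves when the coalesced flush completes.
    // Only the first caller in a batch actually starts a flush.
    std::shared_future<void> request_flush() {
        if (!_pending) {
            _pending.emplace();
            _fut = _pending->get_future().share();
            ++_flushes_started;
        }
        return _fut;
    }
    // Called once the flush actually ran. Precondition: a flush was requested.
    void flush_done() {
        _pending->set_value();
        _pending.reset();
    }
    int flushes_started() const { return _flushes_started; }
};
```

Keeping the memtable queued (rather than sealing immediately) is what creates these coalescing opportunities in the first place, as the comment above notes.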
@@ -328,8 +410,6 @@ public:
bool enable_cache = true;
bool enable_commitlog = true;
bool enable_incremental_backups = false;
size_t max_memtable_size = 5'000'000;
size_t max_streaming_memtable_size = 5'000'000;
::dirty_memory_manager* dirty_memory_manager = &default_dirty_memory_manager;
::dirty_memory_manager* streaming_dirty_memory_manager = &default_dirty_memory_manager;
restricted_mutation_reader_config read_concurrency_config;
@@ -388,9 +468,6 @@ private:
lw_shared_ptr<memtable_list> _streaming_memtables;
utils::phased_barrier _streaming_flush_phaser;
friend class memtable_dirty_memory_manager;
friend class streaming_dirty_memory_manager;
// If mutations are fragmented during streaming the sstables cannot be made
// visible immediately after memtable flush, because that could cause
// readers to see only a part of a partition thus violating isolation
@@ -751,7 +828,7 @@ private:
// repair can now choose whatever strategy - small or big ranges - it wants, resting assured
// that the incoming memtables will be coalesced together.
shared_promise<> _waiting_streaming_flushes;
timer<> _delayed_streaming_flush{[this] { seal_active_streaming_memtable_immediate(); }};
timer<> _delayed_streaming_flush{[this] { _streaming_memtables->request_flush(); }};
future<> seal_active_streaming_memtable_delayed();
future<> seal_active_streaming_memtable_immediate();
future<> seal_active_streaming_memtable(memtable_list::flush_behavior behavior) {
@@ -882,8 +959,6 @@ public:
bool enable_disk_writes = true;
bool enable_cache = true;
bool enable_incremental_backups = false;
size_t max_memtable_size = 5'000'000;
size_t max_streaming_memtable_size = 5'000'000;
::dirty_memory_manager* dirty_memory_manager = &default_dirty_memory_manager;
::dirty_memory_manager* streaming_dirty_memory_manager = &default_dirty_memory_manager;
restricted_mutation_reader_config read_concurrency_config;
@@ -978,11 +1053,11 @@ class database {
lw_shared_ptr<db_stats> _stats;
std::unique_ptr<db::config> _cfg;
size_t _memtable_total_space = 500 << 20;
size_t _streaming_memtable_total_space = 500 << 20;
memtable_dirty_memory_manager _system_dirty_memory_manager;
memtable_dirty_memory_manager _dirty_memory_manager;
streaming_dirty_memory_manager _streaming_dirty_memory_manager;
dirty_memory_manager _system_dirty_memory_manager;
dirty_memory_manager _dirty_memory_manager;
dirty_memory_manager _streaming_dirty_memory_manager;
semaphore _read_concurrency_sem{max_concurrent_reads()};
restricted_mutation_reader_config _read_concurrency_config;
semaphore _system_read_concurrency_sem{max_system_concurrent_reads()};


@@ -23,6 +23,7 @@
// database.hh
class database;
class memtable_list;
// mutation.hh
class mutation;


@@ -58,6 +58,8 @@
#include <core/fstream.hh>
#include <seastar/core/memory.hh>
#include <seastar/core/chunked_fifo.hh>
#include <seastar/core/queue.hh>
#include <seastar/core/sleep.hh>
#include <net/byteorder.hh>
#include "commitlog.hh"
@@ -78,6 +80,8 @@
static logging::logger logger("commitlog");
using namespace std::chrono_literals;
class crc32_nbo {
crc32 _c;
public:
@@ -157,6 +161,7 @@ const std::string db::commitlog::descriptor::FILENAME_EXTENSION(".log");
class db::commitlog::segment_manager : public ::enable_shared_from_this<segment_manager> {
public:
config cfg;
std::vector<sstring> _segments_to_replay;
const uint64_t max_size;
const uint64_t max_mutation_size;
// Divide the size-on-disk threshold by #cpus used, since we assume
@@ -164,6 +169,7 @@ public:
const uint64_t max_disk_size; // per-shard
bool _shutdown = false;
std::experimental::optional<shared_promise<>> _shutdown_promise = {};
semaphore _new_segment_semaphore {1};
semaphore _flush_semaphore;
@@ -252,7 +258,7 @@ public:
scollectd::registrations create_counters();
void orphan_all();
future<> orphan_all();
void discard_unused_segments();
void discard_completed_segments(const cf_id_type& id,
@@ -288,21 +294,19 @@ public:
void flush_segments(bool = false);
private:
future<> clear_reserve_segments();
size_t max_request_controller_units() const;
segment_id_type _ids = 0;
std::vector<sseg_ptr> _segments;
std::deque<sseg_ptr> _reserve_segments;
queue<sseg_ptr> _reserve_segments;
std::vector<buffer_type> _temp_buffers;
std::unordered_map<flush_handler_id, flush_handler> _flush_handlers;
flush_handler_id _flush_ids = 0;
replay_position _flush_position;
timer<clock_type> _timer;
size_t _reserve_allocating = 0;
// # segments to try to keep available in reserve
// i.e. the number of segments we expect to consume in between timer
// callbacks.
// The idea is that since the files are 0 len at start, and thus cost little,
// it is easier to adapt this value compared to timer freq.
size_t _num_reserve_segments = 0;
future<> replenish_reserve();
future<> _reserve_replenisher;
seastar::gate _gate;
uint64_t _new_counter = 0;
};
@@ -870,7 +874,7 @@ db::commitlog::segment_manager::allocate_when_possible(const cf_id_type& id, sha
}
auto fut = get_units(_request_controller, size);
if (!fut.available()) {
if (_request_controller.waiters()) {
totals.requests_blocked_memory++;
}
return fut.then([this, id, writer = std::move(writer)] (auto permit) mutable {
@@ -911,7 +915,9 @@ db::commitlog::segment_manager::segment_manager(config c)
// an existing in-flight buffer. Since we'll force the cycling() of any buffer that is bigger
// than default_size at the end of the allocation, that allows for every valid mutation to
// always be admitted for processing.
, _request_controller(max_mutation_size + db::commitlog::segment::default_size)
, _request_controller(max_request_controller_units())
, _reserve_segments(1)
, _reserve_replenisher(make_ready_future<>())
{
assert(max_size > 0);
@@ -922,6 +928,32 @@ db::commitlog::segment_manager::segment_manager(config c)
_regs = create_counters();
}
size_t db::commitlog::segment_manager::max_request_controller_units() const {
return max_mutation_size + db::commitlog::segment::default_size;
}
future<> db::commitlog::segment_manager::replenish_reserve() {
return do_until([this] { return _shutdown; }, [this] {
return _reserve_segments.not_full().then([this] {
if (_shutdown) {
return make_ready_future<>();
}
return with_gate(_gate, [this] {
return this->allocate_segment(false).then([this](sseg_ptr s) {
auto ret = _reserve_segments.push(std::move(s));
if (!ret) {
logger.error("Segment reserve is full! Ignoring and trying to continue, but shouldn't happen");
}
return make_ready_future<>();
});
}).handle_exception([](std::exception_ptr ep) {
logger.warn("Exception in segment reservation: {}", ep);
return sleep(100ms);
});
});
});
}
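`replenish_reserve()` above keeps a bounded queue of pre-allocated segments topped up so the hot path can usually take one without paying the allocation cost, and `new_segment()` grows the bound when the reserve runs dry. A simplified synchronous model of that interplay follows; the `reserve` class and its methods are invented for illustration, while the real code uses `seastar::queue`, allocates files asynchronously, and waits via `pop_eventually()` instead of allocating inline.

```cpp
#include <cassert>
#include <cstddef>
#include <queue>

struct segment { int id; };

// Illustrative model of the reserve-segment queue and its adaptive sizing.
class reserve {
    std::queue<segment> _q;
    std::size_t _max_size;
    int _next_id = 0;
public:
    explicit reserve(std::size_t max) : _max_size(max) {}
    // The replenisher: fill the queue up to its current bound.
    void replenish() {
        while (_q.size() < _max_size) {
            _q.push(segment{_next_id++});    // stands in for allocate_segment()
        }
    }
    // As in new_segment(): grow the reserve when it runs dry, up to a cap.
    segment take(std::size_t hard_cap) {
        if (_q.empty() && _max_size < hard_cap) {
            ++_max_size;                     // "Increased segment reserve count"
        }
        if (_q.empty()) {
            replenish();                     // real code waits for the producer
        }
        segment s = _q.front();
        _q.pop();
        return s;
    }
    std::size_t size() const { return _q.size(); }
    std::size_t max_size() const { return _max_size; }
};
```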
future<std::vector<db::commitlog::descriptor>>
db::commitlog::segment_manager::list_descriptors(sstring dirname) {
struct helper {
@@ -981,9 +1013,11 @@ db::commitlog::segment_manager::list_descriptors(sstring dirname) {
future<> db::commitlog::segment_manager::init() {
return list_descriptors(cfg.commit_log_location).then([this](std::vector<descriptor> descs) {
assert(_reserve_segments.empty()); // _segments_to_replay must not pick them up
segment_id_type id = std::chrono::duration_cast<std::chrono::milliseconds>(runtime::get_boot_time().time_since_epoch()).count() + 1;
for (auto& d : descs) {
id = std::max(id, replay_position(d.id).base_id());
_segments_to_replay.push_back(cfg.commit_log_location + "/" + d.filename());
}
// base id counter is [ <shard> | <base> ]
@@ -992,6 +1026,9 @@ future<> db::commitlog::segment_manager::init() {
_timer.set_callback(std::bind(&segment_manager::on_timer, this));
auto delay = engine().cpu_id() * std::ceil(double(cfg.commitlog_sync_period_in_ms) / smp::count);
logger.trace("Delaying timer loop {} ms", delay);
// We need to wait until we have scanned all other segments to actually start serving new
// segments. We are ready now
this->_reserve_replenisher = replenish_reserve();
this->arm(delay);
});
}
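The id seeding in `init()` can be condensed into a small helper: new segment ids must be at least as large as every pre-existing segment's id, and are derived from boot time so they grow across restarts. This is a hypothetical sketch; `initial_segment_id` does not exist in the source, and the `replay_position` decoding is elided, so it only mirrors the `std::max` logic shown above.

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Seed the segment id counter: start just past boot time (in ms), then make
// sure we never fall below any replayed segment's base id.
uint64_t initial_segment_id(uint64_t boot_time_ms,
                            const std::vector<uint64_t>& existing_ids) {
    uint64_t id = boot_time_ms + 1;
    for (uint64_t existing : existing_ids) {
        id = std::max(id, existing);         // never reuse a replayed id
    }
    return id;
}
```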
@@ -1139,22 +1176,15 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
++_new_counter;
if (_reserve_segments.empty()) {
if (_num_reserve_segments < cfg.max_reserve_segments) {
++_num_reserve_segments;
logger.trace("Increased segment reserve count to {}", _num_reserve_segments);
}
return allocate_segment(true).then([this](sseg_ptr s) {
_segments.push_back(s);
return make_ready_future<sseg_ptr>(s);
});
if (_reserve_segments.empty() && (_reserve_segments.max_size() < cfg.max_reserve_segments)) {
_reserve_segments.set_max_size(_reserve_segments.max_size() + 1);
logger.debug("Increased segment reserve count to {}", _reserve_segments.max_size());
}
_segments.push_back(_reserve_segments.front());
_reserve_segments.pop_front();
_segments.back()->reset_sync_time();
logger.trace("Acquired segment {} from reserve", _segments.back());
return make_ready_future<sseg_ptr>(_segments.back());
return _reserve_segments.pop_eventually().then([this] (auto s) {
_segments.push_back(std::move(s));
_segments.back()->reset_sync_time();
return make_ready_future<sseg_ptr>(_segments.back());
});
}
future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::active_segment() {
@@ -1222,6 +1252,15 @@ void db::commitlog::segment_manager::discard_unused_segments() {
}
}
// FIXME: pop() will call unlink -> sleeping in reactor thread.
// Not urgent since mostly called during shutdown, but have to fix.
future<> db::commitlog::segment_manager::clear_reserve_segments() {
while (!_reserve_segments.empty()) {
_reserve_segments.pop();
}
return make_ready_future<>();
}
future<> db::commitlog::segment_manager::sync_all_segments(bool shutdown) {
logger.debug("Issuing sync for all segments");
return parallel_for_each(_segments, [this, shutdown](sseg_ptr s) {
@@ -1232,19 +1271,40 @@ future<> db::commitlog::segment_manager::sync_all_segments(bool shutdown) {
}
future<> db::commitlog::segment_manager::shutdown() {
if (!_shutdown) {
_shutdown = true; // no re-arm, no create new segments.
_timer.cancel(); // no more timer calls
// Now first wait for periodic task to finish, then sync and close all
// segments, flushing out any remaining data.
return _gate.close().then(std::bind(&segment_manager::sync_all_segments, this, true));
if (!_shutdown_promise) {
_shutdown_promise = shared_promise<>();
// Wait for all pending requests to finish. Need to sync first because segments that are
// alive may be holding semaphore permits.
auto block_new_requests = get_units(_request_controller, max_request_controller_units());
return sync_all_segments(false).then([this, block_new_requests = std::move(block_new_requests)] () mutable {
return std::move(block_new_requests).then([this] (auto permits) {
_timer.cancel(); // no more timer calls
_shutdown = true; // no re-arm, no create new segments.
// Now first wait for periodic task to finish, then sync and close all
// segments, flushing out any remaining data.
return _gate.close().then(std::bind(&segment_manager::sync_all_segments, this, true));
});
}).finally([this] {
// Now that the gate is closed and requests completed we are sure nobody else will pop()
return clear_reserve_segments().finally([this] {
return std::move(_reserve_replenisher).then_wrapped([this] (auto f) {
// Could be cleaner with proper seastar support
if (f.failed()) {
_shutdown_promise->set_exception(f.get_exception());
} else {
_shutdown_promise->set_value();
}
});
});
});
}
return make_ready_future<>();
return _shutdown_promise->get_shared_future();
}
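`shutdown()` above quiesces the segment manager by acquiring `max_request_controller_units()` from the request controller, which can only succeed once every in-flight request has released its units; afterwards no new request can be admitted. A single-threaded model of that idiom follows; the `request_controller` class here is illustrative, and the real `seastar::semaphore` suspends waiters rather than failing.

```cpp
#include <cassert>

// Model of the quiesce-by-taking-all-units idiom: requests take units sized
// to their payload; shutdown takes the full capacity and therefore proceeds
// only once everything in flight has drained.
class request_controller {
    long _available;
public:
    explicit request_controller(long units) : _available(units) {}
    // Try to take n units; false means a real semaphore would wait here.
    bool try_get_units(long n) {
        if (n > _available) {
            return false;
        }
        _available -= n;
        return true;
    }
    void release(long n) { _available += n; }
    long available() const { return _available; }
};
```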
void db::commitlog::segment_manager::orphan_all() {
future<> db::commitlog::segment_manager::orphan_all() {
_segments.clear();
_reserve_segments.clear();
return clear_reserve_segments();
}
/*
@@ -1259,7 +1319,7 @@ future<> db::commitlog::segment_manager::clear() {
for (auto& s : _segments) {
s->mark_clean();
}
orphan_all();
return orphan_all();
});
}
/**
@@ -1290,37 +1350,7 @@ void db::commitlog::segment_manager::on_timer() {
flush_segments();
}
}
// take outstanding allocations into account. This is paranoid,
// but if for some reason the file::open takes longer than timer period,
// we could flood the reserve list with new segments
//
// #482 - _reserve_allocating is decremented in the finally clause below.
// This is needed because if either allocate_segment _or_ emplacing into
// _reserve_segments should throw, we still need the counter reset
// However, because of this, it might be that emplace was done, but not decrement,
// when we get here again. So occasionally we might get a sum of the two that is
// not consistent. It should however always just potentially be _too much_, i.e.
// just an indicator that we don't need to do anything. So lets do that.
auto n = std::min(_reserve_segments.size() + _reserve_allocating, _num_reserve_segments);
return parallel_for_each(boost::irange(n, _num_reserve_segments), [this, n](auto i) {
++_reserve_allocating;
return this->allocate_segment(false).then([this](sseg_ptr s) {
if (!_shutdown) {
// insertion sort.
auto i = std::upper_bound(_reserve_segments.begin(), _reserve_segments.end(), s, [](sseg_ptr s1, sseg_ptr s2) {
const descriptor& d1 = s1->_desc;
const descriptor& d2 = s2->_desc;
return d1.id < d2.id;
});
i = _reserve_segments.emplace(i, std::move(s));
logger.trace("Added reserve segment {}", *i);
}
}).finally([this] {
--_reserve_allocating;
});
});
}).handle_exception([](std::exception_ptr ep) {
logger.warn("Exception in segment reservation: {}", ep);
return make_ready_future<>();
});
arm();
}
@@ -1538,6 +1568,15 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne
subscription<temporary_buffer<char>, db::replay_position>
db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type off) {
struct work {
private:
file_input_stream_options make_file_input_stream_options() {
file_input_stream_options fo;
fo.buffer_size = db::commitlog::segment::default_size;
fo.read_ahead = 10;
fo.io_priority_class = service::get_local_commitlog_priority();
return fo;
}
public:
file f;
stream<temporary_buffer<char>, replay_position> s;
input_stream<char> fin;
@@ -1553,7 +1592,7 @@ db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type
bool header = true;
work(file f, position_type o = 0)
: f(f), fin(make_file_input_stream(f)), start_off(o) {
: f(f), fin(make_file_input_stream(f, o, make_file_input_stream_options())), start_off(o) {
}
work(work&&) = default;
@@ -1736,6 +1775,8 @@ db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type
throw segment_data_corruption_error("Data corruption", corrupt_size);
}
});
}).finally([this] {
return fin.close();
});
}
};
@@ -1822,3 +1863,6 @@ future<std::vector<sstring>> db::commitlog::list_existing_segments(const sstring
});
}
std::vector<sstring> db::commitlog::get_segments_to_replay() {
return std::move(_segment_manager->_segments_to_replay);
}


@@ -241,6 +241,14 @@ public:
*/
std::vector<sstring> get_active_segment_names() const;
/**
* Returns a vector of segment paths which were
* preexisting when this instance of commitlog was created.
*
* The list will be empty when called a second time.
*/
std::vector<sstring> get_segments_to_replay();
uint64_t get_total_size() const;
uint64_t get_completed_tasks() const;
uint64_t get_flush_count() const;
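The header comment above warns that the replay list is empty on a second call; that follows from the accessor moving the vector out of the manager (the `std::move` in `get_segments_to_replay()` in commitlog.cc). A minimal model of that move-out behaviour, with invented member values:

```cpp
#include <cassert>
#include <string>
#include <utility>
#include <vector>

// Minimal model of get_segments_to_replay(): the accessor transfers
// ownership of the vector, leaving the member empty for later calls.
struct manager {
    std::vector<std::string> _segments_to_replay{"a.log", "b.log"};
    std::vector<std::string> get_segments_to_replay() {
        return std::move(_segments_to_replay);  // first call drains the member
    }
};
```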


@@ -256,7 +256,7 @@ public:
"Log a warning when compacting partitions larger than this value" \
) \
/* Common memtable settings */ \
val(memtable_total_space_in_mb, uint32_t, 0, Used, \
val(memtable_total_space_in_mb, uint32_t, 0, Invalid, \
"Specifies the total memory used for all memtables on a node. This replaces the per-table storage settings memtable_operations_in_millions and memtable_throughput_in_mb." \
) \
/* Common disk settings */ \
@@ -334,7 +334,7 @@ public:
"\toffheap_buffers Off heap (direct) NIO buffers.\n" \
"\toffheap_objects Native memory, eliminating NIO buffer heap overhead." \
) \
val(memtable_cleanup_threshold, double, .11, Used, \
val(memtable_cleanup_threshold, double, .11, Invalid, \
"Ratio of occupied non-flushing memtable size to total permitted size for triggering a flush of the largest memtable. Larger values mean larger flushes and less compaction, but also less concurrent flush activity, which can make it difficult to keep your disks saturated under heavy write load." \
) \
val(file_cache_size_in_mb, uint32_t, 512, Unused, \


@@ -1022,6 +1022,10 @@ std::vector<schema_ptr> all_tables() {
return r;
}
static bool maybe_write_in_user_memory(schema_ptr s, database& db) {
return (s.get() == batchlog().get());
}
void make(database& db, bool durable, bool volatile_testing_only) {
auto ksm = make_lw_shared<keyspace_metadata>(NAME,
"org.apache.cassandra.locator.LocalStrategy",
@@ -1045,7 +1049,11 @@ void make(database& db, bool durable, bool volatile_testing_only) {
db.add_keyspace(NAME, std::move(_ks));
auto& ks = db.find_keyspace(NAME);
for (auto&& table : all_tables()) {
db.add_column_family(table, ks.make_column_family_config(*table, db.get_config()));
auto cfg = ks.make_column_family_config(*table, db.get_config());
if (maybe_write_in_user_memory(table, db)) {
cfg.dirty_memory_manager = &db._dirty_memory_manager;
}
db.add_column_family(table, std::move(cfg));
}
}

dist/ami/build_ami.sh vendored

@@ -8,7 +8,9 @@ fi
print_usage() {
echo "build_ami.sh --localrpm --repo [URL]"
echo " --localrpm deploy locally built rpms"
echo " --repo specify .repo/.list file URL"
echo " --repo repository for both install and update, specify .repo/.list file URL"
echo " --repo-for-install repository for install, specify .repo/.list file URL"
echo " --repo-for-update repository for update, specify .repo/.list file URL"
exit 1
}
LOCALRPM=0
@@ -24,6 +26,14 @@ while [ $# -gt 0 ]; do
INSTALL_ARGS="$INSTALL_ARGS --repo $2"
shift 2
;;
"--repo-for-install")
INSTALL_ARGS="$INSTALL_ARGS --repo-for-install $2"
shift 2
;;
"--repo-for-update")
INSTALL_ARGS="$INSTALL_ARGS --repo-for-update $2"
shift 2
;;
*)
print_usage
;;


@@ -30,6 +30,6 @@ else
else
echo "Please upgrade to a newer kernel version."
fi
echo " see http://docs.scylladb.com/kb/kb-fs-not-qualified-aio/ for details"
echo " see http://www.scylladb.com/kb/kb-fs-not-qualified-aio/ for details"
fi
exit $RET


@@ -76,8 +76,16 @@ verify_package() {
fi
}
list_block_devices() {
if lsblk --help | grep -q -e -p; then
lsblk -pnr | awk '{ print $1 }'
else
ls -1 /dev/sd* /dev/hd* /dev/xvd* /dev/nvme* /dev/mapper/* 2>/dev/null|grep -v control
fi
}
get_unused_disks() {
blkid -c /dev/null|cut -f 1 -d ' '|sed s/://g|grep -v loop|while read dev
list_block_devices|grep -v loop|while read dev
do
count_raw=$(grep $dev /proc/mounts|wc -l)
count_pvs=0


@@ -4,7 +4,8 @@ After=scylla-server.service
BindsTo=scylla-server.service
[Timer]
OnBootSec=0
# set OnActiveSec to 3 to safely avoid issues/1846
OnActiveSec=3
OnUnitActiveSec=1d
[Install]


@@ -8,7 +8,7 @@ Wants=scylla-housekeeping.timer
PermissionsStartOnly=true
Type=notify
LimitMEMLOCK=infinity
LimitNOFILE=200000
LimitNOFILE=800000
LimitAS=infinity
LimitNPROC=8096
EnvironmentFile=@@SYSCONFDIR@@/scylla-server


@@ -7,7 +7,7 @@ ENV container docker
VOLUME [ "/sys/fs/cgroup" ]
#install scylla
RUN curl http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo -o /etc/yum.repos.d/scylla.repo
RUN curl http://downloads.scylladb.com/rpm/centos/scylla-1.5.repo -o /etc/yum.repos.d/scylla.repo
RUN yum -y install epel-release
RUN yum -y clean expire-cache
RUN yum -y update


@@ -28,10 +28,6 @@ if [ ! -f boost-1.58.0-11.fc23.src.rpm ]; then
wget https://kojipkgs.fedoraproject.org//packages/boost/1.58.0/11.fc23/src/boost-1.58.0-11.fc23.src.rpm
fi
if [ ! -f ninja-build-1.6.0-2.fc23.src.rpm ]; then
wget https://kojipkgs.fedoraproject.org//packages/ninja-build/1.6.0/2.fc23/src/ninja-build-1.6.0-2.fc23.src.rpm
fi
if [ ! -f ragel-6.8-5.fc23.src.rpm ]; then
wget https://kojipkgs.fedoraproject.org//packages/ragel/6.8/5.fc23/src/ragel-6.8-5.fc23.src.rpm
fi
@@ -94,13 +90,6 @@ if [ ! -f $RPMBUILD/RPMS/x86_64/scylla-boost-1.58.0-11.el7.centos.x86_64.rpm ];
fi
do_install scylla-boost*
if [ ! -f $RPMBUILD/RPMS/x86_64/scylla-ninja-build-1.6.0-2.el7.centos.x86_64.rpm ]; then
rpm --define "_topdir $RPMBUILD" -ivh build/srpms/ninja-build-1.6.0-2.fc23.src.rpm
patch $RPMBUILD/SPECS/ninja-build.spec < dist/redhat/centos_dep/ninja-build.diff
rpmbuild --define "_topdir $RPMBUILD" -ba $RPMBUILD/SPECS/ninja-build.spec
fi
do_install scylla-ninja-build-1.6.0-2.el7.centos.x86_64.rpm
if [ ! -f $RPMBUILD/RPMS/x86_64/scylla-ragel-6.8-5.el7.centos.x86_64.rpm ]; then
rpm --define "_topdir $RPMBUILD" -ivh build/srpms/ragel-6.8-5.fc23.src.rpm
patch $RPMBUILD/SPECS/ragel.spec < dist/redhat/centos_dep/ragel.diff


@@ -1,56 +0,0 @@
--- ninja-build.spec.orig 2016-01-20 14:41:16.892802134 +0000
+++ ninja-build.spec 2016-01-20 14:44:42.453227192 +0000
@@ -1,19 +1,18 @@
-Name: ninja-build
+Name: scylla-ninja-build
Version: 1.6.0
Release: 2%{?dist}
Summary: A small build system with a focus on speed
License: ASL 2.0
URL: http://martine.github.com/ninja/
Source0: https://github.com/martine/ninja/archive/v%{version}.tar.gz#/ninja-%{version}.tar.gz
-Source1: ninja.vim
# Rename mentions of the executable name to be ninja-build.
Patch1000: ninja-1.6.0-binary-rename.patch
+Requires: scylla-env
BuildRequires: asciidoc
BuildRequires: gtest-devel
BuildRequires: python2-devel
-BuildRequires: re2c >= 0.11.3
-Requires: emacs-filesystem
-Requires: vim-filesystem
+#BuildRequires: scylla-re2c >= 0.11.3
+%define _prefix /opt/scylladb
%description
Ninja is a small build system with a focus on speed. It differs from other
@@ -32,15 +31,8 @@
./ninja -v ninja_test
%install
-# TODO: Install ninja_syntax.py?
-mkdir -p %{buildroot}/{%{_bindir},%{_datadir}/bash-completion/completions,%{_datadir}/emacs/site-lisp,%{_datadir}/vim/vimfiles/syntax,%{_datadir}/vim/vimfiles/ftdetect,%{_datadir}/zsh/site-functions}
-
+mkdir -p %{buildroot}/opt/scylladb/bin
install -pm755 ninja %{buildroot}%{_bindir}/ninja-build
-install -pm644 misc/bash-completion %{buildroot}%{_datadir}/bash-completion/completions/ninja-bash-completion
-install -pm644 misc/ninja-mode.el %{buildroot}%{_datadir}/emacs/site-lisp/ninja-mode.el
-install -pm644 misc/ninja.vim %{buildroot}%{_datadir}/vim/vimfiles/syntax/ninja.vim
-install -pm644 %{SOURCE1} %{buildroot}%{_datadir}/vim/vimfiles/ftdetect/ninja.vim
-install -pm644 misc/zsh-completion %{buildroot}%{_datadir}/zsh/site-functions/_ninja
%check
# workaround possible too low default limits
@@ -50,12 +42,6 @@
%files
%doc COPYING HACKING.md README doc/manual.html
%{_bindir}/ninja-build
-%{_datadir}/bash-completion/completions/ninja-bash-completion
-%{_datadir}/emacs/site-lisp/ninja-mode.el
-%{_datadir}/vim/vimfiles/syntax/ninja.vim
-%{_datadir}/vim/vimfiles/ftdetect/ninja.vim
-# zsh does not have a -filesystem package
-%{_datadir}/zsh/
%changelog
* Mon Nov 16 2015 Ben Boeckel <mathstuf@gmail.com> - 1.6.0-2


@@ -27,10 +27,10 @@ Group: Applications/Databases
Summary: The Scylla database server
License: AGPLv3
URL: http://www.scylladb.com/
BuildRequires: libaio-devel libstdc++-devel cryptopp-devel hwloc-devel numactl-devel libpciaccess-devel libxml2-devel zlib-devel thrift-devel yaml-cpp-devel lz4-devel snappy-devel jsoncpp-devel systemd-devel xz-devel openssl-devel libcap-devel libselinux-devel libgcrypt-devel libgpg-error-devel elfutils-devel krb5-devel libcom_err-devel libattr-devel pcre-devel elfutils-libelf-devel bzip2-devel keyutils-libs-devel xfsprogs-devel make gnutls-devel systemd-devel lksctp-tools-devel protobuf-devel protobuf-compiler libunwind-devel
%{?fedora:BuildRequires: boost-devel ninja-build ragel antlr3-tool antlr3-C++-devel python3 gcc-c++ libasan libubsan python3-pyparsing dnf-yum}
%{?rhel:BuildRequires: scylla-libstdc++-static scylla-boost-devel scylla-ninja-build scylla-ragel scylla-antlr3-tool scylla-antlr3-C++-devel python34 scylla-gcc-c++ >= 5.1.1, python34-pyparsing}
Requires: scylla-conf systemd-libs hwloc collectd PyYAML python-urwid pciutils pyparsing python-requests curl bc util-linux
BuildRequires: libaio-devel libstdc++-devel cryptopp-devel hwloc-devel numactl-devel libpciaccess-devel libxml2-devel zlib-devel thrift-devel yaml-cpp-devel lz4-devel snappy-devel jsoncpp-devel systemd-devel xz-devel openssl-devel libcap-devel libselinux-devel libgcrypt-devel libgpg-error-devel elfutils-devel krb5-devel libcom_err-devel libattr-devel pcre-devel elfutils-libelf-devel bzip2-devel keyutils-libs-devel xfsprogs-devel make gnutls-devel systemd-devel lksctp-tools-devel protobuf-devel protobuf-compiler libunwind-devel ninja-build
%{?fedora:BuildRequires: boost-devel ragel antlr3-tool antlr3-C++-devel python3 gcc-c++ libasan libubsan python3-pyparsing dnf-yum}
%{?rhel:BuildRequires: scylla-libstdc++-static scylla-boost-devel scylla-ragel scylla-antlr3-tool scylla-antlr3-C++-devel python34 scylla-gcc-c++ >= 5.1.1, python34-pyparsing}
Requires: scylla-conf systemd-libs hwloc collectd PyYAML python-urwid pciutils pyparsing python-requests curl bc util-linux python-setuptools
%{?rhel:Requires: python34 python34-PyYAML}
Conflicts: abrt
@@ -110,8 +110,8 @@ cp -r scylla-housekeeping $RPM_BUILD_ROOT%{_prefix}/lib/scylla/scylla-housekeepi
cp -P dist/common/sbin/* $RPM_BUILD_ROOT%{_sbindir}/
%pre server
/usr/sbin/groupadd scylla 2> /dev/null || :
/usr/sbin/useradd -g scylla -s /sbin/nologin -r -d %{_sharedstatedir}/scylla scylla 2> /dev/null || :
getent group scylla || /usr/sbin/groupadd scylla 2> /dev/null || :
getent passwd scylla || /usr/sbin/useradd -g scylla -s /sbin/nologin -r -d %{_sharedstatedir}/scylla scylla 2> /dev/null || :
%post server
# Upgrade coredump settings


@@ -20,7 +20,7 @@ setuid scylla
setgid scylla
limit core unlimited unlimited
limit memlock unlimited unlimited
limit nofile 200000 200000
limit nofile 800000 800000
limit as unlimited unlimited
limit nproc 8096 8096
chdir /var/lib/scylla


@@ -50,6 +50,12 @@ public:
// for real time waits.
};
// Returns a time point which is earlier from t by d, or minimum time point if it cannot be represented.
template<typename Clock, typename Duration, typename Rep, typename Period>
inline
auto saturating_subtract(std::chrono::time_point<Clock, Duration> t, std::chrono::duration<Rep, Period> d) -> decltype(t) {
return std::max(t, decltype(t)::min() + d) - d;
}
using expiry_opt = std::experimental::optional<gc_clock::time_point>;
using ttl_opt = std::experimental::optional<gc_clock::duration>;
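A usage sketch for `saturating_subtract()` above: subtracting a duration from a time point near the clock's minimum would normally underflow, so the result is clamped to `min()`. The template is reproduced from the hunk above so the example is self-contained; `<algorithm>` is needed for `std::max`.

```cpp
#include <algorithm>
#include <cassert>
#include <chrono>

// Returns a time point which is earlier than t by d, or the minimum time
// point if that cannot be represented.
template<typename Clock, typename Duration, typename Rep, typename Period>
auto saturating_subtract(std::chrono::time_point<Clock, Duration> t,
                         std::chrono::duration<Rep, Period> d) -> decltype(t) {
    // Clamp t up to min() + d first, so the subtraction cannot underflow.
    return std::max(t, decltype(t)::min() + d) - d;
}
```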

main.cc

@@ -184,8 +184,8 @@ public:
throw;
}
});
} catch (std::system_error& e) {
startlog.error("Directory '{}' not found. Tried to create it but failed: {}", path, e.what());
} catch (...) {
startlog.error("Directory '{}' cannot be initialized. Tried to do it but failed with: {}", path, std::current_exception());
throw;
}
});
@@ -410,6 +410,8 @@ int main(int ac, char** av) {
if (opts.count("developer-mode")) {
smp::invoke_on_all([] { engine().set_strict_dma(false); }).get();
}
supervisor_notify("creating tracing");
tracing::tracing::create_tracing("trace_keyspace_helper").get();
supervisor_notify("creating snitch");
i_endpoint_snitch::create_snitch(cfg->endpoint_snitch()).get();
// #293 - do not stop anything
@@ -466,6 +468,10 @@ int main(int ac, char** av) {
// that can happen, making existing problems worse. So running a single shard first
// and making sure that all temporary tables are deleted provides extra
// protection against such situations.
//
// We also need to init commitlog on shard0 before it is inited on other shards
// because it obtains the list of pre-existing segments for replay, which must
// not include reserve segments created by active commitlogs.
db.invoke_on(0, [] (database& db) { return db.init_system_keyspace(); }).get();
db.invoke_on_all([] (database& db) {
if (engine().cpu_id() == 0) {
@@ -538,7 +544,7 @@ int main(int ac, char** av) {
supervisor_notify("starting commit log");
auto cl = db.local().commitlog();
if (cl != nullptr) {
auto paths = cl->list_existing_segments().get0();
auto paths = cl->get_segments_to_replay();
if (!paths.empty()) {
supervisor_notify("replaying commit log");
auto rp = db::commitlog_replayer::create_replayer(qp).get0();
@@ -560,7 +566,12 @@ int main(int ac, char** av) {
// we will have races between the compaction and loading processes
// We also want to trigger regular compaction on boot.
db.invoke_on_all([&proxy] (database& db) {
for (auto& x : db.get_column_families()) {
// avoid excessive disk usage by making sure all shards reshard
// shared sstables in the same order. That's done by choosing
// column families in UUID order, and each individual column
// family will reshard shared sstables in generation order.
auto cfs = boost::copy_range<std::map<utils::UUID, lw_shared_ptr<column_family>>>(db.get_column_families());
for (auto& x : cfs) {
column_family& cf = *(x.second);
// We start the rewrite, but do not wait for it.
cf.start_rewrite();
@@ -612,7 +623,7 @@ int main(int ac, char** av) {
gms::get_local_gossiper().wait_for_gossip_to_settle().get();
api::set_server_gossip_settle(ctx).get();
supervisor_notify("starting tracing");
tracing::tracing::create_tracing("trace_keyspace_helper").get();
tracing::tracing::start_tracing().get();
supervisor_notify("starting size estimates recorder");
auto&& recorder = db::get_size_estimates_recorder();
recorder.start().get();


@@ -26,8 +26,16 @@
namespace stdx = std::experimental;
memtable::memtable(schema_ptr schema, logalloc::region_group* dirty_memory_region_group)
memtable::memtable(schema_ptr schema, memtable_list* memtable_list)
: logalloc::region(memtable_list ? logalloc::region(memtable_list->region_group()) : logalloc::region())
, _memtable_list(memtable_list)
, _schema(std::move(schema))
, partitions(memtable_entry::compare(_schema)) {
}
memtable::memtable(schema_ptr schema, logalloc::region_group *dirty_memory_region_group)
: logalloc::region(dirty_memory_region_group ? logalloc::region(*dirty_memory_region_group) : logalloc::region())
, _memtable_list(nullptr)
, _schema(std::move(schema))
, partitions(memtable_entry::compare(_schema)) {
}
@@ -154,19 +162,23 @@ protected:
, _range(&range)
{ }
memtable_entry* fetch_next_entry() {
memtable_entry* fetch_entry() {
update_iterators();
if (_i == _end) {
return nullptr;
} else {
memtable_entry& e = *_i;
++_i;
_last = e.key();
_memtable->upgrade_entry(e);
return &e;
}
}
void advance() {
memtable_entry& e = *_i;
_last = e.key();
++_i;
}
logalloc::allocating_section& read_section() {
return _memtable->_read_section;
}
@@ -236,14 +248,18 @@ public:
return _delegate();
}
logalloc::reclaim_lock _(region());
managed_bytes::linearization_context_guard lcg;
memtable_entry* e = fetch_next_entry();
if (!e) {
return make_ready_future<streamed_mutation_opt>(stdx::nullopt);
} else {
return make_ready_future<streamed_mutation_opt>(e->read(mtbl(), schema(), _slice));
}
return read_section()(region(), [&] {
return with_linearized_managed_bytes([&] {
memtable_entry* e = fetch_entry();
if (!e) {
return make_ready_future<streamed_mutation_opt>(stdx::nullopt);
} else {
auto ret = make_ready_future<streamed_mutation_opt>(e->read(mtbl(), schema(), _slice));
advance();
return ret;
}
});
});
}
};
@@ -254,7 +270,7 @@ class flush_memory_accounter {
public:
void update_bytes_read(uint64_t delta) {
_bytes_read += delta;
dirty_memory_manager::from_region_group(_region.group()).account_potentially_cleaned_up_memory(delta);
dirty_memory_manager::from_region_group(_region.group()).account_potentially_cleaned_up_memory(&_region, delta);
}
explicit flush_memory_accounter(logalloc::region& region)
@@ -263,7 +279,7 @@ public:
~flush_memory_accounter() {
assert(_bytes_read <= _region.occupancy().used_space());
dirty_memory_manager::from_region_group(_region.group()).revert_potentially_cleaned_up_memory(_bytes_read);
dirty_memory_manager::from_region_group(_region.group()).revert_potentially_cleaned_up_memory(&_region, _bytes_read);
}
void account_component(memtable_entry& e) {
auto delta = _region.allocator().object_memory_size_in_allocator(&e)
@@ -318,19 +334,24 @@ public:
flush_reader& operator=(const flush_reader&) = delete;
virtual future<streamed_mutation_opt> operator()() override {
logalloc::reclaim_lock _(region());
managed_bytes::linearization_context_guard lcg;
memtable_entry* e = fetch_next_entry();
if (!e) {
return make_ready_future<streamed_mutation_opt>(stdx::nullopt);
} else {
auto cr = query::clustering_key_filter_ranges::get_ranges(*schema(), query::full_slice, e->key().key());
auto snp = e->partition().read(schema());
auto mpsr = make_partition_snapshot_reader<partition_snapshot_accounter>(schema(), e->key(), std::move(cr), snp, region(), read_section(), mtbl(), _flushed_memory);
_flushed_memory.account_component(*e);
_flushed_memory.account_component(*snp);
return make_ready_future<streamed_mutation_opt>(std::move(mpsr));
}
return read_section()(region(), [&] {
return with_linearized_managed_bytes([&] {
memtable_entry* e = fetch_entry();
if (!e) {
return make_ready_future<streamed_mutation_opt>(stdx::nullopt);
} else {
auto cr = query::clustering_key_filter_ranges::get_ranges(*schema(), query::full_slice, e->key().key());
auto snp = e->partition().read(schema());
auto mpsr = make_partition_snapshot_reader<partition_snapshot_accounter>(schema(), e->key(), std::move(cr),
snp, region(), read_section(), mtbl(), _flushed_memory);
_flushed_memory.account_component(*e);
_flushed_memory.account_component(*snp);
auto ret = make_ready_future<streamed_mutation_opt>(std::move(mpsr));
advance();
return ret;
}
});
});
}
};

View File

@@ -101,6 +101,7 @@ public:
bi::member_hook<memtable_entry, bi::set_member_hook<>, &memtable_entry::_link>,
bi::compare<memtable_entry::compare>>;
private:
memtable_list *_memtable_list;
schema_ptr _schema;
logalloc::allocating_section _read_section;
logalloc::allocating_section _allocating_section;
@@ -116,7 +117,9 @@ private:
partition_entry& find_or_create_partition_slow(partition_key_view key);
void upgrade_entry(memtable_entry&);
public:
explicit memtable(schema_ptr schema, logalloc::region_group* dirty_memory_region_group = nullptr);
explicit memtable(schema_ptr schema, memtable_list *memtable_list);
// Used by tests that want to control the flush process.
explicit memtable(schema_ptr schema, logalloc::region_group *dirty_memory_region_group = nullptr);
~memtable();
schema_ptr schema() const { return _schema; }
void set_schema(schema_ptr) noexcept;
@@ -134,7 +137,15 @@ public:
const logalloc::region& region() const {
return *this;
}
logalloc::region_group* region_group() {
return group();
}
public:
memtable_list* get_memtable_list() {
return _memtable_list;
}
size_t partition_count() const;
logalloc::occupancy_stats occupancy() const;

View File

@@ -123,7 +123,7 @@ public:
uint32_t partition_limit, CompactedMutationsConsumer consumer)
: _schema(s)
, _query_time(query_time)
, _gc_before(query_time - s.gc_grace_seconds())
, _gc_before(saturating_subtract(query_time, s.gc_grace_seconds()))
, _can_gc(always_gc)
, _slice(slice)
, _row_limit(limit)
@@ -139,7 +139,7 @@ public:
std::function<api::timestamp_type(const dht::decorated_key&)> get_max_purgeable)
: _schema(s)
, _query_time(compaction_time)
, _gc_before(_query_time - s.gc_grace_seconds())
, _gc_before(saturating_subtract(_query_time, s.gc_grace_seconds()))
, _get_max_purgeable(std::move(get_max_purgeable))
, _can_gc([this] (tombstone t) { return can_gc(t); })
, _slice(query::full_slice)
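The hunks above replace plain subtraction with saturating_subtract() so that gc_before cannot wrap around when gc_grace_seconds exceeds the query time. A minimal sketch of the idea, assuming time points are plain seconds-since-epoch with an assumed floor of 0 (Scylla's real gc_clock types differ):

```cpp
#include <cassert>
#include <cstdint>

// Illustrative saturating_subtract(): clamp at an assumed epoch floor of 0
// instead of wrapping when the subtrahend exceeds the time value. Scylla's
// real gc_clock time points and durations differ; this models the idea only.
static int64_t saturating_subtract(int64_t time_point, int64_t seconds) {
    constexpr int64_t min_time = 0; // assumed minimum representable time
    if (seconds >= time_point - min_time) {
        return min_time;
    }
    return time_point - seconds;
}
```

With an ordinary `time_point - seconds`, a small query time (as seen in tests starting near the epoch) minus a large grace period would go negative or wrap; the clamp keeps gc_before at the floor instead.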

View File

@@ -1181,7 +1181,7 @@ uint32_t mutation_partition::do_compact(const schema& s,
{
assert(row_limit > 0);
auto gc_before = query_time - s.gc_grace_seconds();
auto gc_before = saturating_subtract(query_time, s.gc_grace_seconds());
auto should_purge_tombstone = [&] (const tombstone& t) {
return t.deletion_time < gc_before && can_gc(t);

View File

@@ -474,9 +474,9 @@ public:
try {
_read_section(_lsa_region, [this] {
_snapshot->merge_partition_versions();
_snapshot = {};
});
} catch (...) { }
_snapshot = {};
});
});
}

View File

@@ -444,6 +444,13 @@ static void split_and_add(std::vector<::range<dht::token>>& ranges,
auto midpoint = dht::global_partitioner().midpoint(
range.start() ? range.start()->value() : dht::minimum_token(),
range.end() ? range.end()->value() : dht::minimum_token());
// This shouldn't happen, but if the range included just one token, we
// can't split further (split() may actually fail with assertion failure)
if ((range.start() && midpoint == range.start()->value()) ||
(range.end() && midpoint == range.end()->value())) {
ranges.push_back(range);
return;
}
auto halves = range.split(midpoint, dht::token_comparator());
ranges.push_back(halves.first);
ranges.push_back(halves.second);
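The guard added above avoids calling range::split() when the midpoint coincides with a bound, which is exactly the single-token case that trips split()'s containment assertion (issue #2148 in the commit message). A rough model with integer tokens and (start, end] ranges; all names here are illustrative, not Scylla's:

```cpp
#include <cassert>
#include <vector>

// Hypothetical stand-in for a token range with an exclusive start and an
// inclusive end, i.e. (start, end].
struct token_range {
    long start; // exclusive
    long end;   // inclusive
    long midpoint() const { return start + (end - start) / 2; }
};

// Mirrors the guarded split_and_add() above: when the midpoint coincides
// with a bound, the range holds a single token, so splitting at it would
// put the split point outside (start, end] and trip the containment check.
void split_and_add(std::vector<token_range>& out, token_range r) {
    long mid = r.midpoint();
    if (mid == r.start || mid == r.end) {
        out.push_back(r); // too small to split; keep whole
        return;
    }
    out.push_back({r.start, mid});
    out.push_back({mid, r.end});
}
```

For the range (10, 11] the midpoint is 10, equal to the exclusive start, so the range is kept whole rather than split.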

View File

@@ -74,8 +74,7 @@ cache_tracker::cache_tracker() {
}
cache_entry& ce = _lru.back();
auto it = row_cache::partitions_type::s_iterator_to(ce);
--it;
clear_continuity(*it);
clear_continuity(*std::next(it));
_lru.pop_back_and_dispose(current_deleter<cache_entry>());
--_partitions;
++_evictions;
@@ -365,6 +364,7 @@ public:
++_it;
_last = ce.key();
_cache.upgrade_entry(ce);
_cache._tracker.touch(ce);
_cache.on_hit();
cache_data cd { { }, ce.continuous() };
if (ce.wide_partition()) {
@@ -546,7 +546,6 @@ private:
if (!_first_element) {
return false;
}
_first_element = false;
return _pr.start() && _pr.start()->is_inclusive() && _pr.start()->value().equal(*_schema, dk);
}
@@ -554,6 +553,7 @@ private:
return _primary_reader().then([this] (just_cache_scanning_reader::cache_data cd) {
auto& smopt = cd.mut;
if (cd.continuous || (smopt && is_inclusive_start_bound(smopt->decorated_key()))) {
_first_element = false;
update_last_key(smopt);
return make_ready_future<streamed_mutation_opt>(std::move(smopt));
} else {
@@ -682,7 +682,9 @@ row_cache::make_reader(schema_ptr s,
row_cache::~row_cache() {
with_allocator(_tracker.allocator(), [this] {
_partitions.clear_and_dispose([this, deleter = current_deleter<cache_entry>()] (auto&& p) mutable {
_tracker.on_erase();
if (!p->is_dummy_entry()) {
_tracker.on_erase();
}
deleter(p);
});
});
@@ -720,7 +722,9 @@ void row_cache::do_find_or_create_entry(const dht::decorated_key& key,
return;
}
if ((!previous->_key && i == _partitions.begin()) || (previous->_key && std::prev(i)->key().equal(*_schema, *previous->_key))) {
if ((!previous->_key && i == _partitions.begin())
|| (previous->_key && i != _partitions.begin()
&& std::prev(i)->key().equal(*_schema, *previous->_key))) {
i->set_continuous(true);
}
});

View File

@@ -10,13 +10,16 @@ fi
print_usage() {
echo "scylla_install_pkg --local-pkg /home/scylla/rpms --repo [URL]"
echo " --local-pkg install locally built .rpm/.deb on specified directory"
echo " --repo specify .repo/.list file URL"
echo " --repo repository for both install and update, specify .repo/.list file URL"
echo " --repo-for-install repository for install, specify .repo/.list file URL"
echo " --repo-for-update repository for update, specify .repo/.list file URL"
exit 1
}
LOCAL_PKG=
UNSTABLE=0
REPO=
REPO_FOR_INSTALL=
REPO_FOR_UPDATE=
while [ $# -gt 0 ]; do
case "$1" in
"--local-pkg")
@@ -24,7 +27,16 @@ while [ $# -gt 0 ]; do
shift 2
;;
"--repo")
REPO=$2
REPO_FOR_INSTALL=$2
REPO_FOR_UPDATE=$2
shift 2
;;
"--repo-for-install")
REPO_FOR_INSTALL=$2
shift 2
;;
"--repo-for-update")
REPO_FOR_UPDATE=$2
shift 2
;;
*)
@@ -42,8 +54,8 @@ if [ "$ID" = "ubuntu" ]; then
chmod +x /usr/sbin/policy-rc.d
cp /etc/hosts /etc/hosts.orig
echo 127.0.0.1 `hostname` >> /etc/hosts
if [ "$REPO" != "" ]; then
curl -o /etc/apt/sources.list.d/scylla.list $REPO
if [ "$REPO_FOR_INSTALL" != "" ]; then
curl -o /etc/apt/sources.list.d/scylla_install.list $REPO_FOR_INSTALL
fi
apt-get update
if [ "$LOCAL_PKG" = "" ]; then
@@ -62,9 +74,14 @@ if [ "$ID" = "ubuntu" ]; then
fi
mv /etc/hosts.orig /etc/hosts
rm /usr/sbin/policy-rc.d
rm /etc/apt/sources.list.d/scylla_install.list
if [ "$REPO_FOR_UPDATE" != "" ]; then
curl -o /etc/apt/sources.list.d/scylla.list $REPO_FOR_UPDATE
fi
apt-get update
else
if [ "$REPO" != "" ]; then
curl -o /etc/yum.repos.d/scylla.repo $REPO
if [ "$REPO_FOR_INSTALL" != "" ]; then
curl -o /etc/yum.repos.d/scylla_install.repo $REPO_FOR_INSTALL
fi
if [ "$ID" = "centos" ]; then
@@ -81,4 +98,9 @@ else
else
yum install -y $LOCAL_PKG/scylla*.*.rpm
fi
rm /etc/yum.repos.d/scylla_install.repo
if [ "$REPO_FOR_UPDATE" != "" ]; then
curl -o /etc/yum.repos.d/scylla.repo $REPO_FOR_UPDATE
fi
fi

Submodule seastar updated: 69acec1788...548d67ddee

View File

@@ -54,7 +54,7 @@ public:
const cql3::query_options& options,
lw_shared_ptr<query::read_command> cmd,
std::vector<query::partition_range> ranges)
: _has_clustering_keys(s->clustering_key_size() > 0)
: _has_clustering_keys(has_clustering_keys(*s, *cmd))
, _max(cmd->row_limit)
, _schema(std::move(s))
, _selection(selection)
@@ -65,6 +65,11 @@ public:
{}
private:
static bool has_clustering_keys(const schema& s, const query::read_command& cmd) {
return s.clustering_key_size() > 0
&& !cmd.slice.options.contains<query::partition_slice::option::distinct>();
}
future<> fetch_page(cql3::selection::result_set_builder& builder, uint32_t page_size, db_clock::time_point now) override {
auto state = _options.get_paging_state();

View File

@@ -2643,12 +2643,19 @@ storage_proxy::query_partition_key_range(lw_shared_ptr<query::read_command> cmd,
}
}
// estimate_result_rows_per_range() is currently broken, and this is not needed
// when paging is available in any case
#if 0
// our estimate of how many result rows there will be per-range
float result_rows_per_range = estimate_result_rows_per_range(cmd, ks);
// underestimate how many rows we will get per-range in order to increase the likelihood that we'll
// fetch enough rows in the first round
result_rows_per_range -= result_rows_per_range * CONCURRENT_SUBREQUESTS_MARGIN;
int concurrency_factor = result_rows_per_range == 0.0 ? 1 : std::max(1, std::min(int(ranges.size()), int(std::ceil(cmd->row_limit / result_rows_per_range))));
#else
int result_rows_per_range = 0;
int concurrency_factor = 1;
#endif
std::vector<foreign_ptr<lw_shared_ptr<query::result>>> results;
results.reserve(ranges.size()/concurrency_factor + 1);

140
sstables/atomic_deletion.cc Normal file
View File

@@ -0,0 +1,140 @@
/*
* Copyright (C) 2016 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include "atomic_deletion.hh"
#include "to_string.hh"
#include <seastar/core/shared_future.hh>
#include <boost/range/adaptor/transformed.hpp>
#include <boost/range/algorithm/copy.hpp>
namespace sstables {
atomic_deletion_manager::atomic_deletion_manager(unsigned shard_count,
std::function<future<> (std::vector<sstring> sstables)> delete_sstables)
: _shard_count(shard_count)
, _delete_sstables(std::move(delete_sstables)) {
}
future<>
atomic_deletion_manager::delete_atomically(std::vector<sstable_to_delete> atomic_deletion_set, unsigned deleting_shard) {
// runs on shard 0 only
_deletion_logger.debug("shard {} atomically deleting {}", deleting_shard, atomic_deletion_set);
if (_atomic_deletions_cancelled) {
_deletion_logger.debug("atomic deletions disabled, erroring out");
using boost::adaptors::transformed;
throw atomic_deletion_cancelled(atomic_deletion_set
| transformed(std::mem_fn(&sstable_to_delete::name)));
}
// Insert atomic_deletion_set into the list of sets pending deletion. If the new set
// overlaps with an existing set, merge them (the merged set will be deleted atomically).
std::unordered_map<sstring, lw_shared_ptr<pending_deletion>> new_atomic_deletion_sets;
auto merged_set = make_lw_shared(pending_deletion());
for (auto&& sst_to_delete : atomic_deletion_set) {
merged_set->names.insert(sst_to_delete.name);
if (!sst_to_delete.shared) {
for (auto shard : boost::irange<shard_id>(0, _shard_count)) {
_shards_agreeing_to_delete_sstable[sst_to_delete.name].insert(shard);
}
}
new_atomic_deletion_sets.emplace(sst_to_delete.name, merged_set);
}
auto pr = make_lw_shared<promise<>>();
merged_set->completions.insert(pr);
auto ret = pr->get_future();
for (auto&& sst_to_delete : atomic_deletion_set) {
auto i = _atomic_deletion_sets.find(sst_to_delete.name);
// merge from old deletion set to new deletion set
// i->second can be nullptr, see below why
if (i != _atomic_deletion_sets.end() && i->second) {
boost::copy(i->second->names, std::inserter(merged_set->names, merged_set->names.end()));
boost::copy(i->second->completions, std::inserter(merged_set->completions, merged_set->completions.end()));
}
}
_deletion_logger.debug("new atomic set: {}", merged_set->names);
// we need to merge new_atomic_deletion_sets into _atomic_deletion_sets,
// but beware of exceptions. We do that with a first pass that inserts
// nullptr as the value, so the second pass only replaces, and does not allocate
for (auto&& sst_to_delete : atomic_deletion_set) {
_atomic_deletion_sets.emplace(sst_to_delete.name, nullptr);
}
// now, no allocations are involved, so this commits the operation atomically
for (auto&& n : merged_set->names) {
auto i = _atomic_deletion_sets.find(n);
i->second = merged_set;
}
// Mark each sstable as being deleted from deleting_shard. We have to do
// this in a separate pass, so that the decision of whether we can delete
// sees all the data from this pass.
for (auto&& sst : atomic_deletion_set) {
_shards_agreeing_to_delete_sstable[sst.name].insert(deleting_shard);
}
// Figure out if the (possibly merged) set can be deleted
for (auto&& sst : merged_set->names) {
if (_shards_agreeing_to_delete_sstable[sst].size() != _shard_count) {
// Not everyone agrees, leave the set pending
_deletion_logger.debug("deferring deletion until all shards agree");
return ret;
}
}
// Cannot recover from a failed deletion
for (auto&& name : merged_set->names) {
_atomic_deletion_sets.erase(name);
_shards_agreeing_to_delete_sstable.erase(name);
}
// Everyone agrees, let's delete
auto names = boost::copy_range<std::vector<sstring>>(merged_set->names);
_deletion_logger.debug("deleting {}", names);
_delete_sstables(names).then_wrapped([this, merged_set] (future<> result) {
_deletion_logger.debug("atomic deletion completed: {}", merged_set->names);
shared_future<> sf(std::move(result));
for (auto&& comp : merged_set->completions) {
sf.get_future().forward_to(std::move(*comp));
}
});
return ret;
}
void
atomic_deletion_manager::cancel_atomic_deletions() {
_atomic_deletions_cancelled = true;
for (auto&& pd : _atomic_deletion_sets) {
if (!pd.second) {
// Could happen if a delete_atomically() failed
continue;
}
for (auto&& c : pd.second->completions) {
c->set_exception(atomic_deletion_cancelled(pd.second->names));
}
// since sets are shared, make sure we don't hit the same one again
pd.second->completions.clear();
}
_atomic_deletion_sets.clear();
_shards_agreeing_to_delete_sstable.clear();
}
}

View File

@@ -0,0 +1,92 @@
/*
* Copyright (C) 2016 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
// The atomic deletion manager solves the problem of orchestrating
// the deletion of files that must be deleted as a group, where each
// shard may group the files differently, and a file is only removed
// once every shard has asked to delete it. For example,
//
// shard 0: delete "A"
// we can't delete anything because shard 1 hasn't agreed yet.
// shard 1: delete "A" and "B"
// shard 1 agrees to delete "A", but we can't delete it yet,
// because shard 1 requires that it be deleted together with "B",
// and shard 0 hasn't agreed to delete "B" yet.
// shard 0: delete "B" and "C"
// shards 0 and 1 now both agree to delete "A" and "B", but shard 0
// doesn't allow us to delete "B" without "C".
// shard 1: delete "C"
// finally, we can delete "A", "B", and "C".
#include "log.hh"
#include <seastar/core/future.hh>
#include <seastar/core/future-util.hh>
#include <seastar/core/shared_ptr.hh>
#include <seastar/core/sstring.hh>
#include <seastar/core/reactor.hh> // for shard_id
#include <set>
#include <unordered_set>
#include <unordered_map>
#include <vector>
namespace sstables {
struct sstable_to_delete {
sstable_to_delete(sstring name, bool shared) : name(std::move(name)), shared(shared) {}
sstring name;
bool shared = false;
friend std::ostream& operator<<(std::ostream& os, const sstable_to_delete& std);
};
class atomic_deletion_cancelled : public std::exception {
std::string _msg;
public:
explicit atomic_deletion_cancelled(std::vector<sstring> names);
template <typename StringRange>
explicit atomic_deletion_cancelled(StringRange range)
: atomic_deletion_cancelled(std::vector<sstring>{range.begin(), range.end()}) {
}
const char* what() const noexcept override;
};
class atomic_deletion_manager {
logging::logger _deletion_logger{"sstable-deletion"};
using shards_agreeing_to_delete_sstable_type = std::unordered_set<shard_id>;
using sstables_to_delete_atomically_type = std::set<sstring>;
struct pending_deletion {
sstables_to_delete_atomically_type names;
std::unordered_set<lw_shared_ptr<promise<>>> completions;
};
bool _atomic_deletions_cancelled = false;
// map from sstable name to a set of sstables that must be deleted atomically, including itself
std::unordered_map<sstring, lw_shared_ptr<pending_deletion>> _atomic_deletion_sets;
std::unordered_map<sstring, shards_agreeing_to_delete_sstable_type> _shards_agreeing_to_delete_sstable;
unsigned _shard_count;
std::function<future<> (std::vector<sstring> sstables)> _delete_sstables;
public:
atomic_deletion_manager(unsigned shard_count,
std::function<future<> (std::vector<sstring> sstables)> delete_sstables);
future<> delete_atomically(std::vector<sstable_to_delete> atomic_deletion_set, unsigned deleting_shard);
void cancel_atomic_deletions();
};
}
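The per-shard agreement rule spelled out in the header comment above can be modeled in isolation. This is an illustrative sketch with invented names; it only tracks which shards have asked to delete which files and whether a file's whole (transitively grouped) set is agreed on, without merging pending sets, promises, or actual deletion:

```cpp
#include <cassert>
#include <map>
#include <set>
#include <string>
#include <vector>

// Toy model of the agreement protocol: a file may be deleted only once
// every shard has requested it, and files named together in one request
// must be deleted together (transitively).
class deletion_agreement {
    unsigned _shard_count;
    std::map<std::string, std::set<unsigned>> _agreeing;       // file -> shards
    std::map<std::string, std::set<std::string>> _group;       // file -> co-deleted files
public:
    explicit deletion_agreement(unsigned shard_count) : _shard_count(shard_count) {}

    // A shard asks to delete `files` as one atomic group.
    void request(unsigned shard, const std::set<std::string>& files) {
        for (const auto& f : files) {
            _agreeing[f].insert(shard);
            _group[f].insert(files.begin(), files.end());
        }
    }

    // True once every file transitively grouped with `f` has been requested
    // for deletion by all shards.
    bool deletable(const std::string& f) const {
        if (_group.find(f) == _group.end()) {
            return false;
        }
        std::set<std::string> closure{f};
        std::vector<std::string> work{f};
        while (!work.empty()) {         // walk the transitive group of f
            auto cur = work.back();
            work.pop_back();
            for (const auto& other : _group.at(cur)) {
                if (closure.insert(other).second) {
                    work.push_back(other);
                }
            }
        }
        for (const auto& name : closure) {
            auto it = _agreeing.find(name);
            if (it == _agreeing.end() || it->second.size() != _shard_count) {
                return false;
            }
        }
        return true;
    }
};
```

Replaying the header comment's scenario with two shards reproduces its conclusions: "A" stays pending until shard 1 finally requests "C", at which point the whole merged group {"A", "B", "C"} becomes deletable.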

View File

@@ -71,6 +71,12 @@ void compression::set_compressor(compressor c) {
}
}
// locate() takes a byte position in the uncompressed stream, and finds
// the location of the compressed chunk on disk which contains it, and the
// offset in this chunk.
// locate() may only be used for offsets of actual bytes, and in particular
// the end-of-file position (one past the last byte) MUST not be used. If the
// caller wants to read from the end of file, it should simply read nothing.
compression::chunk_and_offset
compression::locate(uint64_t position) const {
auto ucl = uncompressed_chunk_length();
@@ -121,7 +127,11 @@ size_t compress_lz4(const char* input, size_t input_len,
output[1] = (input_len >> 8) & 0xFF;
output[2] = (input_len >> 16) & 0xFF;
output[3] = (input_len >> 24) & 0xFF;
#ifdef HAVE_LZ4_COMPRESS_DEFAULT
auto ret = LZ4_compress_default(input, output + 4, input_len, LZ4_compressBound(input_len));
#else
auto ret = LZ4_compress(input, output + 4, input_len);
#endif
if (ret == 0) {
throw std::runtime_error("LZ4 compression failure: LZ4_compress() failed");
}
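The four assignments above store the uncompressed length as a 4-byte little-endian prefix ahead of the LZ4 payload, so the decompressor can recover the original length and size its output buffer. A small round-trip sketch of that prefix encoding (helper names are illustrative):

```cpp
#include <cassert>
#include <cstdint>

// Little-endian 4-byte length prefix, matching the byte shifts above.
static void put_len(unsigned char* out, uint32_t len) {
    out[0] = len & 0xFF;
    out[1] = (len >> 8) & 0xFF;
    out[2] = (len >> 16) & 0xFF;
    out[3] = (len >> 24) & 0xFF;
}

// Inverse: reassemble the length from the four prefix bytes.
static uint32_t get_len(const unsigned char* in) {
    return uint32_t(in[0]) | (uint32_t(in[1]) << 8)
         | (uint32_t(in[2]) << 16) | (uint32_t(in[3]) << 24);
}
```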
@@ -306,6 +316,9 @@ public:
virtual future<temporary_buffer<char>> skip(uint64_t n) override {
_pos += n;
assert(_pos <= _end_pos);
if (_pos == _end_pos) {
return make_ready_future<temporary_buffer<char>>();
}
auto addr = _compression_metadata->locate(_pos);
auto underlying_n = addr.chunk_start - _underlying_pos;
_underlying_pos = addr.chunk_start;
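For context on the skip()-to-end-of-file fix above: locate() maps an uncompressed byte position to its compressed chunk's on-disk start plus an offset within the chunk. A simplified model assuming fixed-size uncompressed chunks (names are illustrative; the real metadata comes from the sstable's CompressionInfo component), showing why the one-past-end position must short-circuit before locate():

```cpp
#include <cassert>
#include <cstdint>
#include <stdexcept>
#include <vector>

struct chunk_and_offset {
    uint64_t chunk_start; // on-disk offset of the compressed chunk
    uint64_t offset;      // offset within the uncompressed chunk
};

struct compression_metadata {
    uint64_t uncompressed_chunk_length;
    uint64_t uncompressed_file_length;
    std::vector<uint64_t> chunk_starts; // on-disk start of each compressed chunk

    // Valid only for positions of actual bytes: the one-past-end position
    // has no containing chunk, which is why the fixed skip() above returns
    // early instead of calling locate() there.
    chunk_and_offset locate(uint64_t position) const {
        if (position >= uncompressed_file_length) {
            throw std::out_of_range("locate() past end of file");
        }
        auto chunk = position / uncompressed_chunk_length;
        return {chunk_starts[chunk], position % uncompressed_chunk_length};
    }
};
```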

View File

@@ -305,7 +305,7 @@ public:
_remain = end - _stream_position;
_prestate = prestate::NONE;
state_processor().reset();
state_processor().reset(begin);
return _input.skip(n);
}

View File

@@ -38,7 +38,7 @@ public:
bool should_continue() {
return indexes.size() < max_quantity;
}
void consume_entry(index_entry&& ie) {
void consume_entry(index_entry&& ie, uint64_t offset) {
indexes.push_back(std::move(ie));
}
void reset() {
@@ -49,13 +49,14 @@ public:
// IndexConsumer is a concept that implements:
//
// bool should_continue();
// void consume_entry(index_entry&& ie);
// void consume_entry(index_entry&& ie, uint64_t offset);
template <class IndexConsumer>
class index_consume_entry_context: public data_consumer::continuous_data_consumer<index_consume_entry_context<IndexConsumer>> {
using proceed = data_consumer::proceed;
using continuous_data_consumer = data_consumer::continuous_data_consumer<index_consume_entry_context<IndexConsumer>>;
private:
IndexConsumer& _consumer;
uint64_t _entry_offset;
enum class state {
START,
@@ -113,9 +114,12 @@ public:
_state = state::CONSUME_ENTRY;
break;
}
case state::CONSUME_ENTRY:
_consumer.consume_entry(index_entry(std::move(_key), this->_u64, std::move(_promoted)));
case state::CONSUME_ENTRY: {
// on-disk entry length: 2-byte key length + key + 8-byte position
// + 4-byte promoted index length + promoted index
auto len = _key.size() + _promoted.size() + 14;
_consumer.consume_entry(index_entry(std::move(_key), this->_u64, std::move(_promoted)), _entry_offset);
_entry_offset += len;
_state = state::START;
}
break;
default:
throw malformed_sstable_exception("unknown state");
@@ -126,11 +130,12 @@ public:
index_consume_entry_context(IndexConsumer& consumer,
input_stream<char>&& input, uint64_t start, uint64_t maxlen)
: continuous_data_consumer(std::move(input), start, maxlen)
, _consumer(consumer)
, _consumer(consumer), _entry_offset(start)
{}
void reset() {
void reset(uint64_t offset) {
_state = state::START;
_entry_offset = offset;
_consumer.reset();
}
};

View File

@@ -374,7 +374,7 @@ public:
}
}
void reset() {
void reset(uint64_t offset) {
_state = state::ROW_START;
_consumer.reset();
}

View File

@@ -741,10 +741,10 @@ future<> sstable::read_toc() {
continue;
}
try {
_components.insert(reverse_map(c, _component_map));
_components.insert(reverse_map(c, _component_map));
} catch (std::out_of_range& oor) {
_components.clear(); // so subsequent read_toc will be forced to fail again
throw malformed_sstable_exception("Unrecognized TOC component: " + c, file_path);
_unrecognized_components.push_back(c);
sstlog.info("Unrecognized TOC component was found: {} in sstable {}", c, file_path);
}
}
if (!_components.size()) {
@@ -1598,7 +1598,7 @@ file_writer components_writer::index_file_writer(sstable& sst, const io_priority
options.buffer_size = sst.sstable_buffer_size;
options.io_priority_class = pc;
options.write_behind = 10;
return file_writer(sst._index_file, std::move(options));
return file_writer(std::move(sst._index_file), std::move(options));
}
// Get the currently loaded configuration, or the default configuration in
@@ -1750,7 +1750,6 @@ void components_writer::consume_end_of_stream() {
seal_summary(_sst._summary, std::move(_first_key), std::move(_last_key)); // what if there is only one partition? what if it is empty?
_index.close().get();
_sst._index_file = file(); // index->close() closed _index_file
if (_sst.has_component(sstable::component_type::CompressionInfo)) {
_sst._collector.add_compression_ratio(_sst._compression.compressed_file_length(), _sst._compression.uncompressed_file_length());
@@ -1773,20 +1772,20 @@ void sstable_writer::prepare_file_writer()
options.write_behind = 10;
if (!_compression_enabled) {
_writer = make_shared<checksummed_file_writer>(_sst._data_file, std::move(options), true);
_writer = std::make_unique<checksummed_file_writer>(std::move(_sst._data_file), std::move(options), true);
} else {
prepare_compression(_sst._compression, _schema);
_writer = make_shared<file_writer>(make_compressed_file_output_stream(_sst._data_file, std::move(options), &_sst._compression));
_writer = std::make_unique<file_writer>(make_compressed_file_output_stream(std::move(_sst._data_file), std::move(options), &_sst._compression));
}
}
void sstable_writer::finish_file_writer()
{
_writer->close().get();
_sst._data_file = file(); // w->close() closed _data_file
auto writer = std::move(_writer);
writer->close().get();
if (!_compression_enabled) {
auto chksum_wr = static_pointer_cast<checksummed_file_writer>(_writer);
auto chksum_wr = static_cast<checksummed_file_writer*>(writer.get());
write_digest(_sst.filename(sstable::component_type::Digest), chksum_wr->full_checksum());
write_crc(_sst.filename(sstable::component_type::CRC), chksum_wr->finalize_checksum());
} else {
@@ -1794,6 +1793,16 @@ void sstable_writer::finish_file_writer()
}
}
sstable_writer::~sstable_writer() {
if (_writer) {
try {
_writer->close().get();
} catch (...) {
sstlog.error("sstable_writer failed to close file: {}", std::current_exception());
}
}
}
sstable_writer::sstable_writer(sstable& sst, const schema& s, uint64_t estimated_partitions,
uint64_t max_sstable_size, bool backup, bool leave_unsealed, const io_priority_class& pc)
: _sst(sst)
@@ -1867,8 +1876,8 @@ future<> sstable::generate_summary(const io_priority_class& pc) {
bool should_continue() {
return true;
}
void consume_entry(index_entry&& ie) {
maybe_add_summary_entry(_summary, ie.get_key_bytes(), ie.position());
void consume_entry(index_entry&& ie, uint64_t offset) {
maybe_add_summary_entry(_summary, ie.get_key_bytes(), offset);
if (!first_key) {
first_key = key(to_bytes(ie.get_key_bytes()));
} else {
@@ -1957,6 +1966,28 @@ const sstring sstable::filename(sstring dir, sstring ks, sstring cf, version_typ
return dir + "/" + strmap[version](entry_descriptor(ks, cf, version, generation, format, component));
}
const sstring sstable::filename(sstring dir, sstring ks, sstring cf, version_types version, int64_t generation,
format_types format, sstring component) {
static std::unordered_map<version_types, const char*, enum_hash<version_types>> fmtmap = {
{ sstable::version_types::ka, "{0}-{1}-{2}-{3}-{5}" },
{ sstable::version_types::la, "{2}-{3}-{4}-{5}" }
};
return dir + "/" + seastar::format(fmtmap[version], ks, cf, _version_string.at(version), to_sstring(generation), _format_string.at(format), component);
}
std::vector<std::pair<sstable::component_type, sstring>> sstable::all_components() const {
std::vector<std::pair<component_type, sstring>> all;
all.reserve(_components.size() + _unrecognized_components.size());
for (auto& c : _components) {
all.push_back(std::make_pair(c, _component_map.at(c)));
}
for (auto& c : _unrecognized_components) {
all.push_back(std::make_pair(component_type::Unknown, c));
}
return all;
}
future<> sstable::create_links(sstring dir, int64_t generation) const {
// TemporaryTOC is always first, TOC is always last
auto dst = sstable::filename(dir, _schema->ks_name(), _schema->cf_name(), _version, generation, _format, component_type::TemporaryTOC);
@@ -1964,12 +1995,13 @@ future<> sstable::create_links(sstring dir, int64_t generation) const {
return sstable_write_io_check(sync_directory, dir);
}).then([this, dir, generation] {
// FIXME: Should clean already-created links if we failed midway.
return parallel_for_each(_components, [this, dir, generation] (auto comp) {
if (comp == component_type::TOC) {
return parallel_for_each(all_components(), [this, dir, generation] (auto p) {
if (p.first == component_type::TOC) {
return make_ready_future<>();
}
auto dst = sstable::filename(dir, _schema->ks_name(), _schema->cf_name(), _version, generation, _format, comp);
return sstable_write_io_check(::link_file, this->filename(comp), dst);
auto src = sstable::filename(_dir, _schema->ks_name(), _schema->cf_name(), _version, _generation, _format, p.second);
auto dst = sstable::filename(dir, _schema->ks_name(), _schema->cf_name(), _version, generation, _format, p.second);
return sstable_write_io_check(::link_file, std::move(src), std::move(dst));
});
}).then([dir] {
return sstable_write_io_check(sync_directory, dir);
@@ -1989,11 +2021,11 @@ future<> sstable::set_generation(int64_t new_generation) {
return remove_file(filename(component_type::TOC)).then([this] {
return sstable_write_io_check(sync_directory, _dir);
}).then([this] {
return parallel_for_each(_components, [this] (auto comp) {
if (comp == component_type::TOC) {
return parallel_for_each(all_components(), [this] (auto p) {
if (p.first == component_type::TOC) {
return make_ready_future<>();
}
return remove_file(this->filename(comp));
return remove_file(sstable::filename(_dir, _schema->ks_name(), _schema->cf_name(), _version, _generation, _format, p.second));
});
});
}).then([this, new_generation] {
@@ -2047,7 +2079,11 @@ sstable::format_types sstable::format_from_sstring(sstring &s) {
}
sstable::component_type sstable::component_from_sstring(sstring &s) {
return reverse_map(s, _component_map);
try {
return reverse_map(s, _component_map);
} catch (std::out_of_range&) {
return component_type::Unknown;
}
}
input_stream<char> sstable::data_stream(uint64_t pos, size_t len, const io_priority_class& pc, lw_shared_ptr<file_input_stream_history> history) {
@@ -2240,8 +2276,11 @@ remove_by_toc_name(sstring sstable_toc_name) {
dir = dirname(sstable_toc_name);
sstable_write_io_check(rename_file, sstable_toc_name, new_toc_name).get();
sstable_write_io_check(fsync_directory, dir).get();
} else {
} else if (sstable_write_io_check(file_exists, new_toc_name).get0()) {
dir = dirname(new_toc_name);
} else {
sstlog.warn("Unable to delete {} because it doesn't exist.", sstable_toc_name);
return;
}
auto toc_file = open_checked_file_dma(sstable_read_error, new_toc_name, open_flags::ro).get0();
@@ -2427,107 +2466,21 @@ operator<<(std::ostream& os, const sstable_to_delete& std) {
return os << std.name << "(" << (std.shared ? "shared" : "unshared") << ")";
}
using shards_agreeing_to_delete_sstable_type = std::unordered_set<shard_id>;
using sstables_to_delete_atomically_type = std::set<sstring>;
struct pending_deletion {
sstables_to_delete_atomically_type names;
std::vector<lw_shared_ptr<promise<>>> completions;
};
static thread_local bool g_atomic_deletions_cancelled = false;
static thread_local std::list<lw_shared_ptr<pending_deletion>> g_atomic_deletion_sets;
static thread_local std::unordered_map<sstring, shards_agreeing_to_delete_sstable_type> g_shards_agreeing_to_delete_sstable;
static logging::logger deletion_logger("sstable-deletion");
static
future<>
do_delete_atomically(std::vector<sstable_to_delete> atomic_deletion_set, unsigned deleting_shard) {
// runs on shard 0 only
deletion_logger.debug("shard {} atomically deleting {}", deleting_shard, atomic_deletion_set);
if (g_atomic_deletions_cancelled) {
deletion_logger.debug("atomic deletions disabled, erroring out");
using boost::adaptors::transformed;
throw atomic_deletion_cancelled(atomic_deletion_set
| transformed(std::mem_fn(&sstable_to_delete::name)));
}
// Insert atomic_deletion_set into the list of sets pending deletion. If the new set
// overlaps with an existing set, merge them (the merged set will be deleted atomically).
std::list<lw_shared_ptr<pending_deletion>> new_atomic_deletion_sets;
auto merged_set = make_lw_shared(pending_deletion());
for (auto&& sst_to_delete : atomic_deletion_set) {
merged_set->names.insert(sst_to_delete.name);
if (!sst_to_delete.shared) {
for (auto shard : boost::irange<shard_id>(0, smp::count)) {
g_shards_agreeing_to_delete_sstable[sst_to_delete.name].insert(shard);
}
}
}
merged_set->completions.push_back(make_lw_shared<promise<>>());
auto ret = merged_set->completions.back()->get_future();
for (auto&& old_set : g_atomic_deletion_sets) {
auto intersection = sstables_to_delete_atomically_type();
boost::set_intersection(merged_set->names, old_set->names, std::inserter(intersection, intersection.end()));
if (intersection.empty()) {
// We copy old_set to avoid corrupting g_atomic_deletion_sets if we fail
// further on.
new_atomic_deletion_sets.push_back(old_set);
} else {
deletion_logger.debug("merging with {}", old_set->names);
merged_set->names.insert(old_set->names.begin(), old_set->names.end());
boost::push_back(merged_set->completions, old_set->completions);
}
}
deletion_logger.debug("new atomic set: {}", merged_set->names);
new_atomic_deletion_sets.push_back(merged_set);
// can now exception-safely commit:
g_atomic_deletion_sets = std::move(new_atomic_deletion_sets);
// Mark each sstable as being deleted from deleting_shard. We have to do
// this in a separate pass, so the consideration whether we can delete or not
// sees all the data from this pass.
for (auto&& sst : atomic_deletion_set) {
g_shards_agreeing_to_delete_sstable[sst.name].insert(deleting_shard);
}
// Figure out if the (possibly merged) set can be deleted
for (auto&& sst : merged_set->names) {
if (g_shards_agreeing_to_delete_sstable[sst].size() != smp::count) {
// Not everyone agrees, leave the set pending
deletion_logger.debug("deferring deletion until all shards agree");
return ret;
}
}
// Cannot recover from a failed deletion
g_atomic_deletion_sets.pop_back();
for (auto&& name : merged_set->names) {
g_shards_agreeing_to_delete_sstable.erase(name);
}
// Everyone agrees, let's delete
delete_sstables(std::vector<sstring> tocs) {
// FIXME: this needs to be done atomically (using a log file of sstables we intend to delete)
parallel_for_each(merged_set->names, [] (sstring name) {
deletion_logger.debug("deleting {}", name);
return parallel_for_each(tocs, [] (sstring name) {
return remove_by_toc_name(name);
}).then_wrapped([merged_set] (future<> result) {
deletion_logger.debug("atomic deletion completed: {}", merged_set->names);
shared_future<> sf(std::move(result));
for (auto&& comp : merged_set->completions) {
sf.get_future().forward_to(std::move(*comp));
}
});
return ret;
}
static thread_local atomic_deletion_manager g_atomic_deletion_manager(smp::count, delete_sstables);
future<>
delete_atomically(std::vector<sstable_to_delete> ssts) {
auto shard = engine().cpu_id();
return smp::submit_to(0, [=] {
return do_delete_atomically(ssts, shard);
return g_atomic_deletion_manager.delete_atomically(ssts, shard);
});
}
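The set-merging step commented above (a new atomic-deletion set that overlaps any pending set is coalesced with it, so the union is deleted — or fails — together) can be sketched in isolation. `merge_pending` and `name_set` below are illustrative names, not the Scylla API:

```cpp
#include <set>
#include <string>
#include <vector>
#include <algorithm>
#include <iterator>

using name_set = std::set<std::string>;

// Merge a new deletion set into the pending list: any pending set that
// intersects the incoming one is absorbed into it, so overlapping
// deletions complete atomically as one unit; disjoint sets are kept.
std::vector<name_set> merge_pending(std::vector<name_set> pending, name_set incoming) {
    std::vector<name_set> result;
    for (auto& old_set : pending) {
        name_set intersection;
        std::set_intersection(incoming.begin(), incoming.end(),
                              old_set.begin(), old_set.end(),
                              std::inserter(intersection, intersection.end()));
        if (intersection.empty()) {
            result.push_back(std::move(old_set));            // disjoint: keep as-is
        } else {
            incoming.insert(old_set.begin(), old_set.end()); // overlap: absorb
        }
    }
    result.push_back(std::move(incoming));
    return result;
}
```

As in `do_delete_atomically()`, the new list is built on the side and only committed once merging succeeds, which is what makes the operation exception-safe.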
@@ -2540,16 +2493,8 @@ delete_atomically(std::vector<shared_sstable> ssts) {
return delete_atomically(std::move(sstables_to_delete_atomically));
}
void
cancel_atomic_deletions() {
g_atomic_deletions_cancelled = true;
for (auto&& pd : g_atomic_deletion_sets) {
for (auto&& c : pd->completions) {
c->set_exception(atomic_deletion_cancelled(pd->names));
}
}
g_atomic_deletion_sets.clear();
g_shards_agreeing_to_delete_sstable.clear();
void cancel_atomic_deletions() {
g_atomic_deletion_manager.cancel_atomic_deletions();
}
atomic_deletion_cancelled::atomic_deletion_cancelled(std::vector<sstring> names)



@@ -48,6 +48,7 @@
#include "mutation_reader.hh"
#include "query-request.hh"
#include "compound_compat.hh"
#include "atomic_deletion.hh"
namespace sstables {
@@ -130,6 +131,7 @@ public:
Statistics,
TemporaryTOC,
TemporaryStatistics,
Unknown,
};
enum class version_types { ka, la };
enum class format_types { big };
@@ -221,6 +223,8 @@ public:
static format_types format_from_sstring(sstring& s);
static const sstring filename(sstring dir, sstring ks, sstring cf, version_types version, int64_t generation,
format_types format, component_type component);
static const sstring filename(sstring dir, sstring ks, sstring cf, version_types version, int64_t generation,
format_types format, sstring component);
// WARNING: it should only be called to remove components of a sstable with
// a temporary TOC file.
static future<> remove_sstable_with_temp_toc(sstring ks, sstring cf, sstring dir, int64_t generation,
@@ -358,6 +362,8 @@ public:
return _collector;
}
std::vector<std::pair<component_type, sstring>> all_components() const;
future<> create_links(sstring dir, int64_t generation) const;
future<> create_links(sstring dir) const {
@@ -394,6 +400,7 @@ private:
static std::unordered_map<component_type, sstring, enum_hash<component_type>> _component_map;
std::unordered_set<component_type, enum_hash<component_type>> _components;
std::vector<sstring> _unrecognized_components;
bool _shared = true; // across shards; safe default
compression _compression;
@@ -688,14 +695,6 @@ future<> await_background_jobs();
// Invokes await_background_jobs() on all shards
future<> await_background_jobs_on_all_shards();
struct sstable_to_delete {
sstable_to_delete(sstring name, bool shared) : name(std::move(name)), shared(shared) {}
sstring name;
bool shared = false;
friend std::ostream& operator<<(std::ostream& os, const sstable_to_delete& std);
};
// When we compact sstables, we have to atomically instantiate the new
// sstable and delete the old ones. Otherwise, if we compact A+B into C,
// and if A contained some data that was tombstoned by B, and if B was
@@ -714,17 +713,6 @@ struct sstable_to_delete {
future<> delete_atomically(std::vector<shared_sstable> ssts);
future<> delete_atomically(std::vector<sstable_to_delete> ssts);
class atomic_deletion_cancelled : public std::exception {
std::string _msg;
public:
explicit atomic_deletion_cancelled(std::vector<sstring> names);
template <typename StringRange>
explicit atomic_deletion_cancelled(StringRange range)
: atomic_deletion_cancelled(std::vector<sstring>{range.begin(), range.end()}) {
}
const char* what() const noexcept override;
};
// Cancel any deletions scheduled by delete_atomically() and make their
// futures complete (with an atomic_deletion_cancelled exception).
void cancel_atomic_deletions();
@@ -769,7 +757,7 @@ class sstable_writer {
bool _backup;
bool _leave_unsealed;
bool _compression_enabled;
shared_ptr<file_writer> _writer;
std::unique_ptr<file_writer> _writer;
stdx::optional<components_writer> _components_writer;
private:
void prepare_file_writer();
@@ -777,6 +765,10 @@ private:
public:
sstable_writer(sstable& sst, const schema& s, uint64_t estimated_partitions,
uint64_t max_sstable_size, bool backup, bool leave_unsealed, const io_priority_class& pc);
~sstable_writer();
sstable_writer(sstable_writer&& o) : _sst(o._sst), _schema(o._schema), _pc(o._pc), _backup(o._backup),
_leave_unsealed(o._leave_unsealed), _compression_enabled(o._compression_enabled), _writer(std::move(o._writer)),
_components_writer(std::move(o._components_writer)) {}
void consume_new_partition(const dht::decorated_key& dk) { return _components_writer->consume_new_partition(dk); }
void consume(tombstone t) { _components_writer->consume(t); }
stop_iteration consume(static_row&& sr) { return _components_writer->consume(std::move(sr)); }


@@ -39,6 +39,7 @@ boost_tests = [
'storage_proxy_test',
'schema_change_test',
'sstable_mutation_test',
'sstable_atomic_deletion_test',
'commitlog_test',
'hash_test',
'test-serialization',


@@ -53,8 +53,10 @@ static future<> cl_test(commitlog::config cfg, Func && f) {
cfg.commit_log_location = tmp.path;
return commitlog::create_commitlog(cfg).then([f = std::forward<Func>(f)](commitlog log) mutable {
return do_with(std::move(log), [f = std::forward<Func>(f)](commitlog& log) {
return futurize<std::result_of_t<Func(commitlog&)>>::apply(f, log).finally([&log] {
return log.clear();
return futurize_apply(f, log).finally([&log] {
return log.shutdown().then([&log] {
return log.clear();
});
});
});
}).finally([tmp = std::move(tmp)] {
@@ -277,6 +279,21 @@ SEASTAR_TEST_CASE(test_commitlog_delete_when_over_disk_limit) {
}
SEASTAR_TEST_CASE(test_commitlog_reader){
static auto count_mutations_in_segment = [] (sstring path) -> future<size_t> {
auto count = make_lw_shared<size_t>(0);
return db::commitlog::read_log_file(path, [count](temporary_buffer<char> buf, db::replay_position rp) {
sstring str(buf.get(), buf.size());
BOOST_CHECK_EQUAL(str, "hej bubba cow");
(*count)++;
return make_ready_future<>();
}).then([](auto s) {
return do_with(std::move(s), [](auto& s) {
return s->done();
});
}).then([count] {
return *count;
});
};
commitlog::config cfg;
cfg.commitlog_segment_size_in_mb = 1;
return cl_test(cfg, [](commitlog& log) {
@@ -309,18 +326,19 @@ SEASTAR_TEST_CASE(test_commitlog_reader){
if (i == segments.end()) {
throw std::runtime_error("Did not find expected log file");
}
return db::commitlog::read_log_file(*i, [count2](temporary_buffer<char> buf, db::replay_position rp) {
sstring str(buf.get(), buf.size());
BOOST_CHECK_EQUAL(str, "hej bubba cow");
(*count2)++;
return make_ready_future<>();
}).then([](auto s) {
return do_with(std::move(s), [](auto& s) {
return s->done();
});
return *i;
}).then([&log, count] (sstring segment_path) {
// Check reading from an unsynced segment
return count_mutations_in_segment(segment_path).then([count] (size_t replay_count) {
BOOST_CHECK_GE(*count, replay_count);
}).then([&log, count, segment_path] {
return log.sync_all_segments().then([count, segment_path] {
// Check reading from a synced segment
return count_mutations_in_segment(segment_path).then([count] (size_t replay_count) {
BOOST_CHECK_EQUAL(*count, replay_count);
});
}).then([count, count2] {
BOOST_CHECK_EQUAL(*count, *count2);
});
});
});
});
}


@@ -30,7 +30,9 @@
#include <seastar/core/timer.hh>
#include <seastar/core/sleep.hh>
#include <seastar/tests/test-utils.hh>
#include <seastar/util/defer.hh>
#include <deque>
#include "utils/phased_barrier.hh"
#include "utils/logalloc.hh"
#include "utils/managed_ref.hh"
@@ -530,11 +532,7 @@ inline void quiesce(FutureType&& fut) {
// a request may be broken into many continuations. While we could just yield many times, the
// exact amount needed to guarantee execution would be dependent on the internals of the
// implementation, we want to avoid that.
timer<> tmr;
tmr.set_callback([] { BOOST_FAIL("The future we were waiting for took too long to get ready"); });
tmr.arm(2s);
fut.get();
tmr.cancel();
with_timeout(lowres_clock::now() + 2s, std::move(fut)).get();
}
// Simple RAII structure that wraps around a region_group
@@ -860,15 +858,22 @@ class test_reclaimer: public region_group_reclaimer {
region_group _rg;
std::vector<size_t> _reclaim_sizes;
bool _shutdown = false;
shared_promise<> _unleash_reclaimer;
seastar::gate _reclaimers_done;
public:
virtual void start_reclaiming() override {
while (this->under_pressure()) {
size_t reclaimed = test_async_reclaim_region::from_region(_rg.get_largest_region()).evict();
_result_accumulator->_reclaim_sizes.push_back(reclaimed);
}
virtual void start_reclaiming() noexcept override {
with_gate(_reclaimers_done, [this] {
return _unleash_reclaimer.get_shared_future().then([this] {
while (this->under_pressure()) {
size_t reclaimed = test_async_reclaim_region::from_region(_rg.get_largest_region()).evict();
_result_accumulator->_reclaim_sizes.push_back(reclaimed);
}
});
});
}
~test_reclaimer() {
_reclaimers_done.close().get();
_rg.shutdown().get();
}
@@ -882,6 +887,10 @@ public:
test_reclaimer(size_t threshold) : region_group_reclaimer(threshold), _result_accumulator(this), _rg(*this) {}
test_reclaimer(test_reclaimer& parent, size_t threshold) : region_group_reclaimer(threshold), _result_accumulator(&parent), _rg(&parent._rg, *this) {}
void unleash() {
_unleash_reclaimer.set_value();
}
};
SEASTAR_TEST_CASE(test_region_groups_basic_throttling_simple_active_reclaim) {
@@ -889,6 +898,7 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_simple_active_reclaim) {
// allocate a single region to exhaustion, and make sure active reclaim is activated.
test_reclaimer simple(logalloc::segment_size);
test_async_reclaim_region simple_region(simple.rg(), logalloc::segment_size);
simple.unleash();
// Can't run this function until we have reclaimed something
auto fut = simple.rg().run_when_memory_available([] {});
@@ -913,6 +923,7 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_worst_offen
test_async_reclaim_region small_region(simple.rg(), logalloc::segment_size);
test_async_reclaim_region medium_region(simple.rg(), 2 * logalloc::segment_size);
test_async_reclaim_region big_region(simple.rg(), 3 * logalloc::segment_size);
simple.unleash();
// Can't run this function until we have reclaimed
auto fut = simple.rg().run_when_memory_available([&simple] {
@@ -942,6 +953,9 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_leaf_offend
test_async_reclaim_region small_region(small_leaf.rg(), logalloc::segment_size);
test_async_reclaim_region medium_region(root.rg(), 2 * logalloc::segment_size);
test_async_reclaim_region big_region(large_leaf.rg(), 3 * logalloc::segment_size);
root.unleash();
large_leaf.unleash();
small_leaf.unleash();
// Can't run this function until we have reclaimed. Try at the root, and we'll make sure
// that the leaves are forced correctly.
@@ -968,6 +982,8 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_ancestor_bl
test_reclaimer leaf(root, logalloc::segment_size);
test_async_reclaim_region root_region(root.rg(), logalloc::segment_size);
root.unleash();
leaf.unleash();
// Can't run this function until we have reclaimed. Try at the leaf, and we'll make sure
// that the root reclaims
@@ -993,6 +1009,8 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_big_region_
test_async_reclaim_region root_region(root.rg(), 4 * logalloc::segment_size);
test_async_reclaim_region big_leaf_region(leaf.rg(), 3 * logalloc::segment_size);
test_async_reclaim_region small_leaf_region(leaf.rg(), 2 * logalloc::segment_size);
root.unleash();
leaf.unleash();
auto fut = root.rg().run_when_memory_available([&root] {
BOOST_REQUIRE_EQUAL(root.reclaim_sizes().size(), 3);
@@ -1019,6 +1037,8 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_no_double_r
test_reclaimer leaf(root, logalloc::segment_size);
test_async_reclaim_region leaf_region(leaf.rg(), logalloc::segment_size);
root.unleash();
leaf.unleash();
auto fut_root = root.rg().run_when_memory_available([&root] {
BOOST_REQUIRE_EQUAL(root.reclaim_sizes().size(), 1);
@@ -1038,3 +1058,117 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_no_double_r
BOOST_REQUIRE_EQUAL(root.reclaim_sizes()[0], logalloc::segment_size);
});
}
// Reproduces issue #2021
SEASTAR_TEST_CASE(test_no_crash_when_a_lot_of_requests_released_which_change_region_group_size) {
return seastar::async([] {
#ifndef DEFAULT_ALLOCATOR // Because we need memory::stats().free_memory();
logging::logger_registry().set_logger_level("lsa", seastar::log_level::debug);
auto free_space = memory::stats().free_memory();
size_t threshold = size_t(0.75 * free_space);
region_group_reclaimer recl(threshold, threshold);
region_group gr(recl);
auto close_gr = defer([&gr] { gr.shutdown().get(); });
region r(gr);
with_allocator(r.allocator(), [&] {
std::vector<managed_bytes> objs;
r.make_evictable([&] {
if (objs.empty()) {
return memory::reclaiming_result::reclaimed_nothing;
}
with_allocator(r.allocator(), [&] {
objs.pop_back();
});
return memory::reclaiming_result::reclaimed_something;
});
auto fill_to_pressure = [&] {
while (!recl.under_pressure()) {
objs.emplace_back(managed_bytes(managed_bytes::initialized_later(), 1024));
}
};
utils::phased_barrier request_barrier;
auto wait_for_requests = defer([&] { request_barrier.advance_and_await().get(); });
for (int i = 0; i < 1000000; ++i) {
fill_to_pressure();
future<> f = gr.run_when_memory_available([&, op = request_barrier.start()] {
// Trigger group size change (Refs issue #2021)
gr.update(-10);
gr.update(+10);
});
BOOST_REQUIRE(!f.available());
}
// Release
while (recl.under_pressure()) {
objs.pop_back();
}
});
#endif
});
}
SEASTAR_TEST_CASE(test_reclaiming_runs_as_long_as_there_is_soft_pressure) {
return seastar::async([] {
size_t hard_threshold = logalloc::segment_size * 8;
size_t soft_threshold = hard_threshold / 2;
class reclaimer : public region_group_reclaimer {
bool _reclaim = false;
protected:
void start_reclaiming() noexcept override {
_reclaim = true;
}
void stop_reclaiming() noexcept override {
_reclaim = false;
}
public:
reclaimer(size_t hard_threshold, size_t soft_threshold)
: region_group_reclaimer(hard_threshold, soft_threshold)
{ }
bool reclaiming() const { return _reclaim; };
};
reclaimer recl(hard_threshold, soft_threshold);
region_group gr(recl);
auto close_gr = defer([&gr] { gr.shutdown().get(); });
region r(gr);
with_allocator(r.allocator(), [&] {
std::vector<managed_bytes> objs;
BOOST_REQUIRE(!recl.reclaiming());
while (!recl.over_soft_limit()) {
objs.emplace_back(managed_bytes(managed_bytes::initialized_later(), logalloc::segment_size));
}
BOOST_REQUIRE(recl.reclaiming());
while (!recl.under_pressure()) {
objs.emplace_back(managed_bytes(managed_bytes::initialized_later(), logalloc::segment_size));
}
BOOST_REQUIRE(recl.reclaiming());
while (recl.under_pressure()) {
objs.pop_back();
}
BOOST_REQUIRE(recl.over_soft_limit());
BOOST_REQUIRE(recl.reclaiming());
while (recl.over_soft_limit()) {
objs.pop_back();
}
BOOST_REQUIRE(!recl.reclaiming());
});
});
}


@@ -141,7 +141,7 @@ SEASTAR_TEST_CASE(test_virtual_dirty_accounting_on_flush) {
.with_column("col", bytes_type, column_kind::regular_column)
.build();
memtable_dirty_memory_manager mgr;
dirty_memory_manager mgr;
auto mt = make_lw_shared<memtable>(s, &mgr.region_group());
@@ -279,7 +279,7 @@ SEASTAR_TEST_CASE(test_segment_migration_during_flush) {
.with_column("col", bytes_type, column_kind::regular_column)
.build();
memtable_dirty_memory_manager mgr;
dirty_memory_manager mgr;
auto mt = make_lw_shared<memtable>(s, &mgr.region_group());


@@ -0,0 +1,170 @@
/*
* Copyright (C) 2015 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include "sstables/atomic_deletion.hh"
#include <seastar/tests/test-utils.hh>
#include <deque>
#include <boost/range/numeric.hpp>
#include <boost/range/adaptor/transformed.hpp>
using namespace sstables;
class atomic_deletion_test_env {
public:
using event = std::function<future<> (atomic_deletion_test_env& adm)>;
private:
struct a_hash {
size_t operator()(const std::unordered_set<sstring>& s) const {
auto h = std::hash<sstring>();
return boost::accumulate(s | boost::adaptors::transformed(h), size_t(0)); // sue me
}
};
atomic_deletion_manager _adm;
std::deque<event> _events;
std::unordered_set<std::unordered_set<sstring>, a_hash> _deletes;
semaphore _deletion_counter { 0 };
private:
future<> delete_sstables(std::vector<sstring> names) {
auto&& s1 = boost::copy_range<std::unordered_set<sstring>>(names);
_deletes.insert(s1);
_deletion_counter.signal();
return make_ready_future<>();
}
public:
explicit atomic_deletion_test_env(unsigned shard_count, std::vector<event> events)
: _adm(shard_count, [this] (std::vector<sstring> names) {
return delete_sstables(names);
})
, _events(events.begin(), events.end()) {
}
void expect_no_deletion() {
BOOST_REQUIRE(_deletes.empty());
}
future<> schedule_delete(std::vector<sstable_to_delete> names, unsigned shard) {
_adm.delete_atomically(names, shard).discard_result();
return make_ready_future<>();
}
future<> expect_deletion(std::vector<sstring> names) {
return _deletion_counter.wait().then([this, names] {
auto&& s1 = boost::copy_range<std::unordered_set<sstring>>(names);
auto erased = _deletes.erase(s1);
BOOST_REQUIRE_EQUAL(erased, 1);
});
}
future<> test() {
// run all _events sequentially
return repeat([this] {
if (_events.empty()) {
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
auto ev = std::move(_events.front());
_events.pop_front();
return ev(*this).then([] {
return stop_iteration::no;
});
});
}
};
future<> test_atomic_deletion_manager(unsigned shards, std::vector<atomic_deletion_test_env::event> events) {
auto env = make_lw_shared<atomic_deletion_test_env>(shards, events);
return env->test().finally([env] {});
}
atomic_deletion_test_env::event
delete_many(std::vector<sstable_to_delete> v, unsigned shard) {
return [v, shard] (atomic_deletion_test_env& env) {
// verify we didn't have an early delete from previous deletion
env.expect_no_deletion();
return env.schedule_delete(v, shard);
};
}
atomic_deletion_test_env::event
delete_one(sstable_to_delete s, unsigned shard) {
return delete_many({s}, shard);
}
atomic_deletion_test_env::event
expect_many(std::vector<sstring> names) {
return [names] (atomic_deletion_test_env& env) {
return env.expect_deletion(names);
};
}
atomic_deletion_test_env::event
expect_one(sstring name) {
return expect_many({name});
}
SEASTAR_TEST_CASE(test_single_shard_single_sstable) {
return test_atomic_deletion_manager(1, {
delete_one({"1", false}, 0),
expect_one("1"),
delete_one({"2", true}, 0),
expect_one("2"),
});
}
SEASTAR_TEST_CASE(test_multi_shard_single_sstable) {
return test_atomic_deletion_manager(3, {
delete_one({"1", true}, 0),
delete_one({"1", true}, 1),
delete_one({"1", true}, 2),
expect_one("1"),
delete_one({"2", false}, 1),
expect_one("2"),
});
}
SEASTAR_TEST_CASE(test_nonshared_compaction) {
return test_atomic_deletion_manager(5, {
delete_many({{"1", false}, {"2", false}, {"3", false}}, 2),
expect_many({"1", "2", "3"}),
});
}
SEASTAR_TEST_CASE(test_shared_compaction) {
return test_atomic_deletion_manager(3, {
delete_one({"1", true}, 0),
delete_many({{"1", true}, {"2", false}, {"3", false}}, 2),
delete_one({"1", true}, 1),
expect_many({"1", "2", "3"}),
});
}
SEASTAR_TEST_CASE(test_overlapping_compaction) {
return test_atomic_deletion_manager(3, {
delete_one({"1", true}, 0),
delete_one({"3", true}, 0),
delete_many({{"1", true}, {"2", false}, {"3", true}}, 2),
delete_one({"1", true}, 1),
delete_many({{"3", true}, {"4", false}}, 1),
expect_many({"1", "2", "3", "4"}),
});
}
#include "disk-error-handler.hh"
thread_local disk_error_signal_type commit_error;
thread_local disk_error_signal_type general_disk_error;


@@ -3031,3 +3031,22 @@ SEASTAR_TEST_CASE(test_partition_skipping) {
.produces_end_of_stream();
});
}
SEASTAR_TEST_CASE(test_unknown_component) {
return seastar::async([] {
auto tmp = make_lw_shared<tmpdir>();
auto sstp = reusable_sst(uncompressed_schema(), "tests/sstables/unknown_component", 1).get0();
sstp->create_links(tmp->path).get();
// check that create_links() moved unknown component to new dir
BOOST_REQUIRE(file_exists(tmp->path + "/la-1-big-UNKNOWN.txt").get0());
sstp = reusable_sst(uncompressed_schema(), tmp->path, 1).get0();
sstp->set_generation(2).get();
BOOST_REQUIRE(!file_exists(tmp->path + "/la-1-big-UNKNOWN.txt").get0());
BOOST_REQUIRE(file_exists(tmp->path + "/la-2-big-UNKNOWN.txt").get0());
sstables::delete_atomically({sstp}).get();
// assure unknown component is deleted
BOOST_REQUIRE(!file_exists(tmp->path + "/la-2-big-UNKNOWN.txt").get0());
});
}

Binary file not shown.

Binary file not shown.


@@ -0,0 +1 @@
748507322

Binary file not shown.

Binary file not shown.

Binary file not shown.


@@ -0,0 +1,9 @@
Data.db
Filter.db
CRC.db
Statistics.db
Summary.db
Digest.sha1
Index.db
TOC.txt
UNKNOWN.txt


@@ -1450,12 +1450,13 @@ private:
class column_visitor : public Aggregator {
const schema& _s;
const query::partition_slice& _slice;
uint32_t _cell_limit;
const uint32_t _cell_limit;
uint32_t _current_cell_limit;
std::vector<std::pair<std::string, typename Aggregator::type>> _aggregation;
typename Aggregator::type* _current_aggregation;
public:
column_visitor(const schema& s, const query::partition_slice& slice, uint32_t cell_limit)
: _s(s), _slice(slice), _cell_limit(cell_limit)
: _s(s), _slice(slice), _cell_limit(cell_limit), _current_cell_limit(0)
{ }
std::vector<std::pair<std::string, typename Aggregator::type>>&& release() {
return std::move(_aggregation);
@@ -1468,6 +1469,7 @@ private:
void accept_new_partition(const partition_key& key, uint32_t row_count) {
_aggregation.emplace_back(partition_key_to_string(_s, key), typename Aggregator::type());
_current_aggregation = &_aggregation.back().second;
_current_cell_limit = _cell_limit;
}
void accept_new_partition(uint32_t row_count) {
// We always ask for the partition_key to be sent in query_opts().
@@ -1476,19 +1478,19 @@ private:
void accept_new_row(const clustering_key_prefix& key, const query::result_row_view& static_row, const query::result_row_view& row) {
auto it = row.iterator();
auto cell = it.next_atomic_cell();
if (cell && _cell_limit > 0) {
if (cell && _current_cell_limit > 0) {
bytes column_name = composite::serialize_value(key.components(), _s.thrift().has_compound_comparator());
Aggregator::on_column(_current_aggregation, column_name, *cell);
_cell_limit -= 1;
_current_cell_limit -= 1;
}
}
void accept_new_row(const query::result_row_view& static_row, const query::result_row_view& row) {
auto it = row.iterator();
for (auto&& id : _slice.regular_columns) {
auto cell = it.next_atomic_cell();
if (cell && _cell_limit > 0) {
if (cell && _current_cell_limit > 0) {
Aggregator::on_column(_current_aggregation, _s.regular_column_at(id).name(), *cell);
_cell_limit -= 1;
_current_cell_limit -= 1;
}
}
}
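The fix in this hunk replaces the single decrementing `_cell_limit` with a `const` total plus a `_current_cell_limit` that is reset in `accept_new_partition()`, so the limit applies per partition instead of being consumed once across the whole query. A reduced sketch of that counting pattern (the `cell_counter` name and shape are hypothetical, not the Scylla visitor API):

```cpp
#include <cstdint>

// Counts how many cells a visitor would accept, resetting the budget at
// each partition boundary -- mirroring the per-partition cell-limit fix.
class cell_counter {
    const uint32_t _cell_limit;        // fixed per-partition budget
    uint32_t _current_cell_limit = 0;  // remaining budget for current partition
    uint32_t _accepted = 0;
public:
    explicit cell_counter(uint32_t limit) : _cell_limit(limit) {}
    void accept_new_partition() {
        _current_cell_limit = _cell_limit; // reset, as the patch does
    }
    void accept_cell() {
        if (_current_cell_limit > 0) {
            ++_accepted;
            --_current_cell_limit;
        }
    }
    uint32_t accepted() const { return _accepted; }
};
```

With the pre-patch behavior, a second partition would see whatever budget the first one left over; after the reset, each partition gets the full limit.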


@@ -55,9 +55,10 @@ std::vector<sstring> trace_type_names = {
"REPAIR"
};
tracing::tracing(const sstring& tracing_backend_helper_class_name)
tracing::tracing(sstring tracing_backend_helper_class_name)
: _write_timer([this] { write_timer_callback(); })
, _thread_name(seastar::format("shard {:d}", engine().cpu_id()))
, _tracing_backend_helper_class_name(std::move(tracing_backend_helper_class_name))
, _registrations{
scollectd::add_polled_metric(scollectd::type_instance_id("tracing"
, scollectd::per_cpu_plugin_instance
@@ -93,27 +94,23 @@ tracing::tracing(const sstring& tracing_backend_helper_class_name)
, scollectd::make_typed(scollectd::data_type::GAUGE, _flushing_records))}
, _gen(std::random_device()())
, _slow_query_duration_threshold(default_slow_query_duraion_threshold)
, _slow_query_record_ttl(default_slow_query_record_ttl) {
try {
_tracing_backend_helper_ptr = create_object<i_tracing_backend_helper>(tracing_backend_helper_class_name, *this);
} catch (no_such_class& e) {
tracing_logger.error("Can't create tracing backend helper {}: not supported", tracing_backend_helper_class_name);
throw;
} catch (...) {
throw;
}
, _slow_query_record_ttl(default_slow_query_record_ttl) {}
future<> tracing::create_tracing(sstring tracing_backend_class_name) {
return tracing_instance().start(std::move(tracing_backend_class_name));
}
future<> tracing::create_tracing(const sstring& tracing_backend_class_name) {
return tracing_instance().start(tracing_backend_class_name).then([] {
return tracing_instance().invoke_on_all([] (tracing& local_tracing) {
return local_tracing.start();
});
future<> tracing::start_tracing() {
return tracing_instance().invoke_on_all([] (tracing& local_tracing) {
return local_tracing.start();
});
}
trace_state_ptr tracing::create_session(trace_type type, trace_state_props_set props) {
trace_state_ptr tstate;
if (!started()) {
return trace_state_ptr();
}
try {
// Don't create a session if its records are likely to be dropped
if (!may_create_new_session()) {
@@ -129,6 +126,10 @@ trace_state_ptr tracing::create_session(trace_type type, trace_state_props_set p
}
trace_state_ptr tracing::create_session(const trace_info& secondary_session_info) {
if (!started()) {
return trace_state_ptr();
}
try {
// Don't create a session if its records are likely to be dropped
if (!may_create_new_session(secondary_session_info.session_id)) {
@@ -144,7 +145,17 @@ trace_state_ptr tracing::create_session(const trace_info& secondary_session_info
}
future<> tracing::start() {
try {
_tracing_backend_helper_ptr = create_object<i_tracing_backend_helper>(_tracing_backend_helper_class_name, *this);
} catch (no_such_class& e) {
tracing_logger.error("Can't create tracing backend helper {}: not supported", _tracing_backend_helper_class_name);
throw;
} catch (...) {
throw;
}
return _tracing_backend_helper_ptr->start().then([this] {
_down = false;
_write_timer.arm(write_period);
});
}


@@ -345,10 +345,15 @@ private:
records_bulk _pending_for_write_records_bulk;
timer<lowres_clock> _write_timer;
bool _down = false;
// _down becomes FALSE after the local service is fully initialized and
// tracing records are allowed to be created and collected. It becomes TRUE
// after the shutdown() call and prevents further write attempts to I/O
// backend.
bool _down = true;
bool _slow_query_logging_enabled = false;
std::unique_ptr<i_tracing_backend_helper> _tracing_backend_helper_ptr;
sstring _thread_name;
sstring _tracing_backend_helper_class_name;
scollectd::registrations _registrations;
double _trace_probability = 0.0; // keep this one for querying purposes
uint64_t _normalized_trace_probability = 0;
@@ -376,8 +381,13 @@ public:
return tracing_instance().local();
}
static future<> create_tracing(const sstring& tracing_backend_helper_class_name);
tracing(const sstring& tracing_backend_helper_class_name);
bool started() const {
return !_down;
}
static future<> create_tracing(sstring tracing_backend_helper_class_name);
static future<> start_tracing();
tracing(sstring tracing_backend_helper_class_name);
// Initialize a tracing backend (e.g. tracing_keyspace or logstash)
future<> start();


@@ -1504,7 +1504,11 @@ std::vector<char> cql_server::response::compress_lz4(const std::vector<char>& bo
output[1] = (input_len >> 16) & 0xFF;
output[2] = (input_len >> 8) & 0xFF;
output[3] = input_len & 0xFF;
#ifdef HAVE_LZ4_COMPRESS_DEFAULT
auto ret = LZ4_compress_default(input, output + 4, input_len, LZ4_compressBound(input_len));
#else
auto ret = LZ4_compress(input, output + 4, input_len);
#endif
if (ret == 0) {
throw std::runtime_error("CQL frame LZ4 compression failure");
}
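The frame header written above is the uncompressed length stored in four big-endian bytes before the compressed payload (the `output[0]` assignment for the top byte sits just outside this hunk, so its form here is inferred). A minimal encode/decode sketch of that framing, independent of LZ4 itself:

```cpp
#include <cstdint>
#include <vector>

// Write the uncompressed length as the 4-byte big-endian prefix used by
// the CQL LZ4 frame; the compressed payload would follow at output[4].
void put_frame_length(std::vector<char>& output, uint32_t input_len) {
    output.resize(4);
    output[0] = (input_len >> 24) & 0xFF;
    output[1] = (input_len >> 16) & 0xFF;
    output[2] = (input_len >> 8) & 0xFF;
    output[3] = input_len & 0xFF;
}

// Recover the uncompressed length from the prefix; the uint8_t cast
// avoids sign-extension when char is signed.
uint32_t get_frame_length(const std::vector<char>& input) {
    return (uint32_t(uint8_t(input[0])) << 24)
         | (uint32_t(uint8_t(input[1])) << 16)
         | (uint32_t(uint8_t(input[2])) << 8)
         |  uint32_t(uint8_t(input[3]));
}
```

The `#ifdef HAVE_LZ4_COMPRESS_DEFAULT` guard in the patch only selects which LZ4 entry point fills the payload; the length prefix is the same either way.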


@@ -39,8 +39,8 @@ class moving_average {
public:
moving_average(latency_counter::duration interval, latency_counter::duration tick_interval) :
_tick_interval(tick_interval) {
_alpha = 1 - std::exp(-std::chrono::duration_cast<std::chrono::nanoseconds>(interval).count()/
static_cast<double>(std::chrono::duration_cast<std::chrono::nanoseconds>(tick_interval).count()));
_alpha = 1 - std::exp(-std::chrono::duration_cast<std::chrono::seconds>(tick_interval).count()/
static_cast<double>(std::chrono::duration_cast<std::chrono::seconds>(interval).count()));
}
void add(uint64_t val = 1) {
@@ -48,7 +48,7 @@ public:
}
void update() {
double instant_rate = _count / static_cast<double>(std::chrono::duration_cast<std::chrono::nanoseconds>(_tick_interval).count());
double instant_rate = _count / static_cast<double>(std::chrono::duration_cast<std::chrono::seconds>(_tick_interval).count());
if (_initialized) {
_rate += (_alpha * (instant_rate - _rate));
} else {
@@ -70,7 +70,8 @@ public:
}
};
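The hunk above swaps the numerator and denominator in the decay constant: for an EWMA updated once per tick, the weight is `alpha = 1 - exp(-tick_interval / interval)`, so a longer averaging window yields a smaller alpha and slower decay. A standalone sketch of that update rule under the same assumption (illustrative class, not the Scylla `moving_average`):

```cpp
#include <cmath>

// Exponentially-weighted moving average sampled at fixed ticks. alpha is
// derived from the ratio of tick length to averaging window, matching the
// corrected formula in the patch.
class ewma {
    double _alpha;
    double _rate = 0.0;
    bool _initialized = false;
public:
    ewma(double interval_seconds, double tick_seconds)
        : _alpha(1.0 - std::exp(-tick_seconds / interval_seconds)) {}
    void tick(double instant_rate) {
        if (_initialized) {
            _rate += _alpha * (instant_rate - _rate); // decay toward sample
        } else {
            _rate = instant_rate;                     // seed with first sample
            _initialized = true;
        }
    }
    double rate() const { return _rate; }
};
```

With the pre-patch formula (`-interval / tick_interval`), alpha saturates at ~1 for any window longer than a tick, which destroys the smoothing; the fix restores the intended time constant.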
class ihistogram {
template <typename Unit>
class basic_ihistogram {
public:
// count holds all the events
int64_t count;
@@ -84,12 +85,13 @@ public:
double variance;
int64_t sample_mask;
boost::circular_buffer<int64_t> sample;
ihistogram(size_t size = 1024, int64_t _sample_mask = 0x80)
basic_ihistogram(size_t size = 1024, int64_t _sample_mask = 0x80)
: count(0), total(0), min(0), max(0), sum(0), started(0), mean(0), variance(0),
sample_mask(_sample_mask), sample(
size) {
}
void mark(int64_t value) {
void mark(int64_t ns_value) {
auto value = std::chrono::duration_cast<Unit>(std::chrono::nanoseconds(ns_value)).count();
if (total == 0 || value < min) {
min = value;
}
@@ -131,7 +133,7 @@ public:
/**
* Set the latency according to the sample rate.
*/
ihistogram& set_latency(latency_counter& lc) {
basic_ihistogram& set_latency(latency_counter& lc) {
if (should_sample()) {
lc.start();
}
@@ -144,7 +146,7 @@ public:
* Increment the total number of events without
* sampling the value.
*/
ihistogram& inc() {
basic_ihistogram& inc() {
count++;
return *this;
}
@@ -157,7 +159,7 @@ public:
return a * a;
}
ihistogram& operator +=(const ihistogram& o) {
basic_ihistogram& operator +=(const basic_ihistogram& o) {
if (count == 0) {
*this = o;
} else if (o.count > 0) {
@@ -190,14 +192,18 @@ public:
return mean * count;
}
friend ihistogram operator +(ihistogram a, const ihistogram& b);
template <typename U>
friend basic_ihistogram<U> operator +(basic_ihistogram<U> a, const basic_ihistogram<U>& b);
};
inline ihistogram operator +(ihistogram a, const ihistogram& b) {
template <typename Unit>
inline basic_ihistogram<Unit> operator +(basic_ihistogram<Unit> a, const basic_ihistogram<Unit>& b) {
a += b;
return a;
}
using ihistogram = basic_ihistogram<std::chrono::microseconds>;
struct rate_moving_average {
uint64_t count = 0;
double rates[3] = {0};
@@ -222,7 +228,7 @@ class timed_rate_moving_average {
static constexpr latency_counter::duration tick_interval() {
return std::chrono::seconds(10);
}
moving_average rates[3] = {{tick_interval(), std::chrono::minutes(1)}, {tick_interval(), std::chrono::minutes(5)}, {tick_interval(), std::chrono::minutes(15)}};
moving_average rates[3] = {{std::chrono::minutes(1), tick_interval()}, {std::chrono::minutes(5), tick_interval()}, {std::chrono::minutes(15), tick_interval()}};
latency_counter::time_point start_time;
timer<> _timer;
@@ -246,7 +252,7 @@ public:
rate_moving_average rate() const {
rate_moving_average res;
if (_count > 0) {
double elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(latency_counter::now() - start_time).count();
double elapsed = std::chrono::duration_cast<std::chrono::seconds>(latency_counter::now() - start_time).count();
res.mean_rate = (_count / elapsed);
}
res.count = _count;
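The corrected smoothing constant can be checked numerically. The sketch below is a hypothetical model assuming only the formula in the diff: the decay per tick is `exp(-tick/interval)`:

```cpp
#include <cassert>
#include <chrono>
#include <cmath>

// Sketch of the fixed EWMA: alpha = 1 - exp(-tick/interval), i.e. the decay
// per tick is governed by how small the tick is relative to the averaging
// window, not the other way around (the bug the diff removes).
struct ewma_model {
    double alpha;
    double rate = 0;
    bool initialized = false;

    ewma_model(std::chrono::seconds interval, std::chrono::seconds tick)
        : alpha(1.0 - std::exp(-static_cast<double>(tick.count()) /
                                static_cast<double>(interval.count()))) {}

    void update(double instant_rate) {
        if (initialized) {
            rate += alpha * (instant_rate - rate);
        } else {
            rate = instant_rate;
            initialized = true;
        }
    }
};
```

With the old, swapped formula, a 1-minute average ticked every 10 seconds would get alpha = 1 - exp(-6) ≈ 0.9975 and track the instantaneous rate almost exactly, defeating the averaging.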


@@ -2069,56 +2069,6 @@ uint64_t region_group::top_region_evictable_space() const {
return _regions.empty() ? 0 : _regions.top()->evictable_occupancy().total_space();
}
void region_group::release_requests() noexcept {
// The later() statement is here to avoid executing the function in update() context. It
// also guarantees that we won't dominate the CPU if we have many requests to release.
//
// However, both with_gate() and later() can ultimately call into schedule() and consequently
// allocate memory, which (if that allocation triggers a compaction - that frees memory) would
// defeat the very purpose of not executing this on update() context. Allocations should be rare
// on those but can happen, so we need to at least make sure they will not reclaim.
//
// Whatever comes after later() is already in a safe context, so we don't need to keep the lock
// alive until we are done with the whole execution - only until later is successfully executed.
tracker_reclaimer_lock rl;
_reclaimer.notify_relief();
if (_descendant_blocked_requests) {
_descendant_blocked_requests->set_value();
}
_descendant_blocked_requests = {};
if (_blocked_requests.empty()) {
return;
}
with_gate(_asynchronous_gate, [this, rl = std::move(rl)] () mutable {
return later().then([this] {
// Check again, we may have executed release_requests() in the meantime from another entry
// point (for instance, a descendant notification)
if (_blocked_requests.empty()) {
return;
}
auto blocked_at = do_for_each_parent(this, [] (auto rg) {
return rg->execution_permitted() ? stop_iteration::no : stop_iteration::yes;
});
if (!blocked_at) {
auto req = std::move(_blocked_requests.front());
_blocked_requests.pop_front();
req->allocate();
release_requests();
} else {
// If someone blocked us in the meantime then we can't execute. We need to make
// sure that we are listening to notifications, though. It could be that we used to
// be blocked on ourselves and now we are blocking on an ancestor.
subscribe_for_ancestor_available_memory_notification(blocked_at);
}
});
});
}
region* region_group::get_largest_region() {
if (!_maximal_rg || _maximal_rg->_regions.empty()) {
return nullptr;
@@ -2152,6 +2102,88 @@ region_group::del(region_impl* child) {
update(-child->occupancy().total_space());
}
bool
region_group::execution_permitted() noexcept {
return do_for_each_parent(this, [] (auto rg) {
return rg->under_pressure() ? stop_iteration::yes : stop_iteration::no;
}) == nullptr;
}
future<>
region_group::start_releaser() {
return later().then([this] {
return repeat([this] () noexcept {
if (_shutdown_requested) {
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
if (!_blocked_requests.empty() && execution_permitted()) {
auto req = std::move(_blocked_requests.front());
_blocked_requests.pop_front();
req->allocate();
return make_ready_future<stop_iteration>(stop_iteration::no);
} else {
// Block reclaiming to prevent signal() from being called by reclaimer inside wait()
// FIXME: handle allocation failures (not very likely) like allocating_section does
tracker_reclaimer_lock rl;
return _relief.wait().then([] {
return stop_iteration::no;
});
}
});
});
}
region_group::region_group(region_group *parent, region_group_reclaimer& reclaimer)
: _parent(parent)
, _reclaimer(reclaimer)
, _releaser(reclaimer_can_block() ? start_releaser() : make_ready_future<>())
{
if (_parent) {
_parent->add(this);
}
}
bool region_group::reclaimer_can_block() const {
return _reclaimer.throttle_threshold() != std::numeric_limits<size_t>::max();
}
void region_group::notify_relief() {
_relief.signal();
for (region_group* child : _subgroups) {
child->notify_relief();
}
}
void region_group::update(ssize_t delta) {
// Most-enclosing group which was relieved.
region_group* top_relief = nullptr;
do_for_each_parent(this, [&top_relief, delta] (region_group* rg) mutable {
rg->update_maximal_rg();
rg->_total_memory += delta;
if (rg->_total_memory >= rg->_reclaimer.soft_limit_threshold()) {
rg->_reclaimer.notify_soft_pressure();
} else {
rg->_reclaimer.notify_soft_relief();
}
if (rg->_total_memory > rg->_reclaimer.throttle_threshold()) {
rg->_reclaimer.notify_pressure();
} else if (rg->_reclaimer.under_pressure()) {
rg->_reclaimer.notify_relief();
top_relief = rg;
}
return stop_iteration::no;
});
if (top_relief) {
top_relief->notify_relief();
}
}
allocating_section::guard::guard()
: _prev(shard_segment_pool.emergency_reserve_max())
{ }
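The soft/hard threshold logic in update() above can be modeled in a few lines. `reclaimer_model` is a hypothetical stand-in for region_group_reclaimer, keeping only the comparisons from the diff (`>=` for the soft limit, `>` for the throttle threshold):

```cpp
#include <cassert>
#include <cstddef>

// Hypothetical model of the two-level pressure scheme in region_group::update():
// crossing the soft limit starts background reclaim; crossing the throttle
// threshold additionally blocks new allocating requests.
struct reclaimer_model {
    size_t soft_limit;
    size_t threshold;        // constructor invariant in the diff: soft_limit <= threshold
    size_t total_memory = 0;
    bool soft_pressure = false;
    bool hard_pressure = false;

    // delta may be negative, mirroring the ssize_t delta in the real update()
    void update(std::ptrdiff_t delta) {
        total_memory += delta;
        soft_pressure = total_memory >= soft_limit;  // notify_soft_pressure()/relief()
        hard_pressure = total_memory > threshold;    // notify_pressure()/relief()
    }
};
```

The asymmetry is deliberate: background reclaim kicks in at the soft limit, while request throttling (and the releaser fiber's wakeup) only engages past the hard threshold.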


@@ -61,34 +61,71 @@ using eviction_fn = std::function<memory::reclaiming_result()>;
class region_group_reclaimer {
protected:
size_t _threshold;
size_t _soft_limit;
bool _under_pressure = false;
virtual void start_reclaiming() {}
virtual void stop_reclaiming() {}
bool _under_soft_pressure = false;
// The following restrictions apply to implementations of start_reclaiming() and stop_reclaiming():
//
// - must not use any region or region_group objects, because they're invoked synchronously
// with operations on those.
//
// - must be noexcept, because they're called on the free path.
//
// - the implementation may be called synchronously with any operation
// which allocates memory, because these are called by memory reclaimer.
// In particular, the implementation should not depend on memory allocation
// because that may fail when in reclaiming context.
//
virtual void start_reclaiming() noexcept {}
virtual void stop_reclaiming() noexcept {}
public:
bool under_pressure() const {
return _under_pressure;
}
void notify_pressure() {
if (!_under_pressure) {
_under_pressure = true;
bool over_soft_limit() const {
return _under_soft_pressure;
}
void notify_soft_pressure() noexcept {
if (!_under_soft_pressure) {
_under_soft_pressure = true;
start_reclaiming();
}
}
void notify_relief() {
if (_under_pressure) {
_under_pressure = false;
void notify_soft_relief() noexcept {
if (_under_soft_pressure) {
_under_soft_pressure = false;
stop_reclaiming();
}
}
region_group_reclaimer(size_t threshold = std::numeric_limits<size_t>::max()) : _threshold(threshold) {}
void notify_pressure() noexcept {
_under_pressure = true;
}
void notify_relief() noexcept {
_under_pressure = false;
}
region_group_reclaimer()
: _threshold(std::numeric_limits<size_t>::max()), _soft_limit(std::numeric_limits<size_t>::max()) {}
region_group_reclaimer(size_t threshold)
: _threshold(threshold), _soft_limit(threshold) {}
region_group_reclaimer(size_t threshold, size_t soft)
: _threshold(threshold), _soft_limit(soft) {
assert(_soft_limit <= _threshold);
}
virtual ~region_group_reclaimer() {}
size_t throttle_threshold() const {
return _threshold;
}
size_t soft_limit_threshold() const {
return _soft_limit;
}
};
// Groups regions for the purpose of statistics. Can be nested.
@@ -190,9 +227,13 @@ class region_group {
// a different ancestor)
std::experimental::optional<shared_promise<>> _descendant_blocked_requests = {};
region_group* _waiting_on_ancestor = nullptr;
seastar::gate _asynchronous_gate;
condition_variable _relief;
future<> _releaser;
bool _shutdown_requested = false;
bool reclaimer_can_block() const;
future<> start_releaser();
void notify_relief();
public:
// When creating a region_group, one can specify an optional throttle_threshold parameter. This
// parameter won't affect normal allocations, but an API is provided, through the region_group's
@@ -200,17 +241,13 @@ public:
// the total memory for the region group (and all of its parents) is lower or equal to the
// region_group's throttle_threshold (and respectively for its parents).
region_group(region_group_reclaimer& reclaimer = no_reclaimer) : region_group(nullptr, reclaimer) {}
region_group(region_group* parent, region_group_reclaimer& reclaimer = no_reclaimer) : _parent(parent), _reclaimer(reclaimer) {
if (_parent) {
_parent->add(this);
}
}
region_group(region_group* parent, region_group_reclaimer& reclaimer = no_reclaimer);
region_group(region_group&& o) = delete;
region_group(const region_group&) = delete;
~region_group() {
// If we set a throttle threshold, we'd be postponing many operations. So shutdown must be
// called.
if (_reclaimer.throttle_threshold() != std::numeric_limits<size_t>::max()) {
if (reclaimer_can_block()) {
assert(_shutdown_requested);
}
if (_parent) {
@@ -222,19 +259,7 @@ public:
size_t memory_used() const {
return _total_memory;
}
void update(ssize_t delta) {
do_for_each_parent(this, [delta] (auto rg) mutable {
rg->update_maximal_rg();
rg->_total_memory += delta;
// It is okay to call release_requests for a region_group that can't allow execution.
// But that can generate various spurious messages to groups waiting on us that will be
// then woken up just so they can go to wait again. So let's filter that.
if (rg->execution_permitted()) {
rg->release_requests();
}
return stop_iteration::no;
});
}
void update(ssize_t delta);
// It would be easier to call update, but it is unfortunately broken in boost versions up to at
// least 1.59.
@@ -278,36 +303,18 @@ public:
using futurator = futurize<std::result_of_t<Func()>>;
auto blocked_at = do_for_each_parent(this, [] (auto rg) {
return (rg->_blocked_requests.empty() && rg->execution_permitted()) ? stop_iteration::no : stop_iteration::yes;
return (rg->_blocked_requests.empty() && !rg->under_pressure()) ? stop_iteration::no : stop_iteration::yes;
});
if (!blocked_at) {
return futurator::apply(func);
}
subscribe_for_ancestor_available_memory_notification(blocked_at);
auto fn = std::make_unique<concrete_allocating_function<Func>>(std::forward<Func>(func));
auto fut = fn->get_future();
_blocked_requests.push_back(std::move(fn));
++_blocked_requests_counter;
// This is called here, and not at update(), for two reasons: the first, is that things that
// are done during the free() path should be done carefully, in the sense that they can
// trigger another update call and put us in a loop. Not to mention we would like to keep
// those from having exceptions. We solve that for release_requests by using later(), but in
// here we can do away with that need altogether.
//
// Second and most important, until we actually block a request, the pressure condition may
// very well be transient. There are opportunities for compactions, the condition can go
// away on its own, etc.
//
// The reason we check execution_permitted() is that we'll still block requests if we have
// free memory but existing requests in the queue. That is so we can keep our FIFO ordering
// guarantee. So we need to distinguish here the case in which we're blocking merely to
// serialize requests, so that the caller does not evict more than it should.
if (!blocked_at->execution_permitted()) {
blocked_at->_reclaimer.notify_pressure();
}
return fut;
}
@@ -317,9 +324,11 @@ public:
region* get_largest_region();
// Shutdown is mandatory for every user who has set a threshold
// Can be called at most once.
future<> shutdown() {
_shutdown_requested = true;
return _asynchronous_gate.close();
_relief.signal();
return std::move(_releaser);
}
size_t blocked_requests() {
@@ -330,43 +339,9 @@ public:
return _blocked_requests_counter;
}
private:
// Make sure we get a notification and can call release_requests when one of our ancestors that
// used to block us is no longer under memory pressure.
void subscribe_for_ancestor_available_memory_notification(region_group *ancestor) {
if ((this == ancestor) || (_waiting_on_ancestor)) {
return; // already subscribed, or no need to
}
_waiting_on_ancestor = ancestor;
with_gate(_asynchronous_gate, [this] {
// We reevaluate _waiting_on_ancestor here so we make sure there is no deferring point
// between determining the ancestor and registering with it for a notification. We start
// with _waiting_on_ancestor set to the initial value, and after we are notified, we
// will set _waiting_on_ancestor to nullptr to force this lambda to reevaluate it.
auto evaluate_ancestor_and_stop = [this] {
if (!_waiting_on_ancestor) {
auto new_blocking_point = do_for_each_parent(this, [] (auto rg) {
return (rg->execution_permitted()) ? stop_iteration::no : stop_iteration::yes;
});
if (!new_blocking_point) {
release_requests();
}
_waiting_on_ancestor = (new_blocking_point == this) ? nullptr : new_blocking_point;
}
return _waiting_on_ancestor == nullptr;
};
return do_until(evaluate_ancestor_and_stop, [this] {
if (!_waiting_on_ancestor->_descendant_blocked_requests) {
_waiting_on_ancestor->_descendant_blocked_requests = shared_promise<>();
}
return _waiting_on_ancestor->_descendant_blocked_requests->get_shared_future().then([this] {
_waiting_on_ancestor = nullptr;
});
});
});
}
// Returns true if and only if constraints of this group are not violated.
// That's taking into account any constraints imposed by enclosing (parent) groups.
bool execution_permitted() noexcept;
// Executes the function func for each region_group upwards in the hierarchy, starting with the
// parameter node. The function func may return stop_iteration::no, in which case it proceeds to
@@ -386,11 +361,10 @@ private:
}
return nullptr;
}
inline bool execution_permitted() const {
return _total_memory <= _reclaimer.throttle_threshold();
}
void release_requests() noexcept;
inline bool under_pressure() const {
return _reclaimer.under_pressure();
}
uint64_t top_region_evictable_space() const;
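The edge-triggered soft-pressure notifications above (start_reclaiming() fires only on the first notify, stop_reclaiming() only on relief) can be sketched with a hypothetical subclass; everything except the hook semantics is invented:

```cpp
#include <cassert>
#include <cstddef>

// Minimal stand-in for a region_group_reclaimer subclass: the noexcept hooks
// allocate nothing and only flip a flag, per the restrictions documented in
// the header. The instrumentation counters are for the sketch only.
class flagging_reclaimer {
    size_t _threshold;
    size_t _soft_limit;
    bool _under_soft_pressure = false;
public:
    int reclaim_starts = 0;
    bool reclaiming = false;

    flagging_reclaimer(size_t threshold, size_t soft)
        : _threshold(threshold), _soft_limit(soft) {
        assert(_soft_limit <= _threshold);  // same invariant the diff asserts
    }
    void start_reclaiming() noexcept { reclaiming = true; ++reclaim_starts; }
    void stop_reclaiming() noexcept { reclaiming = false; }

    // Edge-triggered, as in the header: repeated notifications are no-ops.
    void notify_soft_pressure() noexcept {
        if (!_under_soft_pressure) { _under_soft_pressure = true; start_reclaiming(); }
    }
    void notify_soft_relief() noexcept {
        if (_under_soft_pressure) { _under_soft_pressure = false; stop_reclaiming(); }
    }
};
```

This mirrors why the hooks must be noexcept and allocation-free: notify_soft_pressure() is called synchronously from update(), which runs on the free path.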