release: prepare for 1.5.0

tests: commitlog: Fix assumption about write visibility
The test assumed that mutations added to the commitlog are visible to reads as soon as a new segment is opened. That's not true because buffers are written back in the background, and a new segment may be active while the previous one is still being written or not yet synced. Fix the test so that it expects that the number of mutations read this way is <= the number of mutations written, and that after all segments are synced, the number of mutations read is equal to the number written. Message-Id: <1481630481-19395-1-git-send-email-tgrabiec@scylladb.com> (cherry picked from commit fe6a70dba1)
2016-12-21 12:12:11 +02:00 · 2016-12-20 20:08:48 +01:00 · 2016-12-19 15:26:35 +01:00 · 2016-12-18 11:14:09 +02:00 · 2016-12-16 19:48:08 +01:00 · 2016-12-16 10:56:34 -05:00
59 changed files with 1152 additions and 522 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 #!/bin/sh

-VERSION=666.development
+VERSION=1.5.0

 if test -f version
 then
--- a/api/api-doc/storage_proxy.json
+++ b/api/api-doc/storage_proxy.json
@@ -777,7 +777,7 @@
      ]
    },
    {
-      "path": "/storage_proxy/metrics/read/moving_avrage_histogram",
+      "path": "/storage_proxy/metrics/read/moving_average_histogram",
      "operations": [
        {
          "method": "GET",
@@ -792,7 +792,7 @@
      ]
    },
    {
-      "path": "/storage_proxy/metrics/range/moving_avrage_histogram",
+      "path": "/storage_proxy/metrics/range/moving_average_histogram",
      "operations": [
        {
          "method": "GET",
@@ -942,7 +942,7 @@
      ]
    },
    {
-      "path": "/storage_proxy/metrics/write/moving_avrage_histogram",
+      "path": "/storage_proxy/metrics/write/moving_average_histogram",
      "operations": [
        {
          "method": "GET",
--- a/api/cache_service.cc
+++ b/api/cache_service.cc
@@ -194,7 +194,7 @@ void set_cache_service(http_context& ctx, routes& r) {
    });

    cs::get_row_capacity.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, 0, [](const column_family& cf) {
+        return map_reduce_cf(ctx, uint64_t(0), [](const column_family& cf) {
            return cf.get_row_cache().get_cache_tracker().region().occupancy().used_space();
        }, std::plus<uint64_t>());
    });
--- a/auth/data_resource.cc
+++ b/auth/data_resource.cc
@@ -47,11 +47,8 @@
 const sstring auth::data_resource::ROOT_NAME("data");

 auth::data_resource::data_resource(level l, const sstring& ks, const sstring& cf)
-    : _ks(ks), _cf(cf)
+    : _level(l), _ks(ks), _cf(cf)
 {
-    if (l != get_level()) {
-        throw std::invalid_argument("level/keyspace/column mismatch");
-    }
 }

 auth::data_resource::data_resource()
@@ -67,14 +64,7 @@ auth::data_resource::data_resource(const sstring& ks, const sstring& cf)
 {}

 auth::data_resource::level auth::data_resource::get_level() const {
-    if (!_cf.empty()) {
-        assert(!_ks.empty());
-        return level::COLUMN_FAMILY;
-    }
-    if (!_ks.empty()) {
-        return level::KEYSPACE;
-    }
-    return level::ROOT;
+    return _level;
 }

 auth::data_resource auth::data_resource::from_name(
--- a/auth/data_resource.hh
+++ b/auth/data_resource.hh
@@ -56,6 +56,7 @@ private:

    static const sstring ROOT_NAME;

+    level _level;
    sstring _ks;
    sstring _cf;

--- a/auth/permission.cc
+++ b/auth/permission.cc
@@ -40,6 +40,7 @@
 */

 #include <unordered_map>
+#include <boost/algorithm/string.hpp>
 #include "permission.hh"

 const auth::permission_set auth::permissions::ALL_DATA =
@@ -75,7 +76,9 @@ const sstring& auth::permissions::to_string(permission p) {
 }

 auth::permission auth::permissions::from_string(const sstring& s) {
-    return permission_names.at(s);
+    sstring upper(s);
+    boost::to_upper(upper);
+    return permission_names.at(upper);
 }

 std::unordered_set<sstring> auth::permissions::to_strings(const permission_set& set) {
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -409,29 +409,6 @@ partitioner: org.apache.cassandra.dht.Murmur3Partitioner
 # the smaller of 1/4 of heap or 512MB.
 # file_cache_size_in_mb: 512

-# Total permitted memory to use for memtables. Scylla will stop 
-# accepting writes when the limit is exceeded until a flush completes,
-# and will trigger a flush based on memtable_cleanup_threshold
-# If omitted, Scylla will set both to 1/4 the size of the heap.
-# memtable_heap_space_in_mb: 2048
-# memtable_offheap_space_in_mb: 2048
-
-# Ratio of occupied non-flushing memtable size to total permitted size
-# that will trigger a flush of the largest memtable.  Lager mct will
-# mean larger flushes and hence less compaction, but also less concurrent
-# flush activity which can make it difficult to keep your disks fed
-# under heavy write load.
-#
-# memtable_cleanup_threshold defaults to 1 / (memtable_flush_writers + 1)
-# memtable_cleanup_threshold: 0.11
-
-# Specify the way Scylla allocates and manages memtable memory.
-# Options are:
-#   heap_buffers:    on heap nio buffers
-#   offheap_buffers: off heap (direct) nio buffers
-#   offheap_objects: native memory, eliminating nio buffer heap overhead
-# memtable_allocation_type: heap_buffers
-
 # Total space to use for commitlogs.
 #
 # If space gets above this value (it will round up to the next nearest
@@ -443,17 +420,6 @@ partitioner: org.apache.cassandra.dht.Murmur3Partitioner
 # available for Scylla.
 commitlog_total_space_in_mb: -1

-# This sets the amount of memtable flush writer threads.  These will
-# be blocked by disk io, and each one will hold a memtable in memory
-# while blocked. 
-#
-# memtable_flush_writers defaults to the smaller of (number of disks,
-# number of cores), with a minimum of 2 and a maximum of 8.
-# 
-# If your data directories are backed by SSD, you should increase this
-# to the number of cores.
-#memtable_flush_writers: 8
-
 # A fixed memory pool size in MB for for SSTable index summaries. If left
 # empty, this will default to 5% of the heap size. If the memory usage of
 # all index summaries exceeds this limit, SSTables with low read rates will
--- a/configure.py
+++ b/configure.py
@@ -221,6 +221,7 @@ scylla_tests = [
    'tests/database_test',
    'tests/nonwrapping_range_test',
    'tests/input_stream_test',
+    'tests/sstable_atomic_deletion_test',
 ]

 apps = [
@@ -307,6 +308,7 @@ scylla_core = (['database.cc',
                 'sstables/compaction.cc',
                 'sstables/compaction_strategy.cc',
                 'sstables/compaction_manager.cc',
+                 'sstables/atomic_deletion.cc',
                 'transport/event.cc',
                 'transport/event_notifier.cc',
                 'transport/server.cc',
--- a/cql3/selection/selection.cc
+++ b/cql3/selection/selection.cc
@@ -232,7 +232,7 @@ uint32_t selection::add_column_for_ordering(const column_definition& c) {
            raw_selector::to_selectables(raw_selectors, schema), db, schema, defs);

    auto metadata = collect_metadata(schema, raw_selectors, *factories);
-    if (processes_selection(raw_selectors)) {
+    if (processes_selection(raw_selectors) || raw_selectors.size() != defs.size()) {
        return ::make_shared<selection_with_processing>(schema, std::move(defs), std::move(metadata), std::move(factories));
    } else {
        return ::make_shared<simple_selection>(schema, std::move(defs), std::move(metadata), false);
--- a/database.cc
+++ b/database.cc
@@ -91,34 +91,33 @@ public:

 // Used for tests where the CF exists without a database object. We need to pass a valid
 // dirty_memory manager in that case.
-thread_local memtable_dirty_memory_manager default_dirty_memory_manager;
+thread_local dirty_memory_manager default_dirty_memory_manager;

 lw_shared_ptr<memtable_list>
 column_family::make_memory_only_memtable_list() {
-    auto seal = [this] (memtable_list::flush_behavior ignored) { return make_ready_future<>(); };
    auto get_schema = [this] { return schema(); };
-    return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.max_memtable_size, _config.dirty_memory_manager);
+    return make_lw_shared<memtable_list>(std::move(get_schema), _config.dirty_memory_manager);
 }

 lw_shared_ptr<memtable_list>
 column_family::make_memtable_list() {
    auto seal = [this] (memtable_list::flush_behavior behavior) { return seal_active_memtable(behavior); };
    auto get_schema = [this] { return schema(); };
-    return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.max_memtable_size, _config.dirty_memory_manager);
+    return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.dirty_memory_manager);
 }

 lw_shared_ptr<memtable_list>
 column_family::make_streaming_memtable_list() {
    auto seal = [this] (memtable_list::flush_behavior behavior) { return seal_active_streaming_memtable(behavior); };
    auto get_schema =  [this] { return schema(); };
-    return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.max_streaming_memtable_size, _config.streaming_dirty_memory_manager);
+    return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.streaming_dirty_memory_manager);
 }

 lw_shared_ptr<memtable_list>
 column_family::make_streaming_memtable_big_list(streaming_memtable_big& smb) {
    auto seal = [this, &smb] (memtable_list::flush_behavior) { return seal_active_streaming_memtable_big(smb); };
    auto get_schema =  [this] { return schema(); };
-    return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.max_streaming_memtable_size, _config.streaming_dirty_memory_manager);
+    return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.streaming_dirty_memory_manager);
 }

 column_family::column_family(schema_ptr schema, config config, db::commitlog* cl, compaction_manager& compaction_manager)
@@ -912,10 +911,6 @@ column_family::seal_active_streaming_memtable_delayed() {
        return make_ready_future<>();
    }

-    if (_streaming_memtables->should_flush()) {
-        return seal_active_streaming_memtable_immediate();
-    }
-
    if (!_delayed_streaming_flush.armed()) {
            // We don't want to wait for too long, because the incoming mutations will not be available
            // until we flush them to SSTables. On top of that, if the sender ran out of messages, it won't
@@ -946,8 +941,7 @@ column_family::seal_active_streaming_memtable_immediate() {
        auto current_waiters = std::exchange(_waiting_streaming_flushes, shared_promise<>());
        auto f = current_waiters.get_shared_future(); // for this seal

-        _config.streaming_dirty_memory_manager->serialize_flush([this, old] {
-          return with_lock(_sstables_lock.for_read(), [this, old] {
+        with_lock(_sstables_lock.for_read(), [this, old] {
            auto newtab = make_lw_shared<sstables::sstable>(_schema,
                _config.datadir, calculate_generation_for_new_table(),
                sstables::sstable::version_types::ka,
@@ -980,7 +974,6 @@ column_family::seal_active_streaming_memtable_immediate() {
            });
            // We will also not have any retry logic. If we fail here, we'll fail the streaming and let
            // the upper layers know. They can then apply any logic they want here.
-          });
        }).then_wrapped([this, current_waiters = std::move(current_waiters)] (future <> f) mutable {
            if (f.failed()) {
                current_waiters.set_exception(f.get_exception());
@@ -1044,12 +1037,10 @@ column_family::seal_active_memtable(memtable_list::flush_behavior ignored) {
      _config.cf_stats->pending_memtables_flushes_count++;
      _config.cf_stats->pending_memtables_flushes_bytes += memtable_size;

-      return _config.dirty_memory_manager->serialize_flush([this, old] {
-        return repeat([this, old] {
-            return with_lock(_sstables_lock.for_read(), [this, old] {
-                _flush_queue->check_open_gate();
-                return try_flush_memtable_to_sstable(old);
-            });
+      return repeat([this, old] {
+        return with_lock(_sstables_lock.for_read(), [this, old] {
+            _flush_queue->check_open_gate();
+            return try_flush_memtable_to_sstable(old);
        });
      }).then([this, memtable_size] {
        _config.cf_stats->pending_memtables_flushes_count--;
@@ -1091,6 +1082,24 @@ column_family::try_flush_memtable_to_sstable(lw_shared_ptr<memtable> old) {
        return newtab->open_data();
    }).then_wrapped([this, old, newtab] (future<> ret) {
        dblog.debug("Flushing to {} done", newtab->get_filename());
+        // Could pass the iterator to the seal functions, and avoid the need to search the
+        // unordered_map here. But this is supposed to be cheap and it is a lot less clutter in the
+        // method signatures. Also makes it optional and streaming memtables don't have to do it.
+        // Note that the number of entries in this hash is limited by the background flushes
+        // semaphore, so it'll always be small.
+        //
+        // In terms of releasing dirty memory, this is almost as far as we should go. We could do
+        // this right before updating the cache, but from this point on to update_cache we have no
+        // deferring points, so that's fine. We do it in here because if we fail this write it will
+        // try the write again and that will create a new flush reader that will decrease dirty
+        // memory again. So we need to get rid of the charges here anyway for correctness.
+        //
+        // After the cache starts to be updated the region is transferred over. We kind of assume
+        // there will be no deferring point between this and update cache transferring ownership.
+        // It's not that bad if it is so we wouldn't really protect against it, but without a
+        // deferring point we can guarantee that no request will see a spike in dirty memory between
+        // the release of our memory and the execution of a request.
+        dirty_memory_manager::from_region_group(old->region_group()).remove_from_flush_manager(&(old->region()));
        try {
            ret.get();

@@ -1131,15 +1140,15 @@ column_family::start() {

 future<>
 column_family::stop() {
-    _memtables->seal_active_memtable(memtable_list::flush_behavior::immediate);
-    _streaming_memtables->seal_active_memtable(memtable_list::flush_behavior::immediate);
-    return _compaction_manager.remove(this).then([this] {
-        // Nest, instead of using when_all, so we don't lose any exceptions.
-        return _flush_queue->close().then([this] {
-            return _streaming_flush_gate.close();
+    return when_all(_memtables->request_flush(), _streaming_memtables->request_flush()).discard_result().finally([this] {
+        return _compaction_manager.remove(this).then([this] {
+            // Nest, instead of using when_all, so we don't lose any exceptions.
+            return _flush_queue->close().then([this] {
+                return _streaming_flush_gate.close();
+            });
+        }).then([this] {
+            return _sstable_deletion_gate.close();
        });
-    }).then([this] {
-        return _sstable_deletion_gate.close();
    });
 }

@@ -1304,7 +1313,17 @@ column_family::rebuild_sstable_list(const std::vector<sstables::shared_sstable>&
    // Second, delete the old sstables.  This is done in the background, so we can
    // consider this compaction completed.
    seastar::with_gate(_sstable_deletion_gate, [this, sstables_to_remove] {
-        return sstables::delete_atomically(sstables_to_remove).then([this, sstables_to_remove] {
+        return sstables::delete_atomically(sstables_to_remove).then_wrapped([this, sstables_to_remove] (future<> f) {
+            std::exception_ptr eptr;
+            try {
+                f.get();
+            } catch(...) {
+                eptr = std::current_exception();
+            }
+
+            // unconditionally remove compacted sstables from _sstables_compacted_but_not_deleted,
+            // or they could stay forever in the set, resulting in deleted files remaining
+            // opened and disk space not being released until shutdown.
            std::unordered_set<sstables::shared_sstable> s(
                   sstables_to_remove.begin(), sstables_to_remove.end());
            auto e = boost::range::remove_if(_sstables_compacted_but_not_deleted, [&] (sstables::shared_sstable sst) -> bool {
@@ -1312,6 +1331,11 @@ column_family::rebuild_sstable_list(const std::vector<sstables::shared_sstable>&
            });
            _sstables_compacted_but_not_deleted.erase(e, _sstables_compacted_but_not_deleted.end());
            rebuild_statistics();
+
+            if (eptr) {
+                return make_exception_future<>(eptr);
+            }
+            return make_ready_future<>();
        }).handle_exception([] (std::exception_ptr e) {
            try {
                std::rethrow_exception(e);
@@ -1626,41 +1650,12 @@ database::database() : database(db::config())
 {}

 database::database(const db::config& cfg)
-    : _cfg(std::make_unique<db::config>(cfg))
-    , _memtable_total_space([this] {
-        _stats = make_lw_shared<db_stats>();
-
-        auto memtable_total_space = size_t(_cfg->memtable_total_space_in_mb()) << 20;
-        if (!memtable_total_space) {
-            return memory::stats().total_memory() / 2;
-        }
-        return memtable_total_space;
-    }())
-    , _streaming_memtable_total_space(_memtable_total_space / 4)
-    // Allow system tables a pool of 10 MB extra memory to write over the threshold. Under normal
-    // circumnstances it won't matter, but when we throttle, some system requests will be able to
-    // keep being serviced even if user requests are not.
-    //
-    // Note that even if we didn't allow extra memory, we would still want to keep system requests
-    // in a different region group. This is because throttled requests are serviced in FIFO order,
-    // and we don't want system requests to be waiting for a long time behind user requests.
-    , _system_dirty_memory_manager(*this, _memtable_total_space + (10 << 20))
-    // The total space that can be used by memtables is _memtable_total_space, but we will only
-    // allow the region_group to grow to half of that. This is because of virtual_dirty: memtables
-    // can take a long time to flush, and if we are using the maximum amount of memory possible,
-    // then requests will block until we finish flushing at least one memtable.
-    //
-    // We can free memory until the whole memtable is flushed because we need to keep it in memory
-    // until the end, but we can fake freeing memory. When we are done with an element of the
-    // memtable, we will update the region group pretending memory just went down by that amount.
-    //
-    // Because the amount of memory that we pretend to free should be close enough to the actual
-    // memory used by the memtables, that effectively creates two sub-regions inside the dirty
-    // region group, of equal size. In the worst case, we will have _memtable_total_space dirty
-    // bytes used, and half of that already virtually freed.
-    , _dirty_memory_manager(*this, &_system_dirty_memory_manager, _memtable_total_space / 2)
-    // The same goes for streaming in respect to virtual dirty.
-    , _streaming_dirty_memory_manager(*this, &_dirty_memory_manager, _streaming_memtable_total_space / 2)
+    : _stats(make_lw_shared<db_stats>())
+    , _cfg(std::make_unique<db::config>(cfg))
+    // Allow system tables a pool of 10 MB memory to write, but never block on other regions.
+    , _system_dirty_memory_manager(*this, 10 << 20)
+    , _dirty_memory_manager(*this, memory::stats().total_memory() * 0.45)
+    , _streaming_dirty_memory_manager(*this, memory::stats().total_memory() * 0.10)
    , _version(empty_version)
    , _enable_incremental_backups(cfg.incremental_backups())
 {
@@ -2167,8 +2162,6 @@ keyspace::make_column_family_config(const schema& s, const db::config& db_config
    cfg.enable_disk_writes = _config.enable_disk_writes;
    cfg.enable_commitlog = _config.enable_commitlog;
    cfg.enable_cache = _config.enable_cache;
-    cfg.max_memtable_size = _config.max_memtable_size;
-    cfg.max_streaming_memtable_size = _config.max_streaming_memtable_size;
    cfg.dirty_memory_manager = _config.dirty_memory_manager;
    cfg.streaming_dirty_memory_manager = _config.streaming_dirty_memory_manager;
    cfg.read_concurrency_config = _config.read_concurrency_config;
@@ -2468,7 +2461,6 @@ column_family::apply(const mutation& m, const db::replay_position& rp) {
    utils::latency_counter lc;
    _stats.writes.set_latency(lc);
    _memtables->active_memtable().apply(m, rp);
-    _memtables->seal_on_overflow();
    _stats.writes.mark(lc);
    if (lc.is_start()) {
        _stats.estimated_write.add(lc.latency(), _stats.writes.hist.count);
@@ -2481,7 +2473,6 @@ column_family::apply(const frozen_mutation& m, const schema_ptr& m_schema, const
    _stats.writes.set_latency(lc);
    check_valid_rp(rp);
    _memtables->active_memtable().apply(m, m_schema, rp);
-    _memtables->seal_on_overflow();
    _stats.writes.mark(lc);
    if (lc.is_start()) {
        _stats.estimated_write.add(lc.latency(), _stats.writes.hist.count);
@@ -2494,7 +2485,6 @@ void column_family::apply_streaming_mutation(schema_ptr m_schema, utils::UUID pl
        return;
    }
    _streaming_memtables->active_memtable().apply(m, m_schema);
-    _streaming_memtables->seal_on_overflow();
 }

 void column_family::apply_streaming_big_mutation(schema_ptr m_schema, utils::UUID plan_id, const frozen_mutation& m) {
@@ -2505,7 +2495,6 @@ void column_family::apply_streaming_big_mutation(schema_ptr m_schema, utils::UUI
    }
    auto entry = it->second;
    entry->memtables->active_memtable().apply(m, m_schema);
-    entry->memtables->seal_on_overflow();
 }

 void
@@ -2517,51 +2506,107 @@ column_family::check_valid_rp(const db::replay_position& rp) const {

 future<> dirty_memory_manager::shutdown() {
    _db_shutdown_requested = true;
-    return _waiting_flush_gate.close().then([this] {
+    _should_flush.signal();
+    return std::move(_waiting_flush).then([this] {
        return _region_group.shutdown();
    });
 }

-void dirty_memory_manager::maybe_do_active_flush() {
-    if (!_db || !under_pressure() || _db_shutdown_requested) {
-        return;
+future<> memtable_list::request_flush() {
+    if (!may_flush()) {
+        return make_ready_future<>();
+    } else if (!_flush_coalescing) {
+        _flush_coalescing = shared_promise<>();
+        return _dirty_memory_manager->get_flush_permit().then([this] (auto permit) {
+            auto current_flush = std::move(*_flush_coalescing);
+            _flush_coalescing = {};
+            return _dirty_memory_manager->flush_one(*this, std::move(permit)).then_wrapped([this, current_flush = std::move(current_flush)] (auto f) mutable {
+                if (f.failed()) {
+                    current_flush.set_exception(f.get_exception());
+                } else {
+                    current_flush.set_value();
+                }
+            });
+        });
+    } else {
+        return _flush_coalescing->get_shared_future();
    }
-
-    // Flush already ongoing. We don't need to initiate an active flush at this moment.
-    if (_flush_serializer.current() == 0) {
-        return;
-    }
-
-    // There are many criteria that can be used to select what is the best memtable to
-    // flush. Most of the time we want some coordination with the commitlog to allow us to
-    // release commitlog segments as early as we can.
-    //
-    // But during pressure condition, we'll just pick the CF that holds the largest
-    // memtable. The advantage of doing this is that this is objectively the one that will
-    // release the biggest amount of memory and is less likely to be generating tiny
-    // SSTables. The disadvantage is that right now, because we only release memory when the
-    // SSTable is fully written, that may take a bit of time to happen.
-    //
-    // However, since we'll very soon have a mechanism in place to account for the memory
-    // that was already written in one form or another, that disadvantage is mitigated.
-    memtable& biggest_memtable = memtable::from_region(*_region_group.get_largest_region());
-    auto& biggest_cf = _db->find_column_family(biggest_memtable.schema());
-    memtable_list& mtlist = get_memtable_list(biggest_cf);
-    // Please note that this will eventually take the semaphore and prevent two concurrent flushes.
-    // We don't need any other extra protection.
-    mtlist.seal_active_memtable(memtable_list::flush_behavior::immediate);
 }

-memtable_list& memtable_dirty_memory_manager::get_memtable_list(column_family& cf) {
-    return *(cf._memtables);
+future<> dirty_memory_manager::flush_one(memtable_list& mtlist, semaphore_units<> permit) {
+    if (mtlist.back()->empty()) {
+        return make_ready_future<>();
+    }
+
+    auto* region = &(mtlist.back()->region());
+    auto schema = mtlist.back()->schema();
+
+    add_to_flush_manager(region, std::move(permit));
+    return get_units(_background_work_flush_serializer, 1).then([this, &mtlist, region, schema] (auto permit) mutable {
+        return mtlist.seal_active_memtable(memtable_list::flush_behavior::immediate).then_wrapped([this, region, schema, permit = std::move(permit)] (auto f) {
+            // There are two cases in which we may still need to remove the permits from here.
+            //
+            // 1) Some exception happened, and we can't know at which point. It could be that because
+            //    of that, the permits are still dangling. We have to remove it.
+            // 2) If we are using a memory-only Column Family. That will never create a memtable
+            //    flush object, and we'll never get rid of the permits. So we have to remove it
+            //    here.
+            this->remove_from_flush_manager(region);
+            if (f.failed()) {
+                dblog.error("Failed to flush memtable, {}:{}", schema->ks_name(), schema->cf_name());
+            }
+            return std::move(f);
+        });
+    });
 }

-memtable_list& streaming_dirty_memory_manager::get_memtable_list(column_family& cf) {
-    return *(cf._streaming_memtables);
+future<> dirty_memory_manager::flush_when_needed() {
+    if (!_db) {
+        return make_ready_future<>();
+    }
+    // If there are explicit flushes requested, we must wait for them to finish before we stop.
+    return do_until([this] { return _db_shutdown_requested && !_flush_serializer.waiters(); }, [this] {
+        auto has_work = [this] { return _flush_serializer.waiters() || over_soft_limit() || _db_shutdown_requested; };
+        return _should_flush.wait(std::move(has_work)).then([this] {
+            return get_flush_permit().then([this] (auto permit) {
+                // We give priority to explicit flushes. They are mainly user-initiated flushes,
+                // flushes coming from a DROP statement, or commitlog flushes.
+                if (_flush_serializer.waiters()) {
+                    return make_ready_future<>();
+                }
+                // condition abated while we waited for the semaphore
+                if (!this->over_soft_limit() || _db_shutdown_requested) {
+                    return make_ready_future<>();
+                }
+                // There are many criteria that can be used to select what is the best memtable to
+                // flush. Most of the time we want some coordination with the commitlog to allow us to
+                // release commitlog segments as early as we can.
+                //
+                // But during pressure condition, we'll just pick the CF that holds the largest
+                // memtable. The advantage of doing this is that this is objectively the one that will
+                // release the biggest amount of memory and is less likely to be generating tiny
+                // SSTables.
+                memtable& candidate_memtable = memtable::from_region(*(this->_region_group.get_largest_region()));
+                dirty_memory_manager* candidate_dirty_manager = &(dirty_memory_manager::from_region_group(candidate_memtable.region_group()));
+                // Do not wait. The semaphore will protect us against a concurrent flush. But we
+                // want to start a new one as soon as the permits are destroyed and the semaphore is
+                // made ready again, not when we are done with the current one.
+                candidate_dirty_manager->flush_one(*(candidate_memtable.get_memtable_list()), std::move(permit));
+                return make_ready_future<>();
+            });
+        });
+    }).finally([this] {
+        // We'll try to acquire the permit here to make sure we only really stop when there are no
+        // in-flight flushes. Our stop condition checks for the presence of waiters, but it could be
+        // that we have no waiters, but a flush still in flight. We wait for all background work to
+        // stop. When that stops, we know that the foreground work in the _flush_serializer has
+        // stopped as well.
+        return get_units(_background_work_flush_serializer, _max_background_work);
+    });
 }

 void dirty_memory_manager::start_reclaiming() {
-    maybe_do_active_flush();
+    _should_flush.signal();
 }

 future<> database::apply_in_memory(const frozen_mutation& m, schema_ptr m_schema, db::replay_position rp) {
@@ -2637,10 +2682,6 @@ database::make_keyspace_config(const keyspace_metadata& ksm) {
        cfg.enable_disk_reads = true; // we allways read from disk
        cfg.enable_commitlog = ksm.durable_writes() && _cfg->enable_commitlog() && !_cfg->enable_in_memory_data_store();
        cfg.enable_cache = _cfg->enable_cache();
-        cfg.max_memtable_size = _memtable_total_space * _cfg->memtable_cleanup_threshold();
-        // We should guarantee that at least two memtable are available, otherwise after flush, adding another memtable would
-        // easily take us into throttling until the first one is flushed.
-        cfg.max_streaming_memtable_size = std::min(cfg.max_memtable_size, _streaming_memtable_total_space / 2);

    } else {
        cfg.datadir = "";
@@ -2648,9 +2689,6 @@ database::make_keyspace_config(const keyspace_metadata& ksm) {
        cfg.enable_disk_reads = false;
        cfg.enable_commitlog = false;
        cfg.enable_cache = false;
-        cfg.max_memtable_size = std::numeric_limits<size_t>::max();
-        // All writes should go to the main memtable list if we're not durable
-        cfg.max_streaming_memtable_size = 0;
    }
    cfg.dirty_memory_manager = &_dirty_memory_manager;
    cfg.streaming_dirty_memory_manager = &_streaming_dirty_memory_manager;
@@ -3097,21 +3135,17 @@ future<std::unordered_map<sstring, column_family::snapshot_details>> column_fami
 future<> column_family::flush() {
    _stats.pending_flushes++;

-    auto fut = _memtables->seal_active_memtable(memtable_list::flush_behavior::immediate);
-    // this rp is either:
-    // a.) Done - no-op
-    // b.) Ours
-    // c.) The last active flush not finished. If our latest memtable is
-    //     empty it still makes sense for this api call to wait for this.
-    auto high_rp = _highest_flushed_rp;
-
-    return fut.finally([this, high_rp] {
+    // highest_flushed_rp is only updated when we flush. If the memtable is currently alive, then
+    // the most up2date replay position is the one that's in there now. Otherwise, if the memtable
+    // hasn't received any writes yet, that's the one from the last flush we made.
+    auto desired_rp = _memtables->back()->empty() ? _highest_flushed_rp : _memtables->back()->replay_position();
+    return _memtables->request_flush().finally([this, desired_rp] {
        _stats.pending_flushes--;
        // In origin memtable_switch_count is incremented inside
        // ColumnFamilyMeetrics Flush.run
        _stats.memtable_switch_count++;
        // wait for all up until us.
-        return _flush_queue->wait_for_pending(high_rp);
+        return _flush_queue->wait_for_pending(desired_rp);
    });
 }

@@ -3128,7 +3162,7 @@ future<> column_family::flush(const db::replay_position& pos) {
    // We ignore this for now and just say that if we're asked for
    // a CF and it exists, we pretty much have to have data that needs
    // flushing. Let's do it.
-    return _memtables->seal_active_memtable(memtable_list::flush_behavior::immediate);
+    return _memtables->request_flush();
 }

 // FIXME: We can do much better than this in terms of cache management. Right
@@ -3169,7 +3203,7 @@ future<> column_family::flush_streaming_big_mutations(utils::UUID plan_id) {
    }
    auto entry = it->second;
    _streaming_memtables_big.erase(it);
-    return entry->memtables->seal_active_memtable(memtable_list::flush_behavior::immediate).then([entry] {
+    return entry->memtables->request_flush().then([entry] {
        return entry->flush_in_progress.close();
    }).then([this, entry] {
        return parallel_for_each(entry->sstables, [this] (auto& sst) {
--- a/database.hh
+++ b/database.hh
@@ -119,28 +119,89 @@ class dirty_memory_manager: public logalloc::region_group_reclaimer {
    // throttled for a long time. Even when we have virtual dirty, that only provides a rough
    // estimate, and we can't release requests that early.
    semaphore _flush_serializer;
+    // We will accept a new flush before another one ends, once it is done with the data write.
+    // That is so we can keep the disk always busy. But there is still some background work that is
+    // left to be done. Mostly, update the caches and seal the auxiliary components of the SSTable.
+    // This semaphore will cap the amount of background work that we have. Note that we're not
+    // overly concerned about memtable memory, because dirty memory will put a limit to that. This
+    // is mostly about dangling continuations. So that doesn't have to be a small number.
+    static constexpr unsigned _max_background_work = 20;
+    semaphore _background_work_flush_serializer = { _max_background_work };
+    condition_variable _should_flush;
    int64_t _dirty_bytes_released_pre_accounted = 0;

-    seastar::gate _waiting_flush_gate;
-    std::vector<shared_memtable> _pending_flushes;
-    void maybe_do_active_flush();
-protected:
-    virtual memtable_list& get_memtable_list(column_family& cf) = 0;
+    future<> flush_when_needed();
+    // We need to start a flush before the current one finishes, otherwise
+    // we'll have a period without significant disk activity when the current
+    // SSTable is being sealed, the caches are being updated, etc. To do that
+    // we need to keep track of which region we are flushing this memory from.
+    struct flush_token {
+        dirty_memory_manager* _dirty_memory_manager;
+        size_t _freed_memory = 0;
+        semaphore_units<> _sem;
+    public:
+        flush_token(dirty_memory_manager *dm, semaphore_units<>&& s) : _dirty_memory_manager(dm), _sem(std::move(s)) {}
+        void mark_end_flush(size_t freed) {
+            auto destroy = std::move(_sem);
+            _freed_memory = freed;
+        }
+        ~flush_token() {
+            _dirty_memory_manager->_region_group.update(_freed_memory);
+            _dirty_memory_manager->_dirty_bytes_released_pre_accounted -= _freed_memory;
+        }
+    };
+    friend class flush_token;
+    std::unordered_map<const logalloc::region*, flush_token> _flush_manager;
+
+    future<> _waiting_flush;
    virtual void start_reclaiming() override;
 public:
    future<> shutdown();

-    dirty_memory_manager(database* db, size_t threshold)
-                                           : logalloc::region_group_reclaimer(threshold)
-                                           , _db(db)
-                                           , _region_group(*this)
-                                           , _flush_serializer(1) {}
+    // Limits and pressure conditions:
+    // ===============================
+    //
+    // Virtual Dirty
+    // -------------
+    // We can't free memory until the whole memtable is flushed because we need to keep it in memory
+    // until the end, but we can fake freeing memory. When we are done with an element of the
+    // memtable, we will update the region group pretending memory just went down by that amount.
+    //
+    // Because the amount of memory that we pretend to free should be close enough to the actual
+    // memory used by the memtables, that effectively creates two sub-regions inside the dirty
+    // region group, of equal size. In the worst case, we will have <memtable_total_space> dirty
+    // bytes used, and half of that already virtually freed.
+    //
+    // Hard Limit
+    // ----------
+    // The total space that can be used by memtables in each group is defined by the threshold, but
+    // we will only allow the region_group to grow to half of that. This is because of virtual_dirty
+    // as explained above. Because virtual dirty is implemented by reducing the usage in the
+    // region_group directly on partition written, we want to throttle every time half of the memory
+    // is in use, as seen by the region_group. To achieve that we need to set the hard limit (first
+    // parameter of the region_group_reclaimer) to 1/2 of the user-supplied threshold.
+    //
+    // Soft Limit
+    // ----------
+    // When the soft limit is hit, no throttle happens. The soft limit exists because we don't want
+    // to start flushing only when the limit is hit, but a bit earlier instead. If we were to start
+    // flushing only when the hard limit is hit, workloads in which the disk is fast enough to cope
+    // would see latency added to some requests unnecessarily.
+    //
+    // We then set the soft limit to 80 % of the virtual dirty hard limit, which is equal to 40 % of
+    // the user-supplied threshold.
+    dirty_memory_manager(database& db, size_t threshold)
+        : logalloc::region_group_reclaimer(threshold / 2, threshold * 0.40)
+        , _db(&db)
+        , _region_group(*this)
+        , _flush_serializer(1)
+        , _waiting_flush(flush_when_needed()) {}

-    dirty_memory_manager(database* db, dirty_memory_manager *parent, size_t threshold)
-                                                                         : logalloc::region_group_reclaimer(threshold)
-                                                                         , _db(db)
-                                                                         , _region_group(&parent->_region_group, *this)
-                                                                         , _flush_serializer(1) {}
+    dirty_memory_manager() : logalloc::region_group_reclaimer()
+        , _db(nullptr)
+        , _region_group(*this)
+        , _flush_serializer(1)
+        , _waiting_flush(make_ready_future<>()) {}

    static dirty_memory_manager& from_region_group(logalloc::region_group *rg) {
        return *(boost::intrusive::get_parent_from_member(rg, &dirty_memory_manager::_region_group));
@@ -154,16 +215,44 @@ public:
        return _region_group;
    }

-    void revert_potentially_cleaned_up_memory(int64_t delta) {
-        _region_group.update(delta);
-        _dirty_bytes_released_pre_accounted -= delta;
+    void revert_potentially_cleaned_up_memory(logalloc::region* from, int64_t delta) {
+        // Flushed the current memtable. There is still some work to do, like finish sealing the
+        // SSTable and updating the cache, but we can already allow the next one to start.
+        //
+        // By erasing this memtable from the flush_manager we'll destroy the semaphore_units
+        // associated with this flush and will allow another one to start. We'll signal the
+        // condition variable to let them know we might be ready early.
+        auto it = _flush_manager.find(from);
+        if (it != _flush_manager.end()) {
+            it->second.mark_end_flush(delta);
+        }
    }

-    void account_potentially_cleaned_up_memory(int64_t delta) {
+    void account_potentially_cleaned_up_memory(logalloc::region* from, int64_t delta) {
        _region_group.update(-delta);
        _dirty_bytes_released_pre_accounted += delta;
    }

+    // This can be called multiple times during the lifetime of the region, and should always
+    // ultimately be called after the flush ends. However, some flushers may decide to call it
+    // earlier. For instance, the normal memtables sealing function will call this before updating
+    // the cache.
+    //
+    // Also, for sealing methods that may retry after a failed write - like the normal memtable
+    // sealing method - calling this method after the attempt is completed, with success or
+    // failure, is mandatory. That's because the new attempt will create a new flush reader for the
+    // same SSTable, so we need to make sure that we revert the old charges.
+    void remove_from_flush_manager(const logalloc::region *region) {
+        auto it = _flush_manager.find(region);
+        if (it != _flush_manager.end()) {
+            _flush_manager.erase(it);
+        }
+    }
+
+    void add_to_flush_manager(const logalloc::region *region, semaphore_units<>&& permit) {
+        _flush_manager.emplace(std::piecewise_construct, std::make_tuple(region), std::make_tuple(this, std::move(permit)));
+    }
+
    size_t real_dirty_memory() const {
        return _region_group.memory_used() + _dirty_bytes_released_pre_accounted;
    }
@@ -172,33 +261,14 @@ public:
        return _region_group.memory_used();
    }

-    template <typename Func>
-    future<> serialize_flush(Func&& func) {
-        return seastar::with_gate(_waiting_flush_gate,  [this, func] () mutable {
-            return with_semaphore(_flush_serializer, 1, func).finally([this] {
-                maybe_do_active_flush();
-            });
-        });
+    future<> flush_one(memtable_list& cf, semaphore_units<> permit);
+
+    future<semaphore_units<>> get_flush_permit() {
+        return get_units(_flush_serializer, 1);
    }
 };

-class streaming_dirty_memory_manager: public dirty_memory_manager {
-    virtual memtable_list& get_memtable_list(column_family& cf) override;
-public:
-    streaming_dirty_memory_manager(database& db, dirty_memory_manager *parent, size_t threshold) : dirty_memory_manager(&db, parent, threshold) {}
-};
-
-class memtable_dirty_memory_manager: public dirty_memory_manager {
-    virtual memtable_list& get_memtable_list(column_family& cf) override;
-public:
-    memtable_dirty_memory_manager(database& db, dirty_memory_manager* parent, size_t threshold) : dirty_memory_manager(&db, parent, threshold) {}
-    // This constructor will be called for the system tables (no parent). Its flushes are usually drive by us
-    // and not the user, and tend to be small in size. So we'll allow only two slots.
-    memtable_dirty_memory_manager(database& db, size_t threshold) : dirty_memory_manager(&db, threshold) {}
-    memtable_dirty_memory_manager() : dirty_memory_manager(nullptr, std::numeric_limits<size_t>::max()) {}
-};
-
-extern thread_local memtable_dirty_memory_manager default_dirty_memory_manager;
+extern thread_local dirty_memory_manager default_dirty_memory_manager;

 // We could just add all memtables, regardless of types, to a single list, and
 // then filter them out when we read them. Here's why I have chosen not to do
@@ -225,18 +295,29 @@ private:
    std::vector<shared_memtable> _memtables;
    std::function<future<> (flush_behavior)> _seal_fn;
    std::function<schema_ptr()> _current_schema;
-    size_t _max_memtable_size;
    dirty_memory_manager* _dirty_memory_manager;
+    std::experimental::optional<shared_promise<>> _flush_coalescing;
 public:
-    memtable_list(std::function<future<> (flush_behavior)> seal_fn, std::function<schema_ptr()> cs, size_t max_memtable_size, dirty_memory_manager* dirty_memory_manager)
+    memtable_list(std::function<future<> (flush_behavior)> seal_fn, std::function<schema_ptr()> cs, dirty_memory_manager* dirty_memory_manager)
        : _memtables({})
        , _seal_fn(seal_fn)
        , _current_schema(cs)
-        , _max_memtable_size(max_memtable_size)
        , _dirty_memory_manager(dirty_memory_manager) {
        add_memtable();
    }

+    memtable_list(std::function<schema_ptr()> cs, dirty_memory_manager* dirty_memory_manager)
+        : _memtables({})
+        , _seal_fn()
+        , _current_schema(cs)
+        , _dirty_memory_manager(dirty_memory_manager) {
+        add_memtable();
+    }
+
+    bool may_flush() const {
+        return bool(_seal_fn);
+    }
+
    shared_memtable back() {
        return _memtables.back();
    }
@@ -281,20 +362,17 @@ public:
        _memtables.emplace_back(new_memtable());
    }

-    bool should_flush() {
-        return active_memtable().occupancy().total_space() >= _max_memtable_size;
-    }
-
-    void seal_on_overflow() {
-        if (should_flush()) {
-            // FIXME: if sparse, do some in-memory compaction first
-            // FIXME: maybe merge with other in-memory memtables
-            seal_active_memtable(flush_behavior::immediate);
-        }
+    logalloc::region_group& region_group() {
+        return _dirty_memory_manager->region_group();
    }
+    // This is used for explicit flushes. Will queue the memtable for flushing and proceed when the
+    // dirty_memory_manager allows us to. We will not seal at this time since the flush itself
+    // wouldn't happen anyway. Keeping the memtable in memory will potentially increase the time it
+    // spends in memory allowing for more coalescing opportunities.
+    future<> request_flush();
 private:
    lw_shared_ptr<memtable> new_memtable() {
-        return make_lw_shared<memtable>(_current_schema(), &(_dirty_memory_manager->region_group()));
+        return make_lw_shared<memtable>(_current_schema(), this);
    }
 };

@@ -328,8 +406,6 @@ public:
        bool enable_cache = true;
        bool enable_commitlog = true;
        bool enable_incremental_backups = false;
-        size_t max_memtable_size = 5'000'000;
-        size_t max_streaming_memtable_size = 5'000'000;
        ::dirty_memory_manager* dirty_memory_manager = &default_dirty_memory_manager;
        ::dirty_memory_manager* streaming_dirty_memory_manager = &default_dirty_memory_manager;
        restricted_mutation_reader_config read_concurrency_config;
@@ -388,9 +464,6 @@ private:
    lw_shared_ptr<memtable_list> _streaming_memtables;
    utils::phased_barrier _streaming_flush_phaser;

-    friend class memtable_dirty_memory_manager;
-    friend class streaming_dirty_memory_manager;
-
    // If mutations are fragmented during streaming the sstables cannot be made
    // visible immediately after memtable flush, because that could cause
    // readers to see only a part of a partition thus violating isolation
@@ -751,7 +824,7 @@ private:
    // repair can now choose whatever strategy - small or big ranges - it wants, resting assure
    // that the incoming memtables will be coalesced together.
    shared_promise<> _waiting_streaming_flushes;
-    timer<> _delayed_streaming_flush{[this] { seal_active_streaming_memtable_immediate(); }};
+    timer<> _delayed_streaming_flush{[this] { _streaming_memtables->request_flush(); }};
    future<> seal_active_streaming_memtable_delayed();
    future<> seal_active_streaming_memtable_immediate();
    future<> seal_active_streaming_memtable(memtable_list::flush_behavior behavior) {
@@ -882,8 +955,6 @@ public:
        bool enable_disk_writes = true;
        bool enable_cache = true;
        bool enable_incremental_backups = false;
-        size_t max_memtable_size = 5'000'000;
-        size_t max_streaming_memtable_size = 5'000'000;
        ::dirty_memory_manager* dirty_memory_manager = &default_dirty_memory_manager;
        ::dirty_memory_manager* streaming_dirty_memory_manager = &default_dirty_memory_manager;
        restricted_mutation_reader_config read_concurrency_config;
@@ -978,11 +1049,11 @@ class database {
    lw_shared_ptr<db_stats> _stats;

    std::unique_ptr<db::config> _cfg;
-    size_t _memtable_total_space = 500 << 20;
-    size_t _streaming_memtable_total_space = 500 << 20;
-    memtable_dirty_memory_manager _system_dirty_memory_manager;
-    memtable_dirty_memory_manager _dirty_memory_manager;
-    streaming_dirty_memory_manager _streaming_dirty_memory_manager;
+
+    dirty_memory_manager _system_dirty_memory_manager;
+    dirty_memory_manager _dirty_memory_manager;
+    dirty_memory_manager _streaming_dirty_memory_manager;
+
    semaphore _read_concurrency_sem{max_concurrent_reads()};
    restricted_mutation_reader_config _read_concurrency_config;
    semaphore _system_read_concurrency_sem{max_system_concurrent_reads()};
--- a/database_fwd.hh
+++ b/database_fwd.hh
@@ -23,6 +23,7 @@

 // database.hh
 class database;
+class memtable_list;

 // mutation.hh
 class mutation;
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -58,6 +58,8 @@
 #include <core/fstream.hh>
 #include <seastar/core/memory.hh>
 #include <seastar/core/chunked_fifo.hh>
+#include <seastar/core/queue.hh>
+#include <seastar/core/sleep.hh>
 #include <net/byteorder.hh>

 #include "commitlog.hh"
@@ -78,6 +80,8 @@

 static logging::logger logger("commitlog");

+using namespace std::chrono_literals;
+
 class crc32_nbo {
    crc32 _c;
 public:
@@ -157,6 +161,7 @@ const std::string db::commitlog::descriptor::FILENAME_EXTENSION(".log");
 class db::commitlog::segment_manager : public ::enable_shared_from_this<segment_manager> {
 public:
    config cfg;
+    std::vector<sstring> _segments_to_replay;
    const uint64_t max_size;
    const uint64_t max_mutation_size;
    // Divide the size-on-disk threshold by #cpus used, since we assume
@@ -164,6 +169,7 @@ public:
    const uint64_t max_disk_size; // per-shard

    bool _shutdown = false;
+    std::experimental::optional<shared_promise<>> _shutdown_promise = {};

    semaphore _new_segment_semaphore {1};
    semaphore _flush_semaphore;
@@ -252,7 +258,7 @@ public:

    scollectd::registrations create_counters();

-    void orphan_all();
+    future<> orphan_all();

    void discard_unused_segments();
    void discard_completed_segments(const cf_id_type& id,
@@ -288,21 +294,19 @@ public:
    void flush_segments(bool = false);

 private:
+    future<> clear_reserve_segments();
+
+    size_t max_request_controller_units() const;
    segment_id_type _ids = 0;
    std::vector<sseg_ptr> _segments;
-    std::deque<sseg_ptr> _reserve_segments;
+    queue<sseg_ptr> _reserve_segments;
    std::vector<buffer_type> _temp_buffers;
    std::unordered_map<flush_handler_id, flush_handler> _flush_handlers;
    flush_handler_id _flush_ids = 0;
    replay_position _flush_position;
    timer<clock_type> _timer;
-    size_t _reserve_allocating = 0;
-    // # segments to try to keep available in reserve
-    // i.e. the amount of segments we expect to consume inbetween timer
-    // callbacks.
-    // The idea is that since the files are 0 len at start, and thus cost little,
-    // it is easier to adapt this value compared to timer freq.
-    size_t _num_reserve_segments = 0;
+    future<> replenish_reserve();
+    future<> _reserve_replenisher;
    seastar::gate _gate;
    uint64_t _new_counter = 0;
 };
@@ -870,7 +874,7 @@ db::commitlog::segment_manager::allocate_when_possible(const cf_id_type& id, sha
    }

    auto fut = get_units(_request_controller, size);
-    if (!fut.available()) {
+    if (_request_controller.waiters()) {
        totals.requests_blocked_memory++;
    }
    return fut.then([this, id, writer = std::move(writer)] (auto permit) mutable {
@@ -911,7 +915,9 @@ db::commitlog::segment_manager::segment_manager(config c)
    // an existing in-flight buffer. Since we'll force the cycling() of any buffer that is bigger
    // than default_size at the end of the allocation, that allows for every valid mutation to
    // always be admitted for processing.
-    , _request_controller(max_mutation_size + db::commitlog::segment::default_size)
+    , _request_controller(max_request_controller_units())
+    , _reserve_segments(1)
+    , _reserve_replenisher(make_ready_future<>())
 {
    assert(max_size > 0);

@@ -922,6 +928,32 @@ db::commitlog::segment_manager::segment_manager(config c)
    _regs = create_counters();
 }

+size_t db::commitlog::segment_manager::max_request_controller_units() const {
+    return max_mutation_size + db::commitlog::segment::default_size;
+}
+
+future<> db::commitlog::segment_manager::replenish_reserve() {
+    return do_until([this] { return _shutdown; }, [this] {
+        return _reserve_segments.not_full().then([this] {
+            if (_shutdown) {
+                return make_ready_future<>();
+            }
+            return with_gate(_gate, [this] {
+                return this->allocate_segment(false).then([this](sseg_ptr s) {
+                    auto ret = _reserve_segments.push(std::move(s));
+                    if (!ret) {
+                        logger.error("Segment reserve is full! Ignoring and trying to continue, but shouldn't happen");
+                    }
+                    return make_ready_future<>();
+                });
+            }).handle_exception([](std::exception_ptr ep) {
+                logger.warn("Exception in segment reservation: {}", ep);
+                return sleep(100ms);
+            });
+        });
+    });
+}
+
 future<std::vector<db::commitlog::descriptor>>
 db::commitlog::segment_manager::list_descriptors(sstring dirname) {
    struct helper {
@@ -981,9 +1013,11 @@ db::commitlog::segment_manager::list_descriptors(sstring dirname) {

 future<> db::commitlog::segment_manager::init() {
    return list_descriptors(cfg.commit_log_location).then([this](std::vector<descriptor> descs) {
+        assert(_reserve_segments.empty()); // _segments_to_replay must not pick them up
        segment_id_type id = std::chrono::duration_cast<std::chrono::milliseconds>(runtime::get_boot_time().time_since_epoch()).count() + 1;
        for (auto& d : descs) {
            id = std::max(id, replay_position(d.id).base_id());
+            _segments_to_replay.push_back(cfg.commit_log_location + "/" + d.filename());
        }

        // base id counter is [ <shard> | <base> ]
@@ -992,6 +1026,9 @@ future<> db::commitlog::segment_manager::init() {
        _timer.set_callback(std::bind(&segment_manager::on_timer, this));
        auto delay = engine().cpu_id() * std::ceil(double(cfg.commitlog_sync_period_in_ms) / smp::count);
        logger.trace("Delaying timer loop {} ms", delay);
+        // We need to wait until we have scanned all other segments to actually start serving new
+        // segments. We are ready now.
+        this->_reserve_replenisher = replenish_reserve();
        this->arm(delay);
    });
 }
@@ -1139,22 +1176,15 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:

    ++_new_counter;

-    if (_reserve_segments.empty()) {
-        if (_num_reserve_segments < cfg.max_reserve_segments) {
-            ++_num_reserve_segments;
-            logger.trace("Increased segment reserve count to {}", _num_reserve_segments);
-        }
-        return allocate_segment(true).then([this](sseg_ptr s) {
-            _segments.push_back(s);
-            return make_ready_future<sseg_ptr>(s);
-        });
+    if (_reserve_segments.empty() && (_reserve_segments.max_size() < cfg.max_reserve_segments)) {
+        _reserve_segments.set_max_size(_reserve_segments.max_size() + 1);
+        logger.debug("Increased segment reserve count to {}", _reserve_segments.max_size());
    }
-
-    _segments.push_back(_reserve_segments.front());
-    _reserve_segments.pop_front();
-    _segments.back()->reset_sync_time();
-    logger.trace("Acquired segment {} from reserve", _segments.back());
-    return make_ready_future<sseg_ptr>(_segments.back());
+    return _reserve_segments.pop_eventually().then([this] (auto s) {
+        _segments.push_back(std::move(s));
+        _segments.back()->reset_sync_time();
+        return make_ready_future<sseg_ptr>(_segments.back());
+    });
 }

 future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::active_segment() {
@@ -1222,6 +1252,15 @@ void db::commitlog::segment_manager::discard_unused_segments() {
    }
 }

+// FIXME: pop() will call unlink -> sleeping in reactor thread.
+// Not urgent since mostly called during shutdown, but have to fix.
+future<> db::commitlog::segment_manager::clear_reserve_segments() {
+    while (!_reserve_segments.empty()) {
+        _reserve_segments.pop();
+    }
+    return make_ready_future<>();
+}
+
 future<> db::commitlog::segment_manager::sync_all_segments(bool shutdown) {
    logger.debug("Issuing sync for all segments");
    return parallel_for_each(_segments, [this, shutdown](sseg_ptr s) {
@@ -1232,19 +1271,40 @@ future<> db::commitlog::segment_manager::sync_all_segments(bool shutdown) {
 }

 future<> db::commitlog::segment_manager::shutdown() {
-    if (!_shutdown) {
-        _shutdown = true; // no re-arm, no create new segments.
-        _timer.cancel(); // no more timer calls
-        // Now first wait for periodic task to finish, then sync and close all
-        // segments, flushing out any remaining data.
-        return _gate.close().then(std::bind(&segment_manager::sync_all_segments, this, true));
+    if (!_shutdown_promise) {
+        _shutdown_promise = shared_promise<>();
+
+        // Wait for all pending requests to finish. Need to sync first because segments that are
+        // alive may be holding semaphore permits.
+        auto block_new_requests = get_units(_request_controller, max_request_controller_units());
+        return sync_all_segments(false).then([this, block_new_requests = std::move(block_new_requests)] () mutable {
+            return std::move(block_new_requests).then([this] (auto permits) {
+                _timer.cancel(); // no more timer calls
+                _shutdown = true; // no re-arm, no create new segments.
+                // Now first wait for periodic task to finish, then sync and close all
+                // segments, flushing out any remaining data.
+                return _gate.close().then(std::bind(&segment_manager::sync_all_segments, this, true));
+            });
+        }).finally([this] {
+            // Now that the gate is closed and requests completed we are sure nobody else will pop()
+            return clear_reserve_segments().finally([this] {
+                return std::move(_reserve_replenisher).then_wrapped([this] (auto f) {
+                    // Could be cleaner with proper seastar support
+                    if (f.failed()) {
+                        _shutdown_promise->set_exception(f.get_exception());
+                    } else {
+                        _shutdown_promise->set_value();
+                    }
+                });
+            });
+        });
    }
-    return make_ready_future<>();
+    return _shutdown_promise->get_shared_future();
 }

-void db::commitlog::segment_manager::orphan_all() {
+future<> db::commitlog::segment_manager::orphan_all() {
    _segments.clear();
-    _reserve_segments.clear();
+    return clear_reserve_segments();
 }

 /*
@@ -1259,7 +1319,7 @@ future<> db::commitlog::segment_manager::clear() {
        for (auto& s : _segments) {
            s->mark_clean();
        }
-        orphan_all();
+        return orphan_all();
    });
 }
 /**
@@ -1290,37 +1350,7 @@ void db::commitlog::segment_manager::on_timer() {
                flush_segments();
            }
        }
-        // take outstanding allocations into regard. This is paranoid,
-        // but if for some reason the file::open takes longer than timer period,
-        // we could flood the reserve list with new segments
-        //
-        // #482 - _reserve_allocating is decremented in the finally clause below.
-        // This is needed because if either allocate_segment _or_ emplacing into
-        // _reserve_segments should throw, we still need the counter reset
-        // However, because of this, it might be that emplace was done, but not decrement,
-        // when we get here again. So occasionally we might get a sum of the two that is
-        // not consistent. It should however always just potentially be _to much_, i.e.
-        // just an indicator that we don't need to do anything. So lets do that.
-        auto n = std::min(_reserve_segments.size() + _reserve_allocating, _num_reserve_segments);
-        return parallel_for_each(boost::irange(n, _num_reserve_segments), [this, n](auto i) {
-            ++_reserve_allocating;
-            return this->allocate_segment(false).then([this](sseg_ptr s) {
-                if (!_shutdown) {
-                    // insertion sort.
-                    auto i = std::upper_bound(_reserve_segments.begin(), _reserve_segments.end(), s, [](sseg_ptr s1, sseg_ptr s2) {
-                        const descriptor& d1 = s1->_desc;
-                        const descriptor& d2 = s2->_desc;
-                        return d1.id < d2.id;
-                    });
-                    i = _reserve_segments.emplace(i, std::move(s));
-                    logger.trace("Added reserve segment {}", *i);
-                }
-            }).finally([this] {
-                --_reserve_allocating;
-            });
-        });
-    }).handle_exception([](std::exception_ptr ep) {
-        logger.warn("Exception in segment reservation: {}", ep);
+        return make_ready_future<>();
    });
    arm();
 }
@@ -1538,6 +1568,15 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne
 subscription<temporary_buffer<char>, db::replay_position>
 db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type off) {
    struct work {
+    private:
+        file_input_stream_options make_file_input_stream_options() {
+            file_input_stream_options fo;
+            fo.buffer_size = db::commitlog::segment::default_size;
+            fo.read_ahead = 10;
+            fo.io_priority_class = service::get_local_commitlog_priority();
+            return fo;
+        }
+    public:
        file f;
        stream<temporary_buffer<char>, replay_position> s;
        input_stream<char> fin;
@@ -1553,7 +1592,7 @@ db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type
        bool header = true;

        work(file f, position_type o = 0)
-                : f(f), fin(make_file_input_stream(f)), start_off(o) {
+                : f(f), fin(make_file_input_stream(f, o, make_file_input_stream_options())), start_off(o) {
        }
        work(work&&) = default;

@@ -1736,6 +1775,8 @@ db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type
                      throw segment_data_corruption_error("Data corruption", corrupt_size);
                  }
                });
+            }).finally([this] {
+                return fin.close();
            });
        }
    };
@@ -1822,3 +1863,6 @@ future<std::vector<sstring>> db::commitlog::list_existing_segments(const sstring
    });
 }

+std::vector<sstring> db::commitlog::get_segments_to_replay() {
+    return std::move(_segment_manager->_segments_to_replay);
+}
--- a/db/commitlog/commitlog.hh
+++ b/db/commitlog/commitlog.hh
@@ -241,6 +241,14 @@ public:
     */
    std::vector<sstring> get_active_segment_names() const;

+    /**
+     * Returns a vector of segment paths which were
+     * preexisting when this instance of commitlog was created.
+     *
+     * The list will be empty when called for the second time.
+     */
+    std::vector<sstring> get_segments_to_replay();
+
    uint64_t get_total_size() const;
    uint64_t get_completed_tasks() const;
    uint64_t get_flush_count() const;
--- a/db/config.hh
+++ b/db/config.hh
@@ -256,7 +256,7 @@ public:
            "Log a warning when compacting partitions larger than this value"   \
    )                                               \
    /* Common memtable settings */  \
-    val(memtable_total_space_in_mb, uint32_t, 0, Used,     \
+    val(memtable_total_space_in_mb, uint32_t, 0, Invalid,     \
            "Specifies the total memory used for all memtables on a node. This replaces the per-table storage settings memtable_operations_in_millions and memtable_throughput_in_mb."  \
    )                                                   \
    /* Common disk settings */  \
@@ -334,7 +334,7 @@ public:
            "\toffheap_buffers  Off heap (direct) NIO buffers.\n"   \
            "\toffheap_objects  Native memory, eliminating NIO buffer heap overhead."   \
    )                                                   \
-    val(memtable_cleanup_threshold, double, .11, Used, \
+    val(memtable_cleanup_threshold, double, .11, Invalid, \
            "Ratio of occupied non-flushing memtable size to total permitted size for triggering a flush of the largest memtable. Larger values mean larger flushes and less compaction, but also less concurrent flush activity, which can make it difficult to keep your disks saturated under heavy write load." \
    )   \
    val(file_cache_size_in_mb, uint32_t, 512, Unused,  \
--- a/db/system_keyspace.cc
+++ b/db/system_keyspace.cc
@@ -1022,6 +1022,10 @@ std::vector<schema_ptr> all_tables() {
    return r;
 }

+static bool maybe_write_in_user_memory(schema_ptr s, database& db) {
+    return (s.get() == batchlog().get());
+}
+
 void make(database& db, bool durable, bool volatile_testing_only) {
    auto ksm = make_lw_shared<keyspace_metadata>(NAME,
            "org.apache.cassandra.locator.LocalStrategy",
@@ -1045,7 +1049,11 @@ void make(database& db, bool durable, bool volatile_testing_only) {
    db.add_keyspace(NAME, std::move(_ks));
    auto& ks = db.find_keyspace(NAME);
    for (auto&& table : all_tables()) {
-        db.add_column_family(table, ks.make_column_family_config(*table, db.get_config()));
+        auto cfg = ks.make_column_family_config(*table, db.get_config());
+        if (maybe_write_in_user_memory(table, db)) {
+            cfg.dirty_memory_manager = &db._dirty_memory_manager;
+        }
+        db.add_column_family(table, std::move(cfg));
    }
 }

--- a/dist/ami/build_ami.sh
+++ b/dist/ami/build_ami.sh
@@ -8,7 +8,9 @@ fi
 print_usage() {
    echo "build_ami.sh --localrpm --repo [URL]"
    echo "  --localrpm  deploy locally built rpms"
-    echo "  --repo  specify .repo/.list file URL"
+    echo "  --repo  repository for both install and update, specify .repo/.list file URL"
+    echo "  --repo-for-install  repository for install, specify .repo/.list file URL"
+    echo "  --repo-for-update  repository for update, specify .repo/.list file URL"
    exit 1
 }
 LOCALRPM=0
@@ -24,6 +26,14 @@ while [ $# -gt 0 ]; do
            INSTALL_ARGS="$INSTALL_ARGS --repo $2"
            shift 2
            ;;
+        "--repo-for-install")
+            INSTALL_ARGS="$INSTALL_ARGS --repo-for-install $2"
+            shift 2
+            ;;
+        "--repo-for-update")
+            INSTALL_ARGS="$INSTALL_ARGS --repo-for-update $2"
+            shift 2
+            ;;
        *)
            print_usage
            ;;
--- a/dist/ami/files/scylla-ami
+++ b/dist/ami/files/scylla-ami
--- a/dist/common/scripts/scylla_kernel_check
+++ b/dist/common/scripts/scylla_kernel_check
@@ -30,6 +30,6 @@ else
    else
        echo "Please upgrade to a newer kernel version."
    fi
-    echo " see http://docs.scylladb.com/kb/kb-fs-not-qualified-aio/ for details"
+    echo " see http://www.scylladb.com/kb/kb-fs-not-qualified-aio/ for details"
 fi
 exit $RET
--- a/dist/common/systemd/scylla-housekeeping.timer
+++ b/dist/common/systemd/scylla-housekeeping.timer
@@ -4,7 +4,8 @@ After=scylla-server.service
 BindsTo=scylla-server.service

 [Timer]
-OnBootSec=0
+# set OnActiveSec to 3 to safely avoid issues/1846
+OnActiveSec=3
 OnUnitActiveSec=1d

 [Install]
--- a/dist/common/systemd/scylla-server.service.in
+++ b/dist/common/systemd/scylla-server.service.in
@@ -8,7 +8,7 @@ Wants=scylla-housekeeping.timer
 PermissionsStartOnly=true
 Type=notify
 LimitMEMLOCK=infinity
-LimitNOFILE=200000
+LimitNOFILE=800000
 LimitAS=infinity
 LimitNPROC=8096
 EnvironmentFile=@@SYSCONFDIR@@/scylla-server
--- a/dist/docker/redhat/Dockerfile
+++ b/dist/docker/redhat/Dockerfile
@@ -7,7 +7,7 @@ ENV container docker
 VOLUME [ "/sys/fs/cgroup" ]

 #install scylla
-RUN curl http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo -o /etc/yum.repos.d/scylla.repo
+RUN curl http://downloads.scylladb.com/rpm/centos/scylla-1.5.repo -o /etc/yum.repos.d/scylla.repo
 RUN yum -y install epel-release
 RUN yum -y clean expire-cache
 RUN yum -y update
--- a/dist/ubuntu/debian/scylla-server.upstart
+++ b/dist/ubuntu/debian/scylla-server.upstart
@@ -20,7 +20,7 @@ setuid scylla
 setgid scylla
 limit core unlimited unlimited
 limit memlock unlimited unlimited
-limit nofile 200000 200000
+limit nofile 800000 800000
 limit as unlimited unlimited
 limit nproc 8096 8096
 chdir /var/lib/scylla
--- a/main.cc
+++ b/main.cc
@@ -184,8 +184,8 @@ public:
                        throw;
                    }
                });
-            } catch (std::system_error& e) {
-                startlog.error("Directory '{}' not found. Tried to created it but failed: {}", path, e.what());
+            } catch (...) {
+                startlog.error("Directory '{}' cannot be initialized. Tried to do it but failed with: {}", path, std::current_exception());
                throw;
            }
        });
@@ -466,6 +466,10 @@ int main(int ac, char** av) {
            // that can happen, making existing problems worse. So running a single shard first
            // and getting making sure that all temporary tables are deleted provides extra
            // protection against such situations.
+            //
+            // We also need to init commitlog on shard0 before it is inited on other shards
+            // because it obtains the list of pre-existing segments for replay, which must
+            // not include reserve segments created by active commitlogs.
            db.invoke_on(0, [] (database& db) { return db.init_system_keyspace(); }).get();
            db.invoke_on_all([] (database& db) {
                if (engine().cpu_id() == 0) {
@@ -538,7 +542,7 @@ int main(int ac, char** av) {
            supervisor_notify("starting commit log");
            auto cl = db.local().commitlog();
            if (cl != nullptr) {
-                auto paths = cl->list_existing_segments().get0();
+                auto paths = cl->get_segments_to_replay();
                if (!paths.empty()) {
                    supervisor_notify("replaying commit log");
                    auto rp = db::commitlog_replayer::create_replayer(qp).get0();
--- a/memtable.cc
+++ b/memtable.cc
@@ -26,8 +26,16 @@

 namespace stdx = std::experimental;

-memtable::memtable(schema_ptr schema, logalloc::region_group* dirty_memory_region_group)
+memtable::memtable(schema_ptr schema, memtable_list* memtable_list)
+        : logalloc::region(memtable_list ? logalloc::region(memtable_list->region_group()) : logalloc::region())
+        , _memtable_list(memtable_list)
+        , _schema(std::move(schema))
+        , partitions(memtable_entry::compare(_schema)) {
+}
+
+memtable::memtable(schema_ptr schema, logalloc::region_group *dirty_memory_region_group)
        : logalloc::region(dirty_memory_region_group ? logalloc::region(*dirty_memory_region_group) : logalloc::region())
+        , _memtable_list(nullptr)
        , _schema(std::move(schema))
        , partitions(memtable_entry::compare(_schema)) {
 }
@@ -254,7 +262,7 @@ class flush_memory_accounter {
 public:
    void update_bytes_read(uint64_t delta) {
        _bytes_read += delta;
-        dirty_memory_manager::from_region_group(_region.group()).account_potentially_cleaned_up_memory(delta);
+        dirty_memory_manager::from_region_group(_region.group()).account_potentially_cleaned_up_memory(&_region, delta);
    }

    explicit flush_memory_accounter(logalloc::region& region)
@@ -263,7 +271,7 @@ public:

    ~flush_memory_accounter() {
        assert(_bytes_read <= _region.occupancy().used_space());
-        dirty_memory_manager::from_region_group(_region.group()).revert_potentially_cleaned_up_memory(_bytes_read);
+        dirty_memory_manager::from_region_group(_region.group()).revert_potentially_cleaned_up_memory(&_region, _bytes_read);
    }
    void account_component(memtable_entry& e) {
        auto delta = _region.allocator().object_memory_size_in_allocator(&e)
--- a/memtable.hh
+++ b/memtable.hh
@@ -101,6 +101,7 @@ public:
        bi::member_hook<memtable_entry, bi::set_member_hook<>, &memtable_entry::_link>,
        bi::compare<memtable_entry::compare>>;
 private:
+    memtable_list *_memtable_list;
    schema_ptr _schema;
    logalloc::allocating_section _read_section;
    logalloc::allocating_section _allocating_section;
@@ -116,7 +117,9 @@ private:
    partition_entry& find_or_create_partition_slow(partition_key_view key);
    void upgrade_entry(memtable_entry&);
 public:
-    explicit memtable(schema_ptr schema, logalloc::region_group* dirty_memory_region_group = nullptr);
+    explicit memtable(schema_ptr schema, memtable_list *memtable_list);
+    // Used by tests that want to control the flush process.
+    explicit memtable(schema_ptr schema, logalloc::region_group *dirty_memrory_region= nullptr);
    ~memtable();
    schema_ptr schema() const { return _schema; }
    void set_schema(schema_ptr) noexcept;
@@ -134,7 +137,15 @@ public:
    const logalloc::region& region() const {
        return *this;
    }
+
+    logalloc::region_group* region_group() {
+        return group();
+    }
 public:
+    memtable_list* get_memtable_list() {
+        return _memtable_list;
+    }
+
    size_t partition_count() const;
    logalloc::occupancy_stats occupancy() const;

--- a/partition_version.hh
+++ b/partition_version.hh
@@ -474,9 +474,9 @@ public:
                try {
                    _read_section(_lsa_region, [this] {
                        _snapshot->merge_partition_versions();
-                        _snapshot = {};
                    });
                } catch (...) { }
+                _snapshot = {};
            });
        });
    }
--- a/row_cache.cc
+++ b/row_cache.cc
@@ -74,8 +74,7 @@ cache_tracker::cache_tracker() {
            }
            cache_entry& ce = _lru.back();
            auto it = row_cache::partitions_type::s_iterator_to(ce);
-            --it;
-            clear_continuity(*it);
+            clear_continuity(*std::next(it));
            _lru.pop_back_and_dispose(current_deleter<cache_entry>());
            --_partitions;
            ++_evictions;
@@ -365,6 +364,7 @@ public:
            ++_it;
            _last = ce.key();
            _cache.upgrade_entry(ce);
+            _cache._tracker.touch(ce);
            _cache.on_hit();
            cache_data cd { { }, ce.continuous() };
            if (ce.wide_partition()) {
@@ -546,7 +546,6 @@ private:
        if (!_first_element) {
            return false;
        }
-        _first_element = false;
        return _pr.start() && _pr.start()->is_inclusive() && _pr.start()->value().equal(*_schema, dk);
    }

@@ -554,6 +553,7 @@ private:
        return _primary_reader().then([this] (just_cache_scanning_reader::cache_data cd) {
            auto& smopt = cd.mut;
            if (cd.continuous || (smopt && is_inclusive_start_bound(smopt->decorated_key()))) {
+                _first_element = false;
                update_last_key(smopt);
                return make_ready_future<streamed_mutation_opt>(std::move(smopt));
            } else {
@@ -682,7 +682,9 @@ row_cache::make_reader(schema_ptr s,
 row_cache::~row_cache() {
    with_allocator(_tracker.allocator(), [this] {
        _partitions.clear_and_dispose([this, deleter = current_deleter<cache_entry>()] (auto&& p) mutable {
-            _tracker.on_erase();
+            if (!p->is_dummy_entry()) {
+                _tracker.on_erase();
+            }
            deleter(p);
        });
    });
@@ -720,7 +722,9 @@ void row_cache::do_find_or_create_entry(const dht::decorated_key& key,
                    return;
                }

-                if ((!previous->_key && i == _partitions.begin()) || (previous->_key && std::prev(i)->key().equal(*_schema, *previous->_key))) {
+                if ((!previous->_key && i == _partitions.begin())
+                    || (previous->_key && i != _partitions.begin()
+                        && std::prev(i)->key().equal(*_schema, *previous->_key))) {
                    i->set_continuous(true);
                }
            });
--- a/scripts/scylla_install_pkg
+++ b/scripts/scylla_install_pkg
@@ -10,13 +10,16 @@ fi
 print_usage() {
    echo "scylla_install_pkg --local-pkg /home/scylla/rpms --repo [URL]"
    echo "  --local-pkg	install locally built .rpm/.deb on specified directory"
-    echo "  --repo specify .repo/.list file URL"
+    echo "  --repo  repository for both install and update, specify .repo/.list file URL"
+    echo "  --repo-for-install  repository for install, specify .repo/.list file URL"
+    echo "  --repo-for-update  repository for update, specify .repo/.list file URL"
    exit 1
 }

 LOCAL_PKG=
 UNSTABLE=0
-REPO=
+REPO_FOR_INSTALL=
+REPO_FOR_UPDATE=
 while [ $# -gt 0 ]; do
    case "$1" in
        "--local-pkg")
@@ -24,7 +27,16 @@ while [ $# -gt 0 ]; do
            shift 2
            ;;
        "--repo")
-            REPO=$2
+            REPO_FOR_INSTALL=$2
+            REPO_FOR_UPDATE=$2
+            shift 2
+            ;;
+        "--repo-for-install")
+            REPO_FOR_INSTALL=$2
+            shift 2
+            ;;
+        "--repo-for-update")
+            REPO_FOR_UPDATE=$2
            shift 2
            ;;
        *)
@@ -42,8 +54,8 @@ if [ "$ID" = "ubuntu" ]; then
    chmod +x /usr/sbin/policy-rc.d
    cp /etc/hosts /etc/hosts.orig
    echo 127.0.0.1 `hostname` >> /etc/hosts
-    if [ "$REPO" != "" ]; then
-        curl -o /etc/apt/sources.list.d/scylla.list $REPO
+    if [ "$REPO_FOR_INSTALL" != "" ]; then
+        curl -o /etc/apt/sources.list.d/scylla_install.list $REPO_FOR_INSTALL
    fi
    apt-get update
    if [ "$LOCAL_PKG" = "" ]; then
@@ -62,9 +74,14 @@ if [ "$ID" = "ubuntu" ]; then
    fi
    mv /etc/hosts.orig /etc/hosts
    rm /usr/sbin/policy-rc.d
+    rm /etc/apt/sources.list.d/scylla_install.list
+    if [ "$REPO_FOR_UPDATE" != "" ]; then
+        curl -o /etc/apt/sources.list.d/scylla.list $REPO_FOR_UPDATE
+    fi
+    apt-get update
 else
-    if [ "$REPO" != "" ]; then
-        curl -o /etc/yum.repos.d/scylla.repo $REPO
+    if [ "$REPO_FOR_INSTALL" != "" ]; then
+        curl -o /etc/yum.repos.d/scylla_install.repo $REPO_FOR_INSTALL
    fi

    if [ "$ID" = "centos" ]; then
@@ -81,4 +98,9 @@ else
    else
        yum install -y $LOCAL_PKG/scylla*.*.rpm
    fi
+
+    rm /etc/yum.repos.d/scylla_install.repo
+    if [ "$REPO_FOR_UPDATE" != "" ]; then
+        curl -o /etc/yum.repos.d/scylla.repo $REPO_FOR_UPDATE
+    fi
 fi
--- a/2
+++ b/2
--- a/service/pager/query_pagers.cc
+++ b/service/pager/query_pagers.cc
@@ -54,7 +54,7 @@ public:
                    const cql3::query_options& options,
                    lw_shared_ptr<query::read_command> cmd,
                    std::vector<query::partition_range> ranges)
-                    : _has_clustering_keys(s->clustering_key_size() > 0)
+                    : _has_clustering_keys(has_clustering_keys(*s, *cmd))
                    , _max(cmd->row_limit)
                    , _schema(std::move(s))
                    , _selection(selection)
@@ -65,6 +65,11 @@ public:
    {}

 private:   
+    static bool has_clustering_keys(const schema& s, const query::read_command& cmd) {
+        return s.clustering_key_size() > 0
+               && !cmd.slice.options.contains<query::partition_slice::option::distinct>();
+    }
+
    future<> fetch_page(cql3::selection::result_set_builder& builder, uint32_t page_size, db_clock::time_point now) override {
        auto state = _options.get_paging_state();

--- a/service/storage_proxy.cc
+++ b/service/storage_proxy.cc
@@ -2643,12 +2643,19 @@ storage_proxy::query_partition_key_range(lw_shared_ptr<query::read_command> cmd,
        }
    }

+    // estimate_result_rows_per_range() is currently broken, and this is not needed
+    // when paging is available in any case
+#if 0
    // our estimate of how many result rows there will be per-range
    float result_rows_per_range = estimate_result_rows_per_range(cmd, ks);
    // underestimate how many rows we will get per-range in order to increase the likelihood that we'll
    // fetch enough rows in the first round
    result_rows_per_range -= result_rows_per_range * CONCURRENT_SUBREQUESTS_MARGIN;
    int concurrency_factor = result_rows_per_range == 0.0 ? 1 : std::max(1, std::min(int(ranges.size()), int(std::ceil(cmd->row_limit / result_rows_per_range))));
+#else
+    int result_rows_per_range = 0;
+    int concurrency_factor = 1;
+#endif

    std::vector<foreign_ptr<lw_shared_ptr<query::result>>> results;
    results.reserve(ranges.size()/concurrency_factor + 1);
--- a/sstables/atomic_deletion.cc
+++ b/sstables/atomic_deletion.cc
@@ -0,0 +1,140 @@
+/*
+ * Copyright (C) 2016 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "atomic_deletion.hh"
+#include "to_string.hh"
+#include <seastar/core/shared_future.hh>
+#include <boost/range/adaptor/transformed.hpp>
+#include <boost/range/algorithm/copy.hpp>
+
+namespace sstables {
+
+atomic_deletion_manager::atomic_deletion_manager(unsigned shard_count,
+        std::function<future<> (std::vector<sstring> sstables)> delete_sstables)
+        : _shard_count(shard_count)
+        , _delete_sstables(std::move(delete_sstables)) {
+}
+
+future<>
+atomic_deletion_manager::delete_atomically(std::vector<sstable_to_delete> atomic_deletion_set, unsigned deleting_shard) {
+    // runs on shard 0 only
+    _deletion_logger.debug("shard {} atomically deleting {}", deleting_shard, atomic_deletion_set);
+
+    if (_atomic_deletions_cancelled) {
+        _deletion_logger.debug("atomic deletions disabled, erroring out");
+        using boost::adaptors::transformed;
+        throw atomic_deletion_cancelled(atomic_deletion_set
+                                        | transformed(std::mem_fn(&sstable_to_delete::name)));
+    }
+
+    // Insert atomic_deletion_set into the list of sets pending deletion.  If the new set
+    // overlaps with an existing set, merge them (the merged set will be deleted atomically).
+    std::unordered_map<sstring, lw_shared_ptr<pending_deletion>> new_atomic_deletion_sets;
+    auto merged_set = make_lw_shared(pending_deletion());
+    for (auto&& sst_to_delete : atomic_deletion_set) {
+        merged_set->names.insert(sst_to_delete.name);
+        if (!sst_to_delete.shared) {
+            for (auto shard : boost::irange<shard_id>(0, _shard_count)) {
+                _shards_agreeing_to_delete_sstable[sst_to_delete.name].insert(shard);
+            }
+        }
+        new_atomic_deletion_sets.emplace(sst_to_delete.name, merged_set);
+    }
+    auto pr = make_lw_shared<promise<>>();
+    merged_set->completions.insert(pr);
+    auto ret = pr->get_future();
+    for (auto&& sst_to_delete : atomic_deletion_set) {
+        auto i = _atomic_deletion_sets.find(sst_to_delete.name);
+        // merge from old deletion set to new deletion set
+        // i->second can be nullptr, see below why
+        if (i != _atomic_deletion_sets.end() && i->second) {
+            boost::copy(i->second->names, std::inserter(merged_set->names, merged_set->names.end()));
+            boost::copy(i->second->completions, std::inserter(merged_set->completions, merged_set->completions.end()));
+        }
+    }
+    _deletion_logger.debug("new atomic set: {}", merged_set->names);
+    // we need to merge new_atomic_deletion_sets into _atomic_deletion_sets,
+    // but beware of exceptions.  We do that with a first pass that inserts
+    // nullptr as the value, so the second pass only replaces, and does not allocate
+    for (auto&& sst_to_delete : atomic_deletion_set) {
+        _atomic_deletion_sets.emplace(sst_to_delete.name, nullptr);
+    }
+    // now, no allocations are involved, so this commits the operation atomically
+    for (auto&& n : merged_set->names) {
+        auto i = _atomic_deletion_sets.find(n);
+        i->second = merged_set;
+    }
+
+    // Mark each sstable as being deleted from deleting_shard.  We have to do
+    // this in a separate pass, so the consideration whether we can delete or not
+    // sees all the data from this pass.
+    for (auto&& sst : atomic_deletion_set) {
+        _shards_agreeing_to_delete_sstable[sst.name].insert(deleting_shard);
+    }
+
+    // Figure out if the (possibly merged) set can be deleted
+    for (auto&& sst : merged_set->names) {
+        if (_shards_agreeing_to_delete_sstable[sst].size() != _shard_count) {
+            // Not everyone agrees, leave the set pending
+            _deletion_logger.debug("deferring deletion until all shards agree");
+            return ret;
+        }
+    }
+
+    // Cannot recover from a failed deletion
+    for (auto&& name : merged_set->names) {
+        _atomic_deletion_sets.erase(name);
+        _shards_agreeing_to_delete_sstable.erase(name);
+    }
+
+    // Everyone agrees, let's delete
+    auto names = boost::copy_range<std::vector<sstring>>(merged_set->names);
+    _deletion_logger.debug("deleting {}", names);
+    return _delete_sstables(names).then_wrapped([this, merged_set] (future<> result) {
+        _deletion_logger.debug("atomic deletion completed: {}", merged_set->names);
+        shared_future<> sf(std::move(result));
+        for (auto&& comp : merged_set->completions) {
+            sf.get_future().forward_to(std::move(*comp));
+        }
+    });
+
+    return ret;
+}
+
+void
+atomic_deletion_manager::cancel_atomic_deletions() {
+    _atomic_deletions_cancelled = true;
+    for (auto&& pd : _atomic_deletion_sets) {
+        if (!pd.second) {
+            // Could happen if a delete_atomically() failed
+            continue;
+        }
+        for (auto&& c : pd.second->completions) {
+            c->set_exception(atomic_deletion_cancelled(pd.second->names));
+        }
+        // since sets are shared, make sure we don't hit the same one again
+        pd.second->completions.clear();
+    }
+    _atomic_deletion_sets.clear();
+    _shards_agreeing_to_delete_sstable.clear();
+}
+
+}
--- a/sstables/atomic_deletion.hh
+++ b/sstables/atomic_deletion.hh
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2016 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+// The atomic deletion manager solves the problem of orchestrating
+// the deletion of files that must be deleted as a group, where each
+// shard has different groups, and all shards must agree to delete a
+// file before it is actually deleted.  For example,
+//
+//  shard 0: delete "A"
+//     we can't delete anything because shard 1 hasn't agreed yet.
+//  shard 1: delete "A" and "B"
+//     shard 1 agrees to delete "A", but we can't delete it yet,
+//     because shard 1 requires that it be deleted together with "B",
+//     and shard 0 hasn't agreed to delete "B" yet.
+//  shard 0: delete "B" and "C"
+//     shards 0 and 1 now both agree to delete "A" and "B", but shard 0
+//     doesn't allow us to delete "B" without "C".
+//  shard 1: delete "C"
+//     finally, we can delete "A", "B", and "C".
+
+#include "log.hh"
+#include <seastar/core/future.hh>
+#include <seastar/core/future-util.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/core/sstring.hh>
+#include <seastar/core/reactor.hh> // for shard_id
+#include <unordered_set>
+#include <unordered_map>
+#include <vector>
+
+namespace sstables {
+
+struct sstable_to_delete {
+    sstable_to_delete(sstring name, bool shared) : name(std::move(name)), shared(shared) {}
+    sstring name;
+    bool shared = false;
+    friend std::ostream& operator<<(std::ostream& os, const sstable_to_delete& std);
+};
+
+class atomic_deletion_cancelled : public std::exception {
+    std::string _msg;
+public:
+    explicit atomic_deletion_cancelled(std::vector<sstring> names);
+    template <typename StringRange>
+    explicit atomic_deletion_cancelled(StringRange range)
+            : atomic_deletion_cancelled(std::vector<sstring>{range.begin(), range.end()}) {
+    }
+    const char* what() const noexcept override;
+};
+
+class atomic_deletion_manager {
+    logging::logger _deletion_logger{"sstable-deletion"};
+    using shards_agreeing_to_delete_sstable_type = std::unordered_set<shard_id>;
+    using sstables_to_delete_atomically_type = std::set<sstring>;
+    struct pending_deletion {
+        sstables_to_delete_atomically_type names;
+        std::unordered_set<lw_shared_ptr<promise<>>> completions;
+    };
+    bool _atomic_deletions_cancelled = false;
+    // map from sstable name to a set of sstables that must be deleted atomically, including itself
+    std::unordered_map<sstring, lw_shared_ptr<pending_deletion>> _atomic_deletion_sets;
+    std::unordered_map<sstring, shards_agreeing_to_delete_sstable_type> _shards_agreeing_to_delete_sstable;
+    unsigned _shard_count;
+    std::function<future<> (std::vector<sstring> sstables)> _delete_sstables;
+public:
+    atomic_deletion_manager(unsigned shard_count,
+            std::function<future<> (std::vector<sstring> sstables)> delete_sstables);
+    future<> delete_atomically(std::vector<sstable_to_delete> atomic_deletion_set, unsigned deleting_shard);
+    void cancel_atomic_deletions();
+};
+
+}
--- a/sstables/compress.cc
+++ b/sstables/compress.cc
@@ -121,7 +121,11 @@ size_t compress_lz4(const char* input, size_t input_len,
    output[1] = (input_len >> 8) & 0xFF;
    output[2] = (input_len >> 16) & 0xFF;
    output[3] = (input_len >> 24) & 0xFF;
+#ifdef HAVE_LZ4_COMPRESS_DEFAULT
+    auto ret = LZ4_compress_default(input, output + 4, input_len, LZ4_compressBound(input_len));
+#else
    auto ret = LZ4_compress(input, output + 4, input_len);
+#endif
    if (ret == 0) {
        throw std::runtime_error("LZ4 compression failure: LZ4_compress() failed");
    }
--- a/sstables/consumer.hh
+++ b/sstables/consumer.hh
@@ -305,7 +305,7 @@ public:
        _remain = end - _stream_position;

        _prestate = prestate::NONE;
-        state_processor().reset();
+        state_processor().reset(begin);
        return _input.skip(n);
    }

--- a/sstables/index_reader.hh
+++ b/sstables/index_reader.hh
@@ -38,7 +38,7 @@ public:
    bool should_continue() {
        return indexes.size() < max_quantity;
    }
-    void consume_entry(index_entry&& ie) {
+    void consume_entry(index_entry&& ie, uint64_t offset) {
        indexes.push_back(std::move(ie));
    }
    void reset() {
@@ -49,13 +49,14 @@ public:
 // IndexConsumer is a concept that implements:
 //
 // bool should_continue();
-// void consume_entry(index_entry&& ie);
+// void consume_entry(index_entry&& ie, uint64_t offset);
 template <class IndexConsumer>
 class index_consume_entry_context: public data_consumer::continuous_data_consumer<index_consume_entry_context<IndexConsumer>> {
    using proceed = data_consumer::proceed;
    using continuous_data_consumer = data_consumer::continuous_data_consumer<index_consume_entry_context<IndexConsumer>>;
 private:
    IndexConsumer& _consumer;
+    uint64_t _entry_offset;

    enum class state {
        START,
@@ -113,9 +114,12 @@ public:
                _state = state::CONSUME_ENTRY;
                break;
            }
-        case state::CONSUME_ENTRY:
-            _consumer.consume_entry(index_entry(std::move(_key), this->_u64, std::move(_promoted)));
+        case state::CONSUME_ENTRY: {
+            auto len = (_key.size() + _promoted.size() + 14);
+            _consumer.consume_entry(index_entry(std::move(_key), this->_u64, std::move(_promoted)), _entry_offset);
+            _entry_offset += len;
            _state = state::START;
+        }
            break;
        default:
            throw malformed_sstable_exception("unknown state");
@@ -126,11 +130,12 @@ public:
    index_consume_entry_context(IndexConsumer& consumer,
            input_stream<char>&& input, uint64_t start, uint64_t maxlen)
        : continuous_data_consumer(std::move(input), start, maxlen)
-        , _consumer(consumer)
+        , _consumer(consumer), _entry_offset(start)
    {}

-    void reset() {
+    void reset(uint64_t offset) {
        _state = state::START;
+        _entry_offset = offset;
        _consumer.reset();
    }
 };
--- a/sstables/row.cc
+++ b/sstables/row.cc
@@ -374,7 +374,7 @@ public:
        }
    }

-    void reset() {
+    void reset(uint64_t offset) {
        _state = state::ROW_START;
        _consumer.reset();
    }
--- a/sstables/sstables.cc
+++ b/sstables/sstables.cc
@@ -741,10 +741,10 @@ future<> sstable::read_toc() {
                    continue;
                }
                try {
-                   _components.insert(reverse_map(c, _component_map));
+                    _components.insert(reverse_map(c, _component_map));
                } catch (std::out_of_range& oor) {
-                    _components.clear(); // so subsequent read_toc will be forced to fail again
-                    throw malformed_sstable_exception("Unrecognized TOC component: " + c, file_path);
+                    _unrecognized_components.push_back(c);
+                    sstlog.info("Unrecognized TOC component was found: {} in sstable {}", c, file_path);
                }
            }
            if (!_components.size()) {
@@ -1867,8 +1867,8 @@ future<> sstable::generate_summary(const io_priority_class& pc) {
        bool should_continue() {
            return true;
        }
-        void consume_entry(index_entry&& ie) {
-            maybe_add_summary_entry(_summary, ie.get_key_bytes(), ie.position());
+        void consume_entry(index_entry&& ie, uint64_t offset) {
+            maybe_add_summary_entry(_summary, ie.get_key_bytes(), offset);
            if (!first_key) {
                first_key = key(to_bytes(ie.get_key_bytes()));
            } else {
@@ -1957,6 +1957,28 @@ const sstring sstable::filename(sstring dir, sstring ks, sstring cf, version_typ
    return dir + "/" + strmap[version](entry_descriptor(ks, cf, version, generation, format, component));
 }

+const sstring sstable::filename(sstring dir, sstring ks, sstring cf, version_types version, int64_t generation,
+                                format_types format, sstring component) {
+    static std::unordered_map<version_types, const char*, enum_hash<version_types>> fmtmap = {
+        { sstable::version_types::ka, "{0}-{1}-{2}-{3}-{5}" },
+        { sstable::version_types::la, "{2}-{3}-{4}-{5}" }
+    };
+
+    return dir + "/" + seastar::format(fmtmap[version], ks, cf, _version_string.at(version), to_sstring(generation), _format_string.at(format), component);
+}
+
+std::vector<std::pair<sstable::component_type, sstring>> sstable::all_components() const {
+    std::vector<std::pair<component_type, sstring>> all;
+    all.reserve(_components.size() + _unrecognized_components.size());
+    for (auto& c : _components) {
+        all.push_back(std::make_pair(c, _component_map.at(c)));
+    }
+    for (auto& c : _unrecognized_components) {
+        all.push_back(std::make_pair(component_type::Unknown, c));
+    }
+    return all;
+}
+
 future<> sstable::create_links(sstring dir, int64_t generation) const {
    // TemporaryTOC is always first, TOC is always last
    auto dst = sstable::filename(dir, _schema->ks_name(), _schema->cf_name(), _version, generation, _format, component_type::TemporaryTOC);
@@ -1964,12 +1986,13 @@ future<> sstable::create_links(sstring dir, int64_t generation) const {
        return sstable_write_io_check(sync_directory, dir);
    }).then([this, dir, generation] {
        // FIXME: Should clean already-created links if we failed midway.
-        return parallel_for_each(_components, [this, dir, generation] (auto comp) {
-            if (comp == component_type::TOC) {
+        return parallel_for_each(all_components(), [this, dir, generation] (auto p) {
+            if (p.first == component_type::TOC) {
                return make_ready_future<>();
            }
-            auto dst = sstable::filename(dir, _schema->ks_name(), _schema->cf_name(), _version, generation, _format, comp);
-            return sstable_write_io_check(::link_file, this->filename(comp), dst);
+            auto src = sstable::filename(_dir, _schema->ks_name(), _schema->cf_name(), _version, _generation, _format, p.second);
+            auto dst = sstable::filename(dir, _schema->ks_name(), _schema->cf_name(), _version, generation, _format, p.second);
+            return sstable_write_io_check(::link_file, std::move(src), std::move(dst));
        });
    }).then([dir] {
        return sstable_write_io_check(sync_directory, dir);
@@ -1989,11 +2012,11 @@ future<> sstable::set_generation(int64_t new_generation) {
        return remove_file(filename(component_type::TOC)).then([this] {
            return sstable_write_io_check(sync_directory, _dir);
        }).then([this] {
-            return parallel_for_each(_components, [this] (auto comp) {
-                if (comp == component_type::TOC) {
+            return parallel_for_each(all_components(), [this] (auto p) {
+                if (p.first == component_type::TOC) {
                    return make_ready_future<>();
                }
-                return remove_file(this->filename(comp));
+                return remove_file(sstable::filename(_dir, _schema->ks_name(), _schema->cf_name(), _version, _generation, _format, p.second));
            });
        });
    }).then([this, new_generation] {
@@ -2047,7 +2070,11 @@ sstable::format_types sstable::format_from_sstring(sstring &s) {
 }

 sstable::component_type sstable::component_from_sstring(sstring &s) {
-    return reverse_map(s, _component_map);
+    try { // map a component name back to its enum value
+        return reverse_map(s, _component_map);
+    } catch (std::out_of_range&) { // name not present in _component_map (assumes reverse_map reports that with std::out_of_range)
+        return component_type::Unknown; // report Unknown instead of propagating the exception
+    }
 }

 input_stream<char> sstable::data_stream(uint64_t pos, size_t len, const io_priority_class& pc, lw_shared_ptr<file_input_stream_history> history) {
@@ -2240,8 +2267,11 @@ remove_by_toc_name(sstring sstable_toc_name) {
            dir = dirname(sstable_toc_name);
            sstable_write_io_check(rename_file, sstable_toc_name, new_toc_name).get();
            sstable_write_io_check(fsync_directory, dir).get();
-        } else {
+        } else if (sstable_write_io_check(file_exists, new_toc_name).get0()) {
            dir = dirname(new_toc_name);
+        } else {
+            sstlog.warn("Unable to delete {} because it doesn't exist.", sstable_toc_name);
+            return;
        }

        auto toc_file = open_checked_file_dma(sstable_read_error, new_toc_name, open_flags::ro).get0();
@@ -2427,107 +2457,21 @@ operator<<(std::ostream& os, const sstable_to_delete& std) {
    return os << std.name << "(" << (std.shared ? "shared" : "unshared") << ")";
 }

-using shards_agreeing_to_delete_sstable_type = std::unordered_set<shard_id>;
-using sstables_to_delete_atomically_type = std::set<sstring>;
-struct pending_deletion {
-    sstables_to_delete_atomically_type names;
-    std::vector<lw_shared_ptr<promise<>>> completions;
-};
-
-static thread_local bool g_atomic_deletions_cancelled = false;
-static thread_local std::list<lw_shared_ptr<pending_deletion>> g_atomic_deletion_sets;
-static thread_local std::unordered_map<sstring, shards_agreeing_to_delete_sstable_type> g_shards_agreeing_to_delete_sstable;
-
-static logging::logger deletion_logger("sstable-deletion");
-
-static
 future<>
-do_delete_atomically(std::vector<sstable_to_delete> atomic_deletion_set, unsigned deleting_shard) {
-    // runs on shard 0 only
-    deletion_logger.debug("shard {} atomically deleting {}", deleting_shard, atomic_deletion_set);
-
-    if (g_atomic_deletions_cancelled) {
-        deletion_logger.debug("atomic deletions disabled, erroring out");
-        using boost::adaptors::transformed;
-        throw atomic_deletion_cancelled(atomic_deletion_set
-                                        | transformed(std::mem_fn(&sstable_to_delete::name)));
-    }
-
-    // Insert atomic_deletion_set into the list of sets pending deletion.  If the new set
-    // overlaps with an existing set, merge them (the merged set will be deleted atomically).
-    std::list<lw_shared_ptr<pending_deletion>> new_atomic_deletion_sets;
-    auto merged_set = make_lw_shared(pending_deletion());
-    for (auto&& sst_to_delete : atomic_deletion_set) {
-        merged_set->names.insert(sst_to_delete.name);
-        if (!sst_to_delete.shared) {
-            for (auto shard : boost::irange<shard_id>(0, smp::count)) {
-                g_shards_agreeing_to_delete_sstable[sst_to_delete.name].insert(shard);
-            }
-        }
-    }
-    merged_set->completions.push_back(make_lw_shared<promise<>>());
-    auto ret = merged_set->completions.back()->get_future();
-    for (auto&& old_set : g_atomic_deletion_sets) {
-         auto intersection = sstables_to_delete_atomically_type();
-         boost::set_intersection(merged_set->names, old_set->names, std::inserter(intersection, intersection.end()));
-         if (intersection.empty()) {
-             // We copy old_set to avoid corrupting g_atomic_deletion_sets if we fail
-             // further on.
-             new_atomic_deletion_sets.push_back(old_set);
-         } else {
-             deletion_logger.debug("merging with {}", old_set->names);
-             merged_set->names.insert(old_set->names.begin(), old_set->names.end());
-             boost::push_back(merged_set->completions, old_set->completions);
-         }
-    }
-    deletion_logger.debug("new atomic set: {}", merged_set->names);
-    new_atomic_deletion_sets.push_back(merged_set);
-    // can now exception-safely commit:
-    g_atomic_deletion_sets = std::move(new_atomic_deletion_sets);
-
-    // Mark each sstable as being deleted from deleting_shard.  We have to do
-    // this in a separate pass, so the consideration whether we can delete or not
-    // sees all the data from this pass.
-    for (auto&& sst : atomic_deletion_set) {
-        g_shards_agreeing_to_delete_sstable[sst.name].insert(deleting_shard);
-    }
-
-    // Figure out if the (possibly merged) set can be deleted
-    for (auto&& sst : merged_set->names) {
-        if (g_shards_agreeing_to_delete_sstable[sst].size() != smp::count) {
-            // Not everyone agrees, leave the set pending
-            deletion_logger.debug("deferring deletion until all shards agree");
-            return ret;
-        }
-    }
-
-    // Cannot recover from a failed deletion
-    g_atomic_deletion_sets.pop_back();
-    for (auto&& name : merged_set->names) {
-        g_shards_agreeing_to_delete_sstable.erase(name);
-    }
-
-    // Everyone agrees, let's delete
+delete_sstables(std::vector<sstring> tocs) { // deletion callback handed to atomic_deletion_manager; tocs are TOC file names
     // FIXME: this needs to be done atomically (using a log file of sstables we intend to delete)
-    parallel_for_each(merged_set->names, [] (sstring name) {
-        deletion_logger.debug("deleting {}", name);
+    return parallel_for_each(tocs, [] (sstring name) { // one remove_by_toc_name() per entry; the combined future is returned to the caller
         return remove_by_toc_name(name);
-    }).then_wrapped([merged_set] (future<> result) {
-        deletion_logger.debug("atomic deletion completed: {}", merged_set->names);
-        shared_future<> sf(std::move(result));
-        for (auto&& comp : merged_set->completions) {
-            sf.get_future().forward_to(std::move(*comp));
-        }
     });
-
-    return ret;
 }

+static thread_local atomic_deletion_manager g_atomic_deletion_manager(smp::count, delete_sstables);
+
 future<>
 delete_atomically(std::vector<sstable_to_delete> ssts) {
     auto shard = engine().cpu_id();
     return smp::submit_to(0, [=] {
-        return do_delete_atomically(ssts, shard);
+        return g_atomic_deletion_manager.delete_atomically(ssts, shard); // runs on shard 0, so this is shard 0's thread_local manager; shard is the requesting cpu
     });
 }

@@ -2540,16 +2484,8 @@ delete_atomically(std::vector<shared_sstable> ssts) {
    return delete_atomically(std::move(sstables_to_delete_atomically));
 }

-void
-cancel_atomic_deletions() {
-    g_atomic_deletions_cancelled = true;
-    for (auto&& pd : g_atomic_deletion_sets) {
-        for (auto&& c : pd->completions) {
-            c->set_exception(atomic_deletion_cancelled(pd->names));
-        }
-    }
-    g_atomic_deletion_sets.clear();
-    g_shards_agreeing_to_delete_sstable.clear();
+void cancel_atomic_deletions() { // forwards to the calling thread's manager instance
+    g_atomic_deletion_manager.cancel_atomic_deletions(); // NOTE(review): the manager is thread_local and delete_atomically() uses shard 0's copy; presumably this must be called on shard 0 - confirm callers
 }

 atomic_deletion_cancelled::atomic_deletion_cancelled(std::vector<sstring> names)
--- a/sstables/sstables.hh
+++ b/sstables/sstables.hh
@@ -48,6 +48,7 @@
 #include "mutation_reader.hh"
 #include "query-request.hh"
 #include "compound_compat.hh"
+#include "atomic_deletion.hh"

 namespace sstables {

@@ -130,6 +131,7 @@ public:
        Statistics,
        TemporaryTOC,
        TemporaryStatistics,
+        Unknown,
    };
    enum class version_types { ka, la };
    enum class format_types { big };
@@ -221,6 +223,8 @@ public:
    static format_types format_from_sstring(sstring& s);
    static const sstring filename(sstring dir, sstring ks, sstring cf, version_types version, int64_t generation,
                                  format_types format, component_type component);
+    static const sstring filename(sstring dir, sstring ks, sstring cf, version_types version, int64_t generation,
+                                  format_types format, sstring component);
    // WARNING: it should only be called to remove components of a sstable with
    // a temporary TOC file.
    static future<> remove_sstable_with_temp_toc(sstring ks, sstring cf, sstring dir, int64_t generation,
@@ -358,6 +362,8 @@ public:
        return _collector;
    }

+    std::vector<std::pair<component_type, sstring>> all_components() const;
+
    future<> create_links(sstring dir, int64_t generation) const;

    future<> create_links(sstring dir) const {
@@ -394,6 +400,7 @@ private:
    static std::unordered_map<component_type, sstring, enum_hash<component_type>> _component_map;

    std::unordered_set<component_type, enum_hash<component_type>> _components;
+    std::vector<sstring> _unrecognized_components;

    bool _shared = true;  // across shards; safe default
    compression _compression;
@@ -688,14 +695,6 @@ future<> await_background_jobs();
 // Invokes await_background_jobs() on all shards
 future<> await_background_jobs_on_all_shards();

-struct sstable_to_delete {
-    sstable_to_delete(sstring name, bool shared) : name(std::move(name)), shared(shared) {}
-    sstring name;
-    bool shared = false;
-    friend std::ostream& operator<<(std::ostream& os, const sstable_to_delete& std);
-};
-
-
 // When we compact sstables, we have to atomically instantiate the new
 // sstable and delete the old ones.  Otherwise, if we compact A+B into C,
 // and if A contained some data that was tombstoned by B, and if B was
@@ -714,17 +713,6 @@ struct sstable_to_delete {
 future<> delete_atomically(std::vector<shared_sstable> ssts);
 future<> delete_atomically(std::vector<sstable_to_delete> ssts);

-class atomic_deletion_cancelled : public std::exception {
-    std::string _msg;
-public:
-    explicit atomic_deletion_cancelled(std::vector<sstring> names);
-    template <typename StringRange>
-    explicit atomic_deletion_cancelled(StringRange range)
-            : atomic_deletion_cancelled(std::vector<sstring>{range.begin(), range.end()}) {
-    }
-    const char* what() const noexcept override;
-};
-
 // Cancel any deletions scheduled by delete_atomically() and make their
 // futures complete (with an atomic_deletion_cancelled exception).
 void cancel_atomic_deletions();
--- a/test.py
+++ b/test.py
@@ -39,6 +39,7 @@ boost_tests = [
    'storage_proxy_test',
    'schema_change_test',
    'sstable_mutation_test',
+    'sstable_atomic_deletion_test',
    'commitlog_test',
    'hash_test',
    'test-serialization',
--- a/tests/commitlog_test.cc
+++ b/tests/commitlog_test.cc
@@ -53,8 +53,10 @@ static future<> cl_test(commitlog::config cfg, Func && f) {
    cfg.commit_log_location = tmp.path;
    return commitlog::create_commitlog(cfg).then([f = std::forward<Func>(f)](commitlog log) mutable {
        return do_with(std::move(log), [f = std::forward<Func>(f)](commitlog& log) {
-            return futurize<std::result_of_t<Func(commitlog&)>>::apply(f, log).finally([&log] {
-                return log.clear();
+            return futurize_apply(f, log).finally([&log] {
+                return log.shutdown().then([&log] {
+                    return log.clear();
+                });
            });
        });
    }).finally([tmp = std::move(tmp)] {
@@ -277,8 +279,24 @@ SEASTAR_TEST_CASE(test_commitlog_delete_when_over_disk_limit) {
 }

 SEASTAR_TEST_CASE(test_commitlog_reader){
+    static auto count_mutations_in_segment = [] (sstring path) -> future<size_t> { // replays the segment file at path and resolves to the number of entries read
+        auto count = make_lw_shared<size_t>(0); // shared so both the per-entry callback and the final continuation see it
+        return db::commitlog::read_log_file(path, [count](temporary_buffer<char> buf, db::replay_position rp) {
+            sstring str(buf.get(), buf.size());
+            BOOST_CHECK_EQUAL(str, "hej bubba cow"); // every entry is expected to carry this payload
+            (*count)++;
+            return make_ready_future<>();
+        }).then([](auto s) {
+            return do_with(std::move(s), [](auto& s) { // keep the returned reader object alive until done() resolves
+                return s->done();
+            });
+        }).then([count] {
+            return *count;
+        });
+    };
    commitlog::config cfg;
    cfg.commitlog_segment_size_in_mb = 1;
+    logging::logger_registry().set_logger_level("commitlog", seastar::log_level::trace);
    return cl_test(cfg, [](commitlog& log) {
            auto set = make_lw_shared<std::set<segment_id_type>>();
            auto count = make_lw_shared<size_t>(0);
@@ -309,18 +327,19 @@ SEASTAR_TEST_CASE(test_commitlog_reader){
                        if (i == segments.end()) {
                            throw std::runtime_error("Did not find expected log file");
                        }
-                        return db::commitlog::read_log_file(*i, [count2](temporary_buffer<char> buf, db::replay_position rp) {
-                                    sstring str(buf.get(), buf.size());
-                                    BOOST_CHECK_EQUAL(str, "hej bubba cow");
-                                    (*count2)++;
-                                    return make_ready_future<>();
-                                }).then([](auto s) {
-                                    return do_with(std::move(s), [](auto& s) {
-                                        return s->done();
-                                    });
+                        return *i;
+                    }).then([&log, count] (sstring segment_path) {
+                        // Check reading from an unsynced segment
+                        return count_mutations_in_segment(segment_path).then([count] (size_t replay_count) {
+                            BOOST_CHECK_GE(*count, replay_count);
+                        }).then([&log, count, segment_path] {
+                            return log.sync_all_segments().then([count, segment_path] {
+                                // Check reading from a synced segment
+                                return count_mutations_in_segment(segment_path).then([count] (size_t replay_count) {
+                                    BOOST_CHECK_EQUAL(*count, replay_count);
                                });
-                    }).then([count, count2] {
-                        BOOST_CHECK_EQUAL(*count, *count2);
+                            });
+                        });
                    });
        });
 }
--- a/tests/memtable_test.cc
+++ b/tests/memtable_test.cc
@@ -141,7 +141,7 @@ SEASTAR_TEST_CASE(test_virtual_dirty_accounting_on_flush) {
                .with_column("col", bytes_type, column_kind::regular_column)
                .build();

-        memtable_dirty_memory_manager mgr;
+        dirty_memory_manager mgr;

        auto mt = make_lw_shared<memtable>(s, &mgr.region_group());

@@ -279,7 +279,7 @@ SEASTAR_TEST_CASE(test_segment_migration_during_flush) {
                .with_column("col", bytes_type, column_kind::regular_column)
                .build();

-        memtable_dirty_memory_manager mgr;
+        dirty_memory_manager mgr;

        auto mt = make_lw_shared<memtable>(s, &mgr.region_group());

--- a/tests/sstable_atomic_deletion_test.cc
+++ b/tests/sstable_atomic_deletion_test.cc
@@ -0,0 +1,170 @@
+/*
+ * Copyright (C) 2015 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "sstables/atomic_deletion.hh"
+#include <seastar/tests/test-utils.hh>
+#include <deque>
+#include <boost/range/numeric.hpp>
+#include <boost/range/adaptor/transformed.hpp>
+
+using namespace sstables;
+
+
+class atomic_deletion_test_env { // scripted harness: runs events against an atomic_deletion_manager whose delete callback only records what it was given
+public:
+    using event = std::function<future<> (atomic_deletion_test_env& adm)>; // one scripted step, invoked with this environment
+private:
+    struct a_hash {
+        size_t operator()(const std::unordered_set<sstring>& s) const {
+            auto h = std::hash<sstring>();
+            return boost::accumulate(s | boost::adaptors::transformed(h), size_t(0)); // sum of element hashes, so the result does not depend on iteration order
+        }
+    };
+    atomic_deletion_manager _adm;
+    std::deque<event> _events; // remaining steps, consumed front-first by test()
+    std::unordered_set<std::unordered_set<sstring>, a_hash> _deletes; // name sets received by the delete callback and not yet matched by expect_deletion()
+    semaphore _deletion_counter { 0 }; // signalled once per delete callback invocation
+private:
+    future<> delete_sstables(std::vector<sstring> names) { // stand-in for real deletion: record the set, signal, succeed immediately
+        auto&& s1 = boost::copy_range<std::unordered_set<sstring>>(names);
+        _deletes.insert(s1);
+        _deletion_counter.signal();
+        return make_ready_future<>();
+    }
+public:
+    explicit atomic_deletion_test_env(unsigned shard_count, std::vector<event> events)
+            : _adm(shard_count, [this] (std::vector<sstring> names) { // captures this: the manager's callback must not outlive the environment
+                    return delete_sstables(names);
+               })
+            , _events(events.begin(), events.end()) {
+    }
+    void expect_no_deletion() { // requires that no recorded deletion is waiting to be matched
+        BOOST_REQUIRE(_deletes.empty());
+    }
+    future<> schedule_delete(std::vector<sstable_to_delete> names, unsigned shard) { // submits a deletion request on behalf of shard
+        _adm.delete_atomically(names, shard).discard_result(); // the request's future is dropped, not awaited; tests observe completion via expect_deletion()
+        return make_ready_future<>();
+    }
+    future<> expect_deletion(std::vector<sstring> names) { // waits for one delete callback, then requires that exactly this name set was recorded
+        return _deletion_counter.wait().then([this, names] {
+            auto&& s1 = boost::copy_range<std::unordered_set<sstring>>(names);
+            auto erased = _deletes.erase(s1);
+            BOOST_REQUIRE_EQUAL(erased, 1);
+        });
+    }
+    future<> test() {
+        // run all _events sequentially: each step starts only after the previous step's future resolves
+        return repeat([this] {
+            if (_events.empty()) {
+                return make_ready_future<stop_iteration>(stop_iteration::yes);
+            }
+            auto ev = std::move(_events.front());
+            _events.pop_front();
+            return ev(*this).then([] {
+                return stop_iteration::no;
+            });
+        });
+    }
+};
+
+future<> test_atomic_deletion_manager(unsigned shards, std::vector<atomic_deletion_test_env::event> events) { // builds an environment for `shards` shards and runs the script
+    auto env = make_lw_shared<atomic_deletion_test_env>(shards, events);
+    return env->test().finally([env] {}); // the empty continuation holds a reference so env stays alive until the script finishes
+}
+
+atomic_deletion_test_env::event
+delete_many(std::vector<sstable_to_delete> v, unsigned shard) { // event: `shard` requests deletion of all of v in one call
+    return [v, shard] (atomic_deletion_test_env& env) {
+        // verify we didn't have an early delete from previous deletion
+        env.expect_no_deletion();
+        return env.schedule_delete(v, shard);
+    };
+}
+
+atomic_deletion_test_env::event
+delete_one(sstable_to_delete s, unsigned shard) { // event: single-sstable form of delete_many()
+    return delete_many({s}, shard);
+}
+
+atomic_deletion_test_env::event
+expect_many(std::vector<sstring> names) { // event: wait for a deletion and require it covered exactly `names`
+    return [names] (atomic_deletion_test_env& env) {
+        return env.expect_deletion(names);
+    };
+}
+
+atomic_deletion_test_env::event
+expect_one(sstring name) { // event: single-name form of expect_many()
+    return expect_many({name});
+}
+
+SEASTAR_TEST_CASE(test_single_shard_single_sstable) { // one shard: each request is expected to be deleted right away, whether flagged shared or not
+    return test_atomic_deletion_manager(1, {
+            delete_one({"1", false}, 0),
+            expect_one("1"),
+            delete_one({"2", true}, 0),
+            expect_one("2"),
+    });
+}
+
+SEASTAR_TEST_CASE(test_multi_shard_single_sstable) { // three shards: a shared sstable is deleted only once all three asked; an unshared one after a single request
+    return test_atomic_deletion_manager(3, {
+            delete_one({"1", true}, 0),
+            delete_one({"1", true}, 1), // delete_* events first require that nothing has been deleted yet
+            delete_one({"1", true}, 2),
+            expect_one("1"),
+            delete_one({"2", false}, 1),
+            expect_one("2"),
+    });
+}
+
+SEASTAR_TEST_CASE(test_nonshared_compaction) { // five shards: one request with three unshared sstables is expected as a single deletion of all three
+    return test_atomic_deletion_manager(5, {
+            delete_many({{"1", false}, {"2", false}, {"3", false}}, 2),
+            expect_many({"1", "2", "3"}),
+    });
+}
+
+SEASTAR_TEST_CASE(test_shared_compaction) { // a set containing shared "1" must wait for every shard's request for "1", then go as one deletion
+    return test_atomic_deletion_manager(3, {
+            delete_one({"1", true}, 0),
+            delete_many({{"1", true}, {"2", false}, {"3", false}}, 2),
+            delete_one({"1", true}, 1),
+            expect_many({"1", "2", "3"}),
+    });
+}
+
+SEASTAR_TEST_CASE(test_overlapping_compaction) { // requests sharing "1" or "3" are expected to end up as one combined deletion of all four names
+    return test_atomic_deletion_manager(3, {
+            delete_one({"1", true}, 0),
+            delete_one({"3", true}, 0),
+            delete_many({{"1", true}, {"2", false}, {"3", true}}, 2),
+            delete_one({"1", true}, 1),
+            delete_many({{"3", true}, {"4", false}}, 1),
+            expect_many({"1", "2", "3", "4"}),
+    });
+}
+
+
+#include "disk-error-handler.hh"
+
+thread_local disk_error_signal_type commit_error;
+thread_local disk_error_signal_type general_disk_error;
--- a/tests/sstable_datafile_test.cc
+++ b/tests/sstable_datafile_test.cc
@@ -3031,3 +3031,22 @@ SEASTAR_TEST_CASE(test_partition_skipping) {
            .produces_end_of_stream();
    });
 }
+
+SEASTAR_TEST_CASE(test_unknown_component) { // a component listed in the TOC but not recognized must follow the sstable through link, rename and delete
+    return seastar::async([] {
+        auto tmp = make_lw_shared<tmpdir>();
+        auto sstp = reusable_sst(uncompressed_schema(), "tests/sstables/unknown_component", 1).get0();
+        sstp->create_links(tmp->path).get();
+        // check that create_links() linked the unknown component into the new dir
+        BOOST_REQUIRE(file_exists(tmp->path + "/la-1-big-UNKNOWN.txt").get0());
+
+        sstp = reusable_sst(uncompressed_schema(), tmp->path, 1).get0();
+        sstp->set_generation(2).get();
+        BOOST_REQUIRE(!file_exists(tmp->path +  "/la-1-big-UNKNOWN.txt").get0());
+        BOOST_REQUIRE(file_exists(tmp->path + "/la-2-big-UNKNOWN.txt").get0());
+
+        sstables::delete_atomically({sstp}).get();
+        // ensure the unknown component is deleted along with the rest of the sstable
+        BOOST_REQUIRE(!file_exists(tmp->path + "/la-2-big-UNKNOWN.txt").get0());
+    });
+}
--- a/tests/sstables/unknown_component/la-1-big-CRC.db
+++ b/tests/sstables/unknown_component/la-1-big-CRC.db
--- a/tests/sstables/unknown_component/la-1-big-Data.db
+++ b/tests/sstables/unknown_component/la-1-big-Data.db
--- a/tests/sstables/unknown_component/la-1-big-Digest.sha1
+++ b/tests/sstables/unknown_component/la-1-big-Digest.sha1
@@ -0,0 +1 @@
+748507322
--- a/tests/sstables/unknown_component/la-1-big-Filter.db
+++ b/tests/sstables/unknown_component/la-1-big-Filter.db
--- a/tests/sstables/unknown_component/la-1-big-Index.db
+++ b/tests/sstables/unknown_component/la-1-big-Index.db
--- a/tests/sstables/unknown_component/la-1-big-Statistics.db
+++ b/tests/sstables/unknown_component/la-1-big-Statistics.db
--- a/tests/sstables/unknown_component/la-1-big-Summary.db
+++ b/tests/sstables/unknown_component/la-1-big-Summary.db
--- a/tests/sstables/unknown_component/la-1-big-TOC.txt
+++ b/tests/sstables/unknown_component/la-1-big-TOC.txt
@@ -0,0 +1,9 @@
+Data.db
+Filter.db
+CRC.db
+Statistics.db
+Summary.db
+Digest.sha1
+Index.db
+TOC.txt
+UNKNOWN.txt
--- a/tests/sstables/unknown_component/la-1-big-UNKNOWN.txt
+++ b/tests/sstables/unknown_component/la-1-big-UNKNOWN.txt
--- a/thrift/handler.cc
+++ b/thrift/handler.cc
@@ -1450,12 +1450,13 @@ private:
    class column_visitor : public Aggregator {
        const schema& _s;
        const query::partition_slice& _slice;
-        uint32_t _cell_limit;
+        const uint32_t _cell_limit;
+        uint32_t _current_cell_limit;
        std::vector<std::pair<std::string, typename Aggregator::type>> _aggregation;
        typename Aggregator::type* _current_aggregation;
    public:
        column_visitor(const schema& s, const query::partition_slice& slice, uint32_t cell_limit)
-                : _s(s), _slice(slice), _cell_limit(cell_limit)
+                : _s(s), _slice(slice), _cell_limit(cell_limit), _current_cell_limit(0)
        { }
        std::vector<std::pair<std::string, typename Aggregator::type>>&& release() {
            return std::move(_aggregation);
@@ -1468,6 +1469,7 @@ private:
        void accept_new_partition(const partition_key& key, uint32_t row_count) {
            _aggregation.emplace_back(partition_key_to_string(_s, key), typename Aggregator::type());
            _current_aggregation = &_aggregation.back().second;
+            _current_cell_limit = _cell_limit; // restart the remaining-cell budget, so the limit applies to each partition separately
        }
        void accept_new_partition(uint32_t row_count) {
            // We always ask for the partition_key to be sent in query_opts().
@@ -1476,19 +1478,19 @@ private:
        void accept_new_row(const clustering_key_prefix& key, const query::result_row_view& static_row, const query::result_row_view& row) {
            auto it = row.iterator();
            auto cell = it.next_atomic_cell();
-            if (cell && _cell_limit > 0) {
+            if (cell && _current_cell_limit > 0) { // emit only while the current partition still has budget left
                bytes column_name = composite::serialize_value(key.components(), _s.thrift().has_compound_comparator());
                Aggregator::on_column(_current_aggregation, column_name, *cell);
-                _cell_limit -= 1;
+                _current_cell_limit -= 1; // consume the per-partition budget; the configured _cell_limit stays unchanged
            }
        }
        void accept_new_row(const query::result_row_view& static_row, const query::result_row_view& row) {
            auto it = row.iterator();
            for (auto&& id : _slice.regular_columns) {
                auto cell = it.next_atomic_cell();
-                if (cell && _cell_limit > 0) {
+                if (cell && _current_cell_limit > 0) { // emit only while the current partition still has budget left
                    Aggregator::on_column(_current_aggregation, _s.regular_column_at(id).name(), *cell);
-                    _cell_limit -= 1;
+                    _current_cell_limit -= 1; // consume the per-partition budget; the configured _cell_limit stays unchanged
                }
            }
        }
--- a/transport/server.cc
+++ b/transport/server.cc
@@ -1504,7 +1504,11 @@ std::vector<char> cql_server::response::compress_lz4(const std::vector<char>& bo
    output[1] = (input_len >> 16) & 0xFF;
    output[2] = (input_len >> 8) & 0xFF;
    output[3] = input_len & 0xFF;
+#ifdef HAVE_LZ4_COMPRESS_DEFAULT
+    auto ret = LZ4_compress_default(input, output + 4, input_len, LZ4_compressBound(input_len));
+#else
    auto ret = LZ4_compress(input, output + 4, input_len);
+#endif
    if (ret == 0) {
        throw std::runtime_error("CQL frame LZ4 compression failure");
    }
--- a/utils/histogram.hh
+++ b/utils/histogram.hh
@@ -39,8 +39,8 @@ class moving_average {
 public:
    moving_average(latency_counter::duration interval, latency_counter::duration tick_interval) :
        _tick_interval(tick_interval) {
-        _alpha = 1 - std::exp(-std::chrono::duration_cast<std::chrono::nanoseconds>(interval).count()/
-                static_cast<double>(std::chrono::duration_cast<std::chrono::nanoseconds>(tick_interval).count()));
+        _alpha = 1 - std::exp(-std::chrono::duration_cast<std::chrono::seconds>(tick_interval).count()/ // alpha = 1 - exp(-tick_interval / interval)
+                static_cast<double>(std::chrono::duration_cast<std::chrono::seconds>(interval).count())); // NOTE(review): the cast keeps whole seconds only, so an interval under 1s becomes a zero divisor
    }

    void add(uint64_t val = 1) {
@@ -48,7 +48,7 @@ public:
    }

    void update() {
-        double instant_rate = _count / static_cast<double>(std::chrono::duration_cast<std::chrono::nanoseconds>(_tick_interval).count());
+        double instant_rate = _count / static_cast<double>(std::chrono::duration_cast<std::chrono::seconds>(_tick_interval).count());
        if (_initialized) {
            _rate += (_alpha * (instant_rate - _rate));
        } else {
@@ -70,7 +70,8 @@ public:
    }
 };

-class ihistogram {
+template <typename Unit>
+class basic_ihistogram {
 public:
    // count holds all the events
    int64_t count;
@@ -84,12 +85,13 @@ public:
    double variance;
    int64_t sample_mask;
    boost::circular_buffer<int64_t> sample;
-    ihistogram(size_t size = 1024, int64_t _sample_mask = 0x80)
+    basic_ihistogram(size_t size = 1024, int64_t _sample_mask = 0x80)
            : count(0), total(0), min(0), max(0), sum(0), started(0), mean(0), variance(0),
              sample_mask(_sample_mask), sample(
                    size) {
    }
-    void mark(int64_t value) {
+    void mark(int64_t ns_value) {
+        auto value = std::chrono::duration_cast<Unit>(std::chrono::nanoseconds(ns_value)).count();
        if (total == 0 || value < min) {
            min = value;
        }
@@ -131,7 +133,7 @@ public:
    /**
     * Set the latency according to the sample rate.
     */
-    ihistogram& set_latency(latency_counter& lc) {
+    basic_ihistogram& set_latency(latency_counter& lc) {
        if (should_sample()) {
            lc.start();
        }
@@ -144,7 +146,7 @@ public:
     * Increment the total number of events without
     * sampling the value.
     */
-    ihistogram& inc() {
+    basic_ihistogram& inc() {
        count++;
        return *this;
    }
@@ -157,7 +159,7 @@ public:
        return a * a;
    }

-    ihistogram& operator +=(const ihistogram& o) {
+    basic_ihistogram& operator +=(const basic_ihistogram& o) {
        if (count == 0) {
            *this = o;
        } else if (o.count > 0) {
@@ -190,14 +192,18 @@ public:
        return mean * count;
    }

-    friend ihistogram operator +(ihistogram a, const ihistogram& b);
+    template <typename U>
+    friend basic_ihistogram<U> operator +(basic_ihistogram<U> a, const basic_ihistogram<U>& b);
 };

-inline ihistogram operator +(ihistogram a, const ihistogram& b) {
+template <typename Unit>
+inline basic_ihistogram<Unit> operator +(basic_ihistogram<Unit> a, const basic_ihistogram<Unit>& b) { // a is a by-value copy: the sum is built in it and neither operand is modified
     a += b;
     return a;
 }

+using ihistogram = basic_ihistogram<std::chrono::microseconds>; // mark() takes nanoseconds and records them converted to Unit, i.e. microseconds here
+
 struct rate_moving_average {
    uint64_t count = 0;
    double rates[3] = {0};
@@ -222,7 +228,7 @@ class timed_rate_moving_average {
    static constexpr latency_counter::duration tick_interval() {
        return std::chrono::seconds(10);
    }
-    moving_average rates[3] = {{tick_interval(), std::chrono::minutes(1)}, {tick_interval(), std::chrono::minutes(5)}, {tick_interval(), std::chrono::minutes(15)}};
+    moving_average rates[3] = {{std::chrono::minutes(1), tick_interval()}, {std::chrono::minutes(5), tick_interval()}, {std::chrono::minutes(15), tick_interval()}};
    latency_counter::time_point start_time;
    timer<> _timer;

@@ -246,7 +252,7 @@ public:
    rate_moving_average rate() const {
        rate_moving_average res;
        if (_count > 0) {
-            double elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(latency_counter::now() - start_time).count();
+            double elapsed = std::chrono::duration_cast<std::chrono::seconds>(latency_counter::now() - start_time).count();
            res.mean_rate = (_count / elapsed);
        }
        res.count = _count;
--- a/utils/logalloc.hh
+++ b/utils/logalloc.hh
@@ -61,7 +61,9 @@ using eviction_fn = std::function<memory::reclaiming_result()>;
 class region_group_reclaimer {
 protected:
    size_t _threshold;
+    size_t _soft_limit;
    bool _under_pressure = false;
+    bool _under_soft_pressure = false;
    virtual void start_reclaiming() {}
    virtual void stop_reclaiming() {}
 public:
@@ -69,6 +71,24 @@ public:
        return _under_pressure;
    }

+    bool over_soft_limit() const {
+        return _under_soft_pressure;
+    }
+
+    void notify_soft_pressure() {
+        if (!_under_soft_pressure) {
+            _under_soft_pressure = true;
+            start_reclaiming();
+        }
+    }
+
+    void notify_soft_relief() {
+        if (_under_soft_pressure) {
+            _under_soft_pressure = false;
+            stop_reclaiming();
+        }
+    }
+
    void notify_pressure() {
        if (!_under_pressure) {
            _under_pressure = true;
@@ -83,12 +103,21 @@ public:
        }
    }

-    region_group_reclaimer(size_t threshold = std::numeric_limits<size_t>::max()) : _threshold(threshold) {}
+    region_group_reclaimer()
+        : _threshold(std::numeric_limits<size_t>::max()), _soft_limit(std::numeric_limits<size_t>::max()) {}
+    region_group_reclaimer(size_t threshold)
+        : _threshold(threshold), _soft_limit(threshold) {}
+    region_group_reclaimer(size_t threshold, size_t soft)
+        : _threshold(threshold), _soft_limit(soft) {}
+
    virtual ~region_group_reclaimer() {}

    size_t throttle_threshold() const {
        return _threshold;
    }
+    size_t soft_limit_threshold() const {
+        return _soft_limit;
+    }
 };

 // Groups regions for the purpose of statistics.  Can be nested.
@@ -232,6 +261,11 @@ public:
            if (rg->execution_permitted()) {
                rg->release_requests();
            }
+            if (rg->_total_memory >= rg->_reclaimer.soft_limit_threshold()) {
+                rg->_reclaimer.notify_soft_pressure();
+            } else if (rg->_total_memory < rg->_reclaimer.soft_limit_threshold()) {
+                rg->_reclaimer.notify_soft_relief();
+            }
            return stop_iteration::no;
        });
    }