database.cc: Fix compilation error with boost 1.55

Message-Id: <1461067254-526-1-git-send-email-calle@scylladb.com> (cherry picked from commit 9130b0de16)
sstables: Fix compilation error on boost 1.55
2016-05-04 08:42:21 +03:00 · 2016-05-04 08:42:15 +03:00 · 2016-05-02 14:29:15 +03:00 · 2016-04-27 15:07:38 +03:00 · 2016-04-26 10:37:40 +03:00 · 2016-04-25 14:12:33 +03:00
106 changed files with 3274 additions and 803 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 #!/bin/sh

-VERSION=666.development
+VERSION=1.0.3

 if test -f version
 then
--- a/atomic_cell.hh
+++ b/atomic_cell.hh
@@ -54,9 +54,9 @@ class atomic_cell_or_collection;
 */
 class atomic_cell_type final {
 private:
-    static constexpr int8_t DEAD_FLAGS = 0;
    static constexpr int8_t LIVE_FLAG = 0x01;
    static constexpr int8_t EXPIRY_FLAG = 0x02; // When present, expiry field is present. Set only for live cells
+    static constexpr int8_t REVERT_FLAG = 0x04; // transient flag used to efficiently implement ReversiblyMergeable for atomic cells.
    static constexpr unsigned flags_size = 1;
    static constexpr unsigned timestamp_offset = flags_size;
    static constexpr unsigned timestamp_size = 8;
@@ -67,14 +67,21 @@ private:
    static constexpr unsigned ttl_offset = expiry_offset + expiry_size;
    static constexpr unsigned ttl_size = 4;
 private:
+    static bool is_revert_set(bytes_view cell) {
+        return cell[0] & REVERT_FLAG;
+    }
+    template<typename BytesContainer>
+    static void set_revert(BytesContainer& cell, bool revert) {
+        cell[0] = (cell[0] & ~REVERT_FLAG) | (revert * REVERT_FLAG);
+    }
    static bool is_live(const bytes_view& cell) {
-        return cell[0] != DEAD_FLAGS;
+        return cell[0] & LIVE_FLAG;
    }
    static bool is_live_and_has_ttl(const bytes_view& cell) {
        return cell[0] & EXPIRY_FLAG;
    }
    static bool is_dead(const bytes_view& cell) {
-        return cell[0] == DEAD_FLAGS;
+        return !is_live(cell);
    }
    // Can be called on live and dead cells
    static api::timestamp_type timestamp(const bytes_view& cell) {
@@ -106,7 +113,7 @@ private:
    }
    static managed_bytes make_dead(api::timestamp_type timestamp, gc_clock::time_point deletion_time) {
        managed_bytes b(managed_bytes::initialized_later(), flags_size + timestamp_size + deletion_time_size);
-        b[0] = DEAD_FLAGS;
+        b[0] = 0;
        set_field(b, timestamp_offset, timestamp);
        set_field(b, deletion_time_offset, deletion_time.time_since_epoch().count());
        return b;
@@ -140,8 +147,11 @@ protected:
    ByteContainer _data;
 protected:
    atomic_cell_base(ByteContainer&& data) : _data(std::forward<ByteContainer>(data)) { }
-    atomic_cell_base(const ByteContainer& data) : _data(data) { }
+    friend class atomic_cell_or_collection;
 public:
+    bool is_revert_set() const {
+        return atomic_cell_type::is_revert_set(_data);
+    }
    bool is_live() const {
        return atomic_cell_type::is_live(_data);
    }
@@ -187,10 +197,13 @@ public:
    bytes_view serialize() const {
        return _data;
    }
+    void set_revert(bool revert) {
+        atomic_cell_type::set_revert(_data, revert);
+    }
 };

 class atomic_cell_view final : public atomic_cell_base<bytes_view> {
-    atomic_cell_view(bytes_view data) : atomic_cell_base(data) {}
+    atomic_cell_view(bytes_view data) : atomic_cell_base(std::move(data)) {}
 public:
    static atomic_cell_view from_bytes(bytes_view data) { return atomic_cell_view(data); }

@@ -198,6 +211,11 @@ public:
    friend std::ostream& operator<<(std::ostream& os, const atomic_cell_view& acv);
 };

+class atomic_cell_ref final : public atomic_cell_base<managed_bytes&> {
+public:
+    atomic_cell_ref(managed_bytes& buf) : atomic_cell_base(buf) {}
+};
+
 class atomic_cell final : public atomic_cell_base<managed_bytes> {
    atomic_cell(managed_bytes b) : atomic_cell_base(std::move(b)) {}
 public:
--- a/atomic_cell_hash.hh
+++ b/atomic_cell_hash.hh
@@ -57,3 +57,19 @@ struct appending_hash<atomic_cell_view> {
        }
    }
 };
+
+template<>
+struct appending_hash<atomic_cell> {
+    template<typename Hasher>
+    void operator()(Hasher& h, const atomic_cell& cell) const {
+        feed_hash(h, static_cast<atomic_cell_view>(cell));
+    }
+};
+
+template<>
+struct appending_hash<collection_mutation> {
+    template<typename Hasher>
+    void operator()(Hasher& h, const collection_mutation& cm) const {
+        feed_hash(h, static_cast<collection_mutation_view>(cm));
+    }
+};
--- a/atomic_cell_or_collection.hh
+++ b/atomic_cell_or_collection.hh
@@ -27,6 +27,8 @@

 // A variant type that can hold either an atomic_cell, or a serialized collection.
 // Which type is stored is determined by the schema.
+// Has an "empty" state.
+// Objects moved-from are left in an empty state.
 class atomic_cell_or_collection final {
    managed_bytes _data;
 private:
@@ -36,6 +38,7 @@ public:
    atomic_cell_or_collection(atomic_cell ac) : _data(std::move(ac._data)) {}
    static atomic_cell_or_collection from_atomic_cell(atomic_cell data) { return { std::move(data._data) }; }
    atomic_cell_view as_atomic_cell() const { return atomic_cell_view::from_bytes(_data); }
+    atomic_cell_ref as_atomic_cell_ref() { return { _data }; }
    atomic_cell_or_collection(collection_mutation cm) : _data(std::move(cm.data)) {}
    explicit operator bool() const {
        return !_data.empty();
--- a/configure.py
+++ b/configure.py
@@ -845,8 +845,8 @@ with open(buildfile, 'w') as f:
        for obj in compiles:
            src = compiles[obj]
            gen_headers = list(ragels.keys())
-            gen_headers += ['seastar/build/{}/http/request_parser.hh'.format(mode)]
-            gen_headers += ['seastar/build/{}/http/http_response_parser.hh'.format(mode)]
+            gen_headers += ['seastar/build/{}/gen/http/request_parser.hh'.format(mode)]
+            gen_headers += ['seastar/build/{}/gen/http/http_response_parser.hh'.format(mode)]
            for th in thrifts:
                gen_headers += th.headers('$builddir/{}/gen'.format(mode))
            for g in antlr3_grammars:
@@ -878,10 +878,10 @@ with open(buildfile, 'w') as f:
            for cc in grammar.sources('$builddir/{}/gen'.format(mode)):
                obj = cc.replace('.cpp', '.o')
                f.write('build {}: cxx.{} {} || {}\n'.format(obj, mode, cc, ' '.join(serializers)))
-        f.write('build seastar/build/{mode}/libseastar.a seastar/build/{mode}/apps/iotune/iotune: ninja {seastar_deps}\n'
+        f.write('build seastar/build/{mode}/libseastar.a seastar/build/{mode}/apps/iotune/iotune seastar/build/{mode}/gen/http/request_parser.hh seastar/build/{mode}/gen/http/http_response_parser.hh: ninja {seastar_deps}\n'
                .format(**locals()))
        f.write('  subdir = seastar\n')
-        f.write('  target = build/{mode}/libseastar.a build/{mode}/apps/iotune/iotune\n'.format(**locals()))
+        f.write('  target = build/{mode}/libseastar.a build/{mode}/apps/iotune/iotune build/{mode}/gen/http/request_parser.hh build/{mode}/gen/http/http_response_parser.hh\n'.format(**locals()))
        f.write(textwrap.dedent('''\
            build build/{mode}/iotune: copy seastar/build/{mode}/apps/iotune/iotune
            ''').format(**locals()))
@@ -895,14 +895,6 @@ with open(buildfile, 'w') as f:
            command = find -name '*.[chS]' -o -name "*.cc" -o -name "*.hh" | cscope -bq -i-
            description = CSCOPE
        build cscope: cscope
-        rule request_parser_hh
-           command = {ninja} -C seastar build/release/gen/http/request_parser.hh build/debug/gen/http/request_parser.hh
-           description = GEN seastar/http/request_parser.hh
-        build seastar/build/release/http/request_parser.hh seastar/build/debug/http/request_parser.hh: request_parser_hh
-        rule http_response_parser_hh
-           command = {ninja} -C seastar build/release/gen/http/http_response_parser.hh build/debug/gen/http/http_response_parser.hh
-           description = GEN seastar/http/http_response_parser.hh
-        build seastar/build/release/http/http_response_parser.hh seastar/build/debug/http/http_response_parser.hh: http_response_parser_hh
        rule clean
            command = rm -rf build
            description = CLEAN
--- a/cql3/selection/selectable.cc
+++ b/cql3/selection/selectable.cc
@@ -52,6 +52,11 @@ selectable::writetime_or_ttl::new_selector_factory(database& db, schema_ptr s, s
    return writetime_or_ttl_selector::new_factory(def->name_as_text(), add_and_get_index(*def, defs), _is_writetime);
 }

+sstring
+selectable::writetime_or_ttl::to_string() const {
+    return sprint("%s(%s)", _is_writetime ? "writetime" : "ttl", _id->to_string());
+}
+
 shared_ptr<selectable>
 selectable::writetime_or_ttl::raw::prepare(schema_ptr s) {
    return make_shared<writetime_or_ttl>(_id->prepare_column_identifier(s), _is_writetime);
@@ -78,6 +83,11 @@ selectable::with_function::new_selector_factory(database& db, schema_ptr s, std:
    return abstract_function_selector::new_factory(std::move(fun), std::move(factories));
 }

+sstring
+selectable::with_function::to_string() const {
+    return sprint("%s(%s)", _function_name.name, join(", ", _args));
+}
+
 shared_ptr<selectable>
 selectable::with_function::raw::prepare(schema_ptr s) {
        std::vector<shared_ptr<selectable>> prepared_args;
@@ -101,7 +111,7 @@ selectable::with_field_selection::new_selector_factory(database& db, schema_ptr
    if (!ut) {
        throw exceptions::invalid_request_exception(
                sprint("Invalid field selection: %s of type %s is not a user type",
-                       "FIXME: selectable" /* FIMXME: _selected */, ut->as_cql3_type()));
+                       _selected->to_string(), factory->new_instance()->get_type()->as_cql3_type()));
    }
    for (size_t i = 0; i < ut->size(); ++i) {
        if (ut->field_name(i) != _field->bytes_) {
@@ -110,7 +120,12 @@ selectable::with_field_selection::new_selector_factory(database& db, schema_ptr
        return field_selector::new_factory(std::move(ut), i, std::move(factory));
    }
    throw exceptions::invalid_request_exception(sprint("%s of type %s has no field %s",
-                                                       "FIXME: selectable" /* FIXME: _selected */, ut->as_cql3_type(), _field));
+                                                       _selected->to_string(), ut->as_cql3_type(), _field));
+}
+
+sstring
+selectable::with_field_selection::to_string() const {
+    return sprint("%s.%s", _selected->to_string(), _field->to_string());
 }

 shared_ptr<selectable>
@@ -126,6 +141,10 @@ selectable::with_field_selection::raw::processes_selection() const {
    return true;
 }

+std::ostream & operator<<(std::ostream &os, const selectable& s) {
+    return os << s.to_string();
+}
+
 }

 }
--- a/cql3/selection/selectable.hh
+++ b/cql3/selection/selectable.hh
@@ -55,6 +55,7 @@ class selectable {
 public:
    virtual ~selectable() {}
    virtual ::shared_ptr<selector::factory> new_selector_factory(database& db, schema_ptr schema, std::vector<const column_definition*>& defs) = 0;
+    virtual sstring to_string() const = 0;
 protected:
    static size_t add_and_get_index(const column_definition& def, std::vector<const column_definition*>& defs) {
        auto i = std::find(defs.begin(), defs.end(), &def);
@@ -84,6 +85,8 @@ public:
    class with_field_selection;
 };

+std::ostream & operator<<(std::ostream &os, const selectable& s);
+
 class selectable::with_function : public selectable {
    functions::function_name _function_name;
    std::vector<shared_ptr<selectable>> _args;
@@ -92,17 +95,7 @@ public:
        : _function_name(std::move(fname)), _args(std::move(args)) {
    }

-#if 0
-    @Override
-    public String toString()
-    {
-        return new StrBuilder().append(functionName)
-                               .append("(")
-                               .appendWithSeparators(args, ", ")
-                               .append(")")
-                               .toString();
-    }
-#endif
+    virtual sstring to_string() const override;

    virtual shared_ptr<selector::factory> new_selector_factory(database& db, schema_ptr s, std::vector<const column_definition*>& defs) override;
    class raw : public selectable::raw {
--- a/cql3/selection/selectable_with_field_selection.hh
+++ b/cql3/selection/selectable_with_field_selection.hh
@@ -59,13 +59,7 @@ public:
            : _selected(std::move(selected)), _field(std::move(field)) {
    }

-#if 0
-    @Override
-    public String toString()
-    {
-        return String.format("%s.%s", selected, field);
-    }
-#endif
+    virtual sstring to_string() const override;

    virtual shared_ptr<selector::factory> new_selector_factory(database& db, schema_ptr s, std::vector<const column_definition*>& defs) override;

--- a/cql3/selection/writetime_or_ttl.hh
+++ b/cql3/selection/writetime_or_ttl.hh
@@ -58,13 +58,7 @@ public:
            : _id(std::move(id)), _is_writetime(is_writetime) {
    }

-#if 0
-    @Override
-    public String toString()
-    {
-        return (isWritetime ? "writetime" : "ttl") + "(" + id + ")";
-    }
-#endif
+    virtual sstring to_string() const override;

    virtual shared_ptr<selector::factory> new_selector_factory(database& db, schema_ptr s, std::vector<const column_definition*>& defs) override;

--- a/cql3/statements/batch_statement.hh
+++ b/cql3/statements/batch_statement.hh
@@ -169,26 +169,21 @@ public:
    }
 private:
    future<std::vector<mutation>> get_mutations(distributed<service::storage_proxy>& storage, const query_options& options, bool local, api::timestamp_type now) {
-        struct collector {
-            std::vector<mutation> _result;
-            std::vector<mutation> get() && { return std::move(_result); }
-            void operator()(std::vector<mutation> more) {
-                std::move(more.begin(), more.end(), std::back_inserter(_result));
-            }
-        };
-        auto get_mutations_for_statement = [this, &storage, &options, now, local] (size_t i) {
-            auto&& statement = _statements[i];
-            auto&& statement_options = options.for_statement(i);
-            auto timestamp = _attrs->get_timestamp(now, statement_options);
-            return statement->get_mutations(storage, statement_options, local, timestamp);
-        };
-        // FIXME: origin tries hard to merge mutations to same keyspace, for
-        //        some reason.
-        return map_reduce(
-                boost::make_counting_iterator<size_t>(0),
-                boost::make_counting_iterator<size_t>(_statements.size()),
-                get_mutations_for_statement,
-                collector());
+        // Do not process in parallel because operations like list append/prepend depend on execution order.
+        return do_with(std::vector<mutation>(), [this, &storage, &options, now, local] (auto&& result) {
+            return do_for_each(boost::make_counting_iterator<size_t>(0),
+                               boost::make_counting_iterator<size_t>(_statements.size()),
+                               [this, &storage, &options, now, local, &result] (size_t i) {
+                auto&& statement = _statements[i];
+                auto&& statement_options = options.for_statement(i);
+                auto timestamp = _attrs->get_timestamp(now, statement_options);
+                return statement->get_mutations(storage, statement_options, local, timestamp).then([&result] (auto&& more) {
+                    std::move(more.begin(), more.end(), std::back_inserter(result));
+                });
+            }).then([&result] {
+                return std::move(result);
+            });
+        });
    }

 public:
--- a/database.cc
+++ b/database.cc
@@ -45,7 +45,9 @@
 #include <boost/algorithm/cxx11/all_of.hpp>
 #include <boost/function_output_iterator.hpp>
 #include <boost/range/algorithm/heap_algorithm.hpp>
+#include <boost/range/algorithm/remove_if.hpp>
 #include <boost/range/algorithm/find.hpp>
+#include <boost/range/adaptor/map.hpp>
 #include "frozen_mutation.hh"
 #include "mutation_partition_applier.hh"
 #include "core/do_with.hh"
@@ -85,14 +87,16 @@ public:
 column_family::column_family(schema_ptr schema, config config, db::commitlog& cl, compaction_manager& compaction_manager)
    : _schema(std::move(schema))
    , _config(std::move(config))
-    , _memtables(make_lw_shared(memtable_list{}))
+    , _memtables(make_lw_shared<memtable_list>([this] { return seal_active_memtable(); }, [this] { return new_memtable(); }, _config.max_memtable_size))
+    , _streaming_memtables(_config.enable_disk_writes ?
+        make_lw_shared<memtable_list>([this] { return seal_active_streaming_memtable_delayed(); }, [this] { return new_streaming_memtable(); }, _config.max_memtable_size) :
+        make_lw_shared<memtable_list>([this] { return seal_active_memtable(); }, [this] { return new_memtable(); }, _config.max_memtable_size))
    , _sstables(make_lw_shared<sstable_list>())
    , _cache(_schema, sstables_as_mutation_source(), sstables_as_key_source(), global_cache_tracker())
    , _commitlog(&cl)
    , _compaction_manager(compaction_manager)
    , _flush_queue(std::make_unique<memtable_flush_queue>())
 {
-    add_memtable();
    if (!_config.enable_disk_writes) {
        dblog.warn("Writes disabled, column family no durable.");
    }
@@ -101,14 +105,16 @@ column_family::column_family(schema_ptr schema, config config, db::commitlog& cl
 column_family::column_family(schema_ptr schema, config config, no_commitlog cl, compaction_manager& compaction_manager)
    : _schema(std::move(schema))
    , _config(std::move(config))
-    , _memtables(make_lw_shared(memtable_list{}))
+    , _memtables(make_lw_shared<memtable_list>([this] { return seal_active_memtable(); }, [this] { return new_memtable(); }, _config.max_memtable_size))
+    , _streaming_memtables(_config.enable_disk_writes ?
+        make_lw_shared<memtable_list>([this] { return seal_active_streaming_memtable_delayed(); }, [this] { return new_streaming_memtable(); }, _config.max_memtable_size) :
+        make_lw_shared<memtable_list>([this] { return seal_active_memtable(); }, [this] { return new_memtable(); }, _config.max_memtable_size))
    , _sstables(make_lw_shared<sstable_list>())
    , _cache(_schema, sstables_as_mutation_source(), sstables_as_key_source(), global_cache_tracker())
    , _commitlog(nullptr)
    , _compaction_manager(compaction_manager)
    , _flush_queue(std::make_unique<memtable_flush_queue>())
 {
-    add_memtable();
    if (!_config.enable_disk_writes) {
        dblog.warn("Writes disabled, column family no durable.");
    }
@@ -140,7 +146,10 @@ column_family::~column_family() {

 logalloc::occupancy_stats column_family::occupancy() const {
    logalloc::occupancy_stats res;
-    for (auto m : *_memtables.get()) {
+    for (auto m : *_memtables) {
+        res += m->region().occupancy();
+    }
+    for (auto m : *_streaming_memtables) {
        res += m->region().occupancy();
    }
    return res;
@@ -483,8 +492,9 @@ future<sstables::entry_descriptor> column_family::probe_file(sstring sstdir, sst
        }
    }

-    auto fut = sstable::get_sstable_key_range(*_schema, _schema->ks_name(), _schema->cf_name(), sstdir, comps.generation, comps.version, comps.format);
-    return std::move(fut).then([this, sstdir = std::move(sstdir), comps] (range<partition_key> r) {
+    auto sst = std::make_unique<sstables::sstable>(_schema->ks_name(), _schema->cf_name(), sstdir, comps.generation, comps.version, comps.format);
+    auto fut = sst->get_sstable_key_range(*_schema);
+    return std::move(fut).then([this, sst = std::move(sst), sstdir = std::move(sstdir), comps] (range<partition_key> r) mutable {
        // Checks whether or not sstable belongs to current shard.
        if (!belongs_to_current_shard(*_schema, std::move(r))) {
            dblog.debug("sstable {} not relevant for this shard, ignoring",
@@ -494,7 +504,6 @@ future<sstables::entry_descriptor> column_family::probe_file(sstring sstdir, sst
            return make_ready_future<>();
        }

-        auto sst = std::make_unique<sstables::sstable>(_schema->ks_name(), _schema->cf_name(), sstdir, comps.generation, comps.version, comps.format);
        auto fut = sst->load();
        return std::move(fut).then([this, sst = std::move(sst)] () mutable {
            add_sstable(std::move(*sst));
@@ -533,12 +542,15 @@ void column_family::add_sstable(lw_shared_ptr<sstables::sstable> sstable) {
    _sstables->emplace(generation, std::move(sstable));
 }

-void column_family::add_memtable() {
-    // allow in-progress reads to continue using old list
-    _memtables = make_lw_shared(memtable_list(*_memtables));
-    _memtables->emplace_back(make_lw_shared<memtable>(_schema, _config.dirty_memory_region_group));
+lw_shared_ptr<memtable> column_family::new_memtable() {
+    return make_lw_shared<memtable>(_schema, _config.dirty_memory_region_group);
 }

+lw_shared_ptr<memtable> column_family::new_streaming_memtable() {
+    return make_lw_shared<memtable>(_schema, _config.streaming_dirty_memory_region_group);
+}
+
+
 future<>
 column_family::update_cache(memtable& m, lw_shared_ptr<sstable_list> old_sstables) {
    if (_config.enable_cache) {
@@ -550,6 +562,97 @@ column_family::update_cache(memtable& m, lw_shared_ptr<sstable_list> old_sstable
    }
 }

+// FIXME: because we are coalescing, it could be that mutations belonging to the same
+// range end up in two different tables. Technically, we should wait for both. However,
+// the only way we have to make this happen now is to wait on all previous writes. This
+// is certainly overkill, so we won't do it. We can fix this longer term by looking
+// at the PREPARE messages, and then noting what is the minimum future we should be
+// waiting for.
+future<>
+column_family::seal_active_streaming_memtable_delayed() {
+    auto old = _streaming_memtables->back();
+    if (old->empty()) {
+        return make_ready_future<>();
+    }
+
+    if (_streaming_memtables->should_flush()) {
+        return seal_active_streaming_memtable();
+    }
+
+    if (!_delayed_streaming_flush.armed()) {
+            // We don't want to wait for too long, because the incoming mutations will not be available
+            // until we flush them to SSTables. On top of that, if the sender ran out of messages, it won't
+            // send more until we respond to some - which depends on these futures resolving. Sure enough,
+            // the real fix for that second one is to have better communication between sender and receiver,
+            // but that's not realistic ATM. If we did have better negotiation here, we would not need a timer
+            // at all.
+            _delayed_streaming_flush.arm(2s);
+    }
+
+    return with_gate(_streaming_flush_gate, [this, old] {
+        return _waiting_streaming_flushes.get_shared_future();
+    });
+}
+
+future<>
+column_family::seal_active_streaming_memtable() {
+    auto old = _streaming_memtables->back();
+    if (old->empty()) {
+        return make_ready_future<>();
+    }
+    _streaming_memtables->add_memtable();
+    _streaming_memtables->erase(old);
+    return with_gate(_streaming_flush_gate, [this, old] {
+        _delayed_streaming_flush.cancel();
+
+        auto current_waiters = std::exchange(_waiting_streaming_flushes, shared_promise<>());
+        auto f = current_waiters.get_shared_future(); // for this seal
+
+        with_lock(_sstables_lock.for_read(), [this, old] {
+            auto newtab = make_lw_shared<sstables::sstable>(_schema->ks_name(), _schema->cf_name(),
+                _config.datadir, calculate_generation_for_new_table(),
+                sstables::sstable::version_types::ka,
+                sstables::sstable::format_types::big);
+
+            newtab->set_unshared();
+
+            auto&& priority = service::get_local_streaming_write_priority();
+            // This is somewhat similar to the main memtable flush, but with important differences.
+            //
+            // The first difference, is that we don't keep aggregate collectd statistics about this one.
+            // If we ever need to, we'll keep them as separate statistics, but we don't want to pollute the
+            // main stats about memtables with streaming memtables.
+            //
+            // Second, we will not bother touching the cache after this flush. The current streaming code
+            // will invalidate the ranges it touches, so we won't do it twice. Even when that changes, the
+            // cache management code in here will have to differ from the main memtable's one. Please see
+            // the comment at flush_streaming_mutations() for details.
+            //
+            // Lastly, we don't have any commitlog RP to update, and we don't need to manipulate the
+            // memtable list, since this memtable was not available for reading up until this point.
+            return newtab->write_components(*old, incremental_backups_enabled(), priority).then([this, newtab, old] {
+                return newtab->open_data();
+            }).then([this, old, newtab] () {
+                add_sstable(newtab);
+                trigger_compaction();
+            }).handle_exception([] (auto ep) {
+                dblog.error("failed to write streamed sstable: {}", ep);
+                return make_exception_future<>(ep);
+            });
+            // We will also not have any retry logic. If we fail here, we'll fail the streaming and let
+            // the upper layers know. They can then apply any logic they want here.
+        }).then_wrapped([this, current_waiters = std::move(current_waiters)] (future <> f) mutable {
+            if (f.failed()) {
+                current_waiters.set_exception(f.get_exception());
+            } else {
+                current_waiters.set_value();
+            }
+        });
+
+        return f;
+    });
+}
+
 future<>
 column_family::seal_active_memtable() {
    auto old = _memtables->back();
@@ -563,7 +666,7 @@ column_family::seal_active_memtable() {
        dblog.debug("Memtable is empty");
        return make_ready_future<>();
    }
-    add_memtable();
+    _memtables->add_memtable();

    assert(_highest_flushed_rp < old->replay_position()
    || (_highest_flushed_rp == db::replay_position() && old->replay_position() == db::replay_position())
@@ -637,7 +740,7 @@ column_family::try_flush_memtable_to_sstable(lw_shared_ptr<memtable> old) {
                    dblog.error("failed to move memtable to cache: {}", std::current_exception());
                }

-                _memtables->erase(boost::range::find(*_memtables, old));
+                _memtables->erase(old);
                dblog.debug("Memtable replaced");

                return make_ready_future<stop_iteration>(stop_iteration::yes);
@@ -660,28 +763,39 @@ column_family::start() {
 future<>
 column_family::stop() {
    seal_active_memtable();
+    seal_active_streaming_memtable();
    return _compaction_manager.remove(this).then([this] {
-        return _flush_queue->close();
+        // Nest, instead of using when_all, so we don't lose any exceptions.
+        return _flush_queue->close().then([this] {
+            return _streaming_flush_gate.close();
+        });
+    }).then([this] {
+        return _sstable_deletion_gate.close();
    });
 }


 future<std::vector<sstables::entry_descriptor>>
-column_family::reshuffle_sstables(int64_t start) {
+column_family::reshuffle_sstables(std::set<int64_t> all_generations, int64_t start) {
    struct work {
        int64_t current_gen;
+        std::set<int64_t> all_generations; // Stores generation of all live sstables in the system.
        sstable_list sstables;
        std::unordered_map<int64_t, sstables::entry_descriptor> descriptors;
        std::vector<sstables::entry_descriptor> reshuffled;
-        work(int64_t start) : current_gen(start ? start : 1) {}
+        work(int64_t start, std::set<int64_t> gens)
+            : current_gen(start ? start : 1)
+            , all_generations(gens) {}
    };

-    return do_with(work(start), [this] (work& work) {
+    return do_with(work(start, std::move(all_generations)), [this] (work& work) {
        return lister::scan_dir(_config.datadir, { directory_entry_type::regular }, [this, &work] (directory_entry de) {
            auto comps = sstables::entry_descriptor::make_descriptor(de.name);
            if (comps.component != sstables::sstable::component_type::TOC) {
                return make_ready_future<>();
-            } else if (comps.generation < work.current_gen) {
+            }
+            // Skip generations that were already loaded by Scylla at a previous stage.
+            if (work.all_generations.count(comps.generation) != 0) {
                return make_ready_future<>();
            }
            auto sst = make_lw_shared<sstables::sstable>(_schema->ks_name(), _schema->cf_name(),
@@ -719,6 +833,21 @@ column_family::reshuffle_sstables(int64_t start) {
    });
 }

+void column_family::rebuild_statistics() {
+    // zeroing live_disk_space_used and live_sstable_count because the
+    // sstable list was re-created
+    _stats.live_disk_space_used = 0;
+    _stats.live_sstable_count = 0;
+
+    for (auto&& tab : boost::range::join(_sstables_compacted_but_not_deleted,
+                    // this might seem dangerous, but "move" here just avoids constness,
+                    // making the two ranges compatible when compiling with boost 1.55.
+                    // No one is actually moving anything...
+                                         std::move(*_sstables) | boost::adaptors::map_values)) {
+        update_stats_for_new_sstable(tab->data_size());
+    }
+}
+
 void
 column_family::rebuild_sstable_list(const std::vector<sstables::shared_sstable>& new_sstables,
                                    const std::vector<sstables::shared_sstable>& sstables_to_remove) {
@@ -727,37 +856,53 @@ column_family::rebuild_sstable_list(const std::vector<sstables::shared_sstable>&
    // later), and we add the new tables generated by the compaction.
    // We create a new list rather than modifying it in-place, so that
    // on-going reads can continue to use the old list.
+    //
+    // We only remove old sstables after they are successfully deleted,
+    // to avoid a new compaction from ignoring data in the old sstables
+    // if the deletion fails (note deletion of shared sstables can take
+    // unbounded time, because all shards must agree on the deletion).
    auto current_sstables = _sstables;
    auto new_sstable_list = make_lw_shared<sstable_list>();
+    auto new_compacted_but_not_deleted = _sstables_compacted_but_not_deleted;

-    // zeroing live_disk_space_used and live_sstable_count because the
-    // sstable list is re-created below.
-    _stats.live_disk_space_used = 0;
-    _stats.live_sstable_count = 0;

    std::unordered_set<sstables::shared_sstable> s(
           sstables_to_remove.begin(), sstables_to_remove.end());

-    for (const auto& oldtab : *current_sstables) {
+    // First, add the new sstables.
+
+    // this might seem dangerous, but "move" here just avoids constness,
+    // making the two ranges compatible when compiling with boost 1.55.
+    // No one is actually moving anything...
+    for (auto&& tab : boost::range::join(new_sstables, std::move(*current_sstables) | boost::adaptors::map_values)) {
        // Checks if oldtab is a sstable not being compacted.
-        if (!s.count(oldtab.second)) {
-            update_stats_for_new_sstable(oldtab.second->data_size());
-            new_sstable_list->emplace(oldtab.first, oldtab.second);
+        if (!s.count(tab)) {
+            new_sstable_list->emplace(tab->generation(), tab);
+        } else {
+            new_compacted_but_not_deleted.push_back(tab);
        }
    }
-
-    for (const auto& newtab : new_sstables) {
-        // FIXME: rename the new sstable(s). Verify a rename doesn't cause
-        // problems for the sstable object.
-        update_stats_for_new_sstable(newtab->data_size());
-        new_sstable_list->emplace(newtab->generation(), newtab);
-    }
-
-    for (const auto& oldtab : sstables_to_remove) {
-        oldtab->mark_for_deletion();
-    }
-
    _sstables = std::move(new_sstable_list);
+    _sstables_compacted_but_not_deleted = std::move(new_compacted_but_not_deleted);
+
+    rebuild_statistics();
+
+    // Second, delete the old sstables.  This is done in the background, so we can
+    // consider this compaction completed.
+    seastar::with_gate(_sstable_deletion_gate, [this, sstables_to_remove] {
+        return sstables::delete_atomically(sstables_to_remove).then([this, sstables_to_remove] {
+            auto current_sstables = _sstables;
+            auto new_sstable_list = make_lw_shared<sstable_list>();
+
+            std::unordered_set<sstables::shared_sstable> s(
+                   sstables_to_remove.begin(), sstables_to_remove.end());
+            auto e = boost::range::remove_if(_sstables_compacted_but_not_deleted, [&] (sstables::shared_sstable sst) -> bool {
+                return s.count(sst);
+            });
+            _sstables_compacted_but_not_deleted.erase(e, _sstables_compacted_but_not_deleted.end());
+            rebuild_statistics();
+        });
+    });
 }

 future<>
@@ -781,7 +926,7 @@ column_family::compact_sstables(sstables::compaction_descriptor descriptor, bool
        };
        return sstables::compact_sstables(*sstables_to_compact, *this, create_sstable, descriptor.max_sstable_bytes, descriptor.level,
                cleanup).then([this, sstables_to_compact] (auto new_sstables) {
-            this->rebuild_sstable_list(new_sstables, *sstables_to_compact);
+            return this->rebuild_sstable_list(new_sstables, *sstables_to_compact);
        });
    });
 }
@@ -912,6 +1057,24 @@ lw_shared_ptr<sstable_list> column_family::get_sstables() {
    return _sstables;
 }

+// Returns all sstables of this column family, including those that are no
+// longer used for active queries because they were already compacted, but
+// are still waiting for delete_atomically() to return. If there are none of
+// those, _sstables itself is returned rather than a copy.
+//
+// Until they are deleted, compaction must not garbage-collect a tombstone
+// that covers data in an sstable that may not be successfully deleted.
+lw_shared_ptr<sstable_list> column_family::get_sstables_including_compacted_undeleted() {
+    if (_sstables_compacted_but_not_deleted.empty()) {
+        return _sstables;
+    }
+    auto combined = make_lw_shared(*_sstables);
+    for (const auto& sst : _sstables_compacted_but_not_deleted) {
+        combined->emplace(sst->generation(), sst);
+    }
+    return combined;
+}
+
 inline bool column_family::manifest_json_filter(const sstring& fname) {
    using namespace boost::filesystem;

@@ -1027,14 +1190,24 @@ database::database() : database(db::config())
 {}

 database::database(const db::config& cfg)
-    : _cfg(std::make_unique<db::config>(cfg))
+    : _streaming_dirty_memory_region_group(&_dirty_memory_region_group)
+    , _cfg(std::make_unique<db::config>(cfg))
+    , _memtable_total_space([this] {
+        auto memtable_total_space = size_t(_cfg->memtable_total_space_in_mb()) << 20;
+        if (!memtable_total_space) {
+            return memory::stats().total_memory() / 2;
+        }
+        return memtable_total_space;
+    }())
    , _version(empty_version)
    , _enable_incremental_backups(cfg.incremental_backups())
+    , _memtables_throttler(_memtable_total_space, _dirty_memory_region_group)
+    // We have to be careful here not to set the streaming limit for less than
+    // a memtable maximum size. Allow up to 25 % to be used up by streaming memtables
+    // in the common case
+    , _streaming_throttler(_memtable_total_space * std::min(0.25, cfg.memtable_cleanup_threshold()),
+                           _streaming_dirty_memory_region_group, _memtables_throttler)
 {
-    _memtable_total_space = size_t(_cfg->memtable_total_space_in_mb()) << 20;
-    if (!_memtable_total_space) {
-        _memtable_total_space = memory::stats().total_memory() / 2;
-    }
    // Start compaction manager with two tasks for handling compaction jobs.
    _compaction_manager.start(2);
    setup_collectd();
@@ -1424,6 +1597,7 @@ keyspace::make_column_family_config(const schema& s) const {
    cfg.enable_cache = _config.enable_cache;
    cfg.max_memtable_size = _config.max_memtable_size;
    cfg.dirty_memory_region_group = _config.dirty_memory_region_group;
+    cfg.streaming_dirty_memory_region_group = _config.streaming_dirty_memory_region_group;
    cfg.cf_stats = _config.cf_stats;
    cfg.enable_incremental_backups = _config.enable_incremental_backups;

@@ -1599,21 +1773,12 @@ column_family::query(schema_ptr s, const query::read_command& cmd, query::result
    {
        return do_until(std::bind(&query_state::done, &qs), [this, &qs] {
            auto&& range = *qs.current_partition_range++;
-            qs.reader = make_reader(qs.schema, range, service::get_local_sstable_query_read_priority());
-            qs.range_empty = false;
-            return do_until([&qs] { return !qs.limit || qs.range_empty; }, [&qs] {
-                return qs.reader().then([&qs](mutation_opt mo) {
-                    if (mo) {
-                        auto p_builder = qs.builder.add_partition(*mo->schema(), mo->key());
-                        auto is_distinct = qs.cmd.slice.options.contains(query::partition_slice::option::distinct);
-                        auto limit = !is_distinct ? qs.limit : 1;
-                        auto rows_added = mo->partition().query(p_builder, *qs.schema, qs.cmd.timestamp, limit);
-                        qs.limit -= rows_added;
-                    } else {
-                        qs.range_empty = true;
-                    }
-                });
-            });
+            auto add_partition = [&qs] (uint32_t live_rows, mutation&& m) {
+                auto pb = qs.builder.add_partition(*qs.schema, m.key());
+                m.partition().query_compacted(pb, *qs.schema, live_rows);
+            };
+            return do_with(querying_reader(qs.schema, as_mutation_source(), range, qs.cmd.slice, qs.limit, qs.cmd.timestamp, add_partition),
+                           [] (auto&& rd) { return rd.read(); });
        }).then([qs_ptr = std::move(qs_ptr), &qs] {
            return make_ready_future<lw_shared_ptr<query::result>>(
                    make_lw_shared<query::result>(qs.builder.build()));
@@ -1711,8 +1876,8 @@ void
 column_family::apply(const mutation& m, const db::replay_position& rp) {
    utils::latency_counter lc;
    _stats.writes.set_latency(lc);
-    active_memtable().apply(m, rp);
-    seal_on_overflow();
+    _memtables->active_memtable().apply(m, rp);
+    _memtables->seal_on_overflow();
    _stats.writes.mark(lc);
    if (lc.is_start()) {
        _stats.estimated_write.add(lc.latency(), _stats.writes.count);
@@ -1724,21 +1889,17 @@ column_family::apply(const frozen_mutation& m, const schema_ptr& m_schema, const
    utils::latency_counter lc;
    _stats.writes.set_latency(lc);
    check_valid_rp(rp);
-    active_memtable().apply(m, m_schema, rp);
-    seal_on_overflow();
+    _memtables->active_memtable().apply(m, m_schema, rp);
+    _memtables->seal_on_overflow();
    _stats.writes.mark(lc);
    if (lc.is_start()) {
        _stats.estimated_write.add(lc.latency(), _stats.writes.count);
    }
 }

-void
-column_family::seal_on_overflow() {
-    if (active_memtable().occupancy().total_space() >= _config.max_memtable_size) {
-        // FIXME: if sparse, do some in-memory compaction first
-        // FIXME: maybe merge with other in-memory memtables
-        seal_active_memtable();
-    }
+void column_family::apply_streaming_mutation(schema_ptr m_schema, const frozen_mutation& m) {
+    _streaming_memtables->active_memtable().apply(m, m_schema);
+    _streaming_memtables->seal_on_overflow();
 }

 void
@@ -1787,9 +1948,20 @@ future<> database::do_apply(schema_ptr s, const frozen_mutation& m) {
    return apply_in_memory(m, s, db::replay_position());
 }

-future<> database::throttle() {
-    if (_dirty_memory_region_group.memory_used() < _memtable_total_space
-            && _throttled_requests.empty()) {
+database::throttle_state::throttle_state(size_t max_space, logalloc::region_group& rg)
+    : _max_space(max_space) // limit that should_throttle() compares rg.memory_used() against
+    , _region_group(rg)
+    , _parent(nullptr) // no parent: should_throttle() only checks this region group
+{}
+
+database::throttle_state::throttle_state(size_t max_space, logalloc::region_group& rg, throttle_state& parent)
+    : _max_space(max_space) // limit that should_throttle() compares rg.memory_used() against
+    , _region_group(rg)
+    , _parent(&parent) // should_throttle() also defers to the parent when under its own limit
+{}
+
+future<> database::throttle_state::throttle() {
+    if (!should_throttle() && _throttled_requests.empty()) {
        // All is well, go ahead
        return make_ready_future<>();
    }
@@ -1801,13 +1973,13 @@ future<> database::throttle() {
    return _throttled_requests.back().get_future();
 }

-void database::unthrottle() {
+void database::throttle_state::unthrottle() {
    // Release one request per free 1MB we have
    // FIXME: improve this
-    if (_dirty_memory_region_group.memory_used() >= _memtable_total_space) {
+    if (should_throttle()) {
        return;
    }
-    size_t avail = (_memtable_total_space - _dirty_memory_region_group.memory_used()) >> 20;
+    size_t avail = (_max_space - _region_group.memory_used()) >> 20;
    avail = std::min(_throttled_requests.size(), avail);
    for (size_t i = 0; i < avail; ++i) {
        _throttled_requests.front().set_value();
@@ -1822,11 +1994,39 @@ future<> database::apply(schema_ptr s, const frozen_mutation& m) {
    if (dblog.is_enabled(logging::log_level::trace)) {
        dblog.trace("apply {}", m.pretty_printer(s));
    }
-    return throttle().then([this, &m, s = std::move(s)] {
+    return _memtables_throttler.throttle().then([this, &m, s = std::move(s)] {
        return do_apply(std::move(s), m);
    });
 }

+// Applies a streamed mutation to the streaming memtables of its column family,
+// after waiting on _streaming_throttler. Throws std::runtime_error if the schema
+// is not synced. "m" is captured by reference by the continuation, so the caller
+// must keep it alive until the returned future resolves.
+future<> database::apply_streaming_mutation(schema_ptr s, const frozen_mutation& m) {
+    if (!s->is_synced()) {
+        throw std::runtime_error(sprint("attempted to mutate using not synced schema of %s.%s, version=%s",
+                                 s->ks_name(), s->cf_name(), s->version()));
+    }
+
+    // TODO (maybe): This will use the same memory region group as memtables, so when
+    // one of them throttles, both will. It would be possible to provide further QoS
+    // for CQL originated memtables by keeping the streaming memtables in a different
+    // region group, with its own separate limit. Because, however, there are many
+    // other limits in play that may kick in, I am not convinced that this will ever
+    // be a problem. If we do find ourselves throttling incoming writes due to a high
+    // level of streaming writes, and we are sure that this is the best solution, we
+    // can change the memtable creation method so that each kind of memtable is
+    // created from a different region group, and update the throttle conditions.
+    // NOTE(review): the constructor appears to already give _streaming_throttler its
+    // own region group and limit - confirm whether the TODO above is still current.
+    return _streaming_throttler.throttle().then([this, &m, s = std::move(s)] {
+        auto uuid = m.column_family_id();
+        auto& cf = find_column_family(uuid);
+        cf.apply_streaming_mutation(s, m); // "m" is a const reference: std::move() on it was a no-op
+    });
+}
+
 keyspace::config
 database::make_keyspace_config(const keyspace_metadata& ksm) {
    // FIXME support multiple directories
@@ -1847,6 +2047,7 @@ database::make_keyspace_config(const keyspace_metadata& ksm) {
        cfg.max_memtable_size = std::numeric_limits<size_t>::max();
    }
    cfg.dirty_memory_region_group = &_dirty_memory_region_group;
+    cfg.streaming_dirty_memory_region_group = &_streaming_dirty_memory_region_group;
    cfg.cf_stats = &_cf_stats;
    cfg.enable_incremental_backups = _enable_incremental_backups;
    return cfg;
@@ -2299,10 +2500,36 @@ future<> column_family::flush(const db::replay_position& pos) {
    return seal_active_memtable();
 }

+// Waits for seal_active_streaming_memtable_delayed() and then - whether it succeeded
+// or failed - invalidates the given ranges in the row cache, if the cache is enabled.
+//
+// FIXME: We can do much better than this in terms of cache management. Right
+// now, we only have to flush the touched ranges because of the possibility of
+// streaming containing token ownership changes. We can't yet differentiate
+// between that and a normal repair process, so we always flush. When we can, we
+// should only invalidate the cache entries relevant to what is already cached.
+future<> column_family::flush_streaming_mutations(std::vector<query::partition_range> ranges) {
+    // This effectively takes the gate twice for this call; the proper fix would be to make
+    // seal_active_streaming_memtable_delayed take a range parameter, but this code should go away
+    // soon (see FIXME above), so the double gate is a better temporary countermeasure.
+    // The outer lambda is mutable so that "ranges" is moved, not copied, into the continuation.
+    return with_gate(_streaming_flush_gate, [this, ranges = std::move(ranges)] () mutable {
+        return seal_active_streaming_memtable_delayed().finally([this, ranges = std::move(ranges)] {
+            if (_config.enable_cache) {
+                for (auto& range : ranges) {
+                    _cache.invalidate(range);
+                }
+            }
+        });
+    });
+}
+
 void column_family::clear() {
    _cache.clear();
    _memtables->clear();
-    add_memtable();
+    _memtables->add_memtable();
+    _streaming_memtables->clear();
+    _streaming_memtables->add_memtable();
 }

 // NOTE: does not need to be futurized, but might eventually, depending on
@@ -2316,21 +2543,26 @@ future<db::replay_position> column_family::discard_sstables(db_clock::time_point
        auto gc_trunc = to_gc_clock(truncated_at);

        auto pruned = make_lw_shared<sstable_list>();
+        std::vector<sstables::shared_sstable> remove;

        for (auto&p : *_sstables) {
            if (p.second->max_data_age() <= gc_trunc) {
                rp = std::max(p.second->get_stats_metadata().position, rp);
-                p.second->mark_for_deletion();
+                remove.emplace_back(p.second);
                continue;
            }
            pruned->emplace(p.first, p.second);
        }

        _sstables = std::move(pruned);
-
        dblog.debug("cleaning out row cache");
        _cache.clear();
-        return make_ready_future<db::replay_position>(rp);
+
+        return parallel_for_each(remove, [](sstables::shared_sstable s) {
+            return sstables::delete_atomically({s});
+        }).then([rp] {
+            return make_ready_future<db::replay_position>(rp);
+        }).finally([remove] {}); // keep the objects alive until here.
    });
 }

@@ -2376,6 +2608,10 @@ void column_family::set_schema(schema_ptr s) {
        m->set_schema(s);
    }

+    for (auto& m : *_streaming_memtables) {
+        m->set_schema(s);
+    }
+
    _cache.set_schema(s);
    _schema = std::move(s);
 }
--- a/database.hh
+++ b/database.hh
@@ -41,6 +41,7 @@
 #include <set>
 #include <iostream>
 #include <boost/functional/hash.hpp>
+#include <boost/range/algorithm/find.hpp>
 #include <experimental/optional>
 #include <string.h>
 #include "types.hh"
@@ -70,6 +71,7 @@
 #include "sstables/compaction.hh"
 #include "key_reader.hh"
 #include <seastar/core/rwlock.hh>
+#include <seastar/core/shared_future.hh>

 class frozen_mutation;
 class reconcilable_result;
@@ -98,7 +100,96 @@ void make(database& db, bool durable, bool volatile_testing_only);

 class replay_position_reordered_exception : public std::exception {};

-using memtable_list = std::vector<lw_shared_ptr<memtable>>;
+// A list of memtables together with the functions used to seal the active
+// (last) one and to create a new one, and the size at which it should flush.
+//
+// We could just add all memtables, regardless of type, to a single list, and
+// filter them when we read them. Here's why I have chosen not to do it:
+//
+// First, some of the methods in which a memtable is involved (like seal)
+// assume a commitlog, and take great care to update the replay position,
+// flush the log, etc.  We want to bypass those, which means either sprinkling
+// the seal code with conditionals, or having a separate method for each seal.
+//
+// Also, if we ever want to put some of the memtables in a separate allocator
+// region group to provide extra QoS, having the classes properly wrapped makes
+// that trivial: just pass a version of new_memtable() that puts it in a
+// different region; the list approach would need many conditionals as well.
+//
+// If we are going to have different methods, better have different instances
+// of a common class.
+class memtable_list {
+    using shared_memtable = lw_shared_ptr<memtable>;
+    std::vector<shared_memtable> _memtables;
+    std::function<future<> ()> _seal_fn;
+    std::function<shared_memtable ()> _new_memtable;
+    size_t _max_memtable_size;
+public:
+    memtable_list(std::function<future<> ()> seal_fn, std::function<shared_memtable()> new_mt, size_t max_memtable_size)
+        : _memtables({})
+        , _seal_fn(seal_fn)
+        , _new_memtable(new_mt)
+        , _max_memtable_size(max_memtable_size) {
+        add_memtable();
+    }
+
+    shared_memtable back() {
+        return _memtables.back();
+    }
+
+    // The caller has to make sure the element exists before calling this.
+    void erase(const shared_memtable& element) {
+        _memtables.erase(boost::range::find(_memtables, element));
+    }
+    void clear() {
+        _memtables.clear();
+    }
+
+    size_t size() const {
+        return _memtables.size();
+    }
+
+    future<> seal_active_memtable() {
+        return _seal_fn();
+    }
+
+    auto begin() noexcept {
+        return _memtables.begin();
+    }
+
+    auto begin() const noexcept {
+        return _memtables.begin();
+    }
+
+    auto end() noexcept {
+        return _memtables.end();
+    }
+
+    auto end() const noexcept {
+        return _memtables.end();
+    }
+
+    memtable& active_memtable() {
+        return *_memtables.back();
+    }
+
+    void add_memtable() {
+        _memtables.emplace_back(_new_memtable());
+    }
+
+    bool should_flush() {
+        return active_memtable().occupancy().total_space() >= _max_memtable_size;
+    }
+
+    void seal_on_overflow() {
+        if (should_flush()) {
+            // FIXME: if sparse, do some in-memory compaction first
+            // FIXME: maybe merge with other in-memory memtables
+            _seal_fn();
+        }
+    }
+};
+
 using sstable_list = sstables::sstable_list;

 // The CF has a "stats" structure. But we don't want all fields here,
@@ -122,6 +213,7 @@ public:
        bool enable_incremental_backups = false;
        size_t max_memtable_size = 5'000'000;
        logalloc::region_group* dirty_memory_region_group = nullptr;
+        logalloc::region_group* streaming_dirty_memory_region_group = nullptr;
        ::cf_stats* cf_stats = nullptr;
    };
    struct no_commitlog {};
@@ -153,8 +245,34 @@ private:
    config _config;
    stats _stats;
    lw_shared_ptr<memtable_list> _memtables;
+
+    // In older incarnations, we simply committed the mutations to memtables.
+    // However, doing that makes it harder for us to provide QoS within the
+    // disk subsystem. Keeping them in separate memtables allows us to properly
+    // classify those streams into their own I/O class.
+    //
+    // We could write those directly to disk, but we still want the mutations
+    // coming through the wire to go to a memtable staging area.  This has two
+    // major advantages:
+    //
+    // first, it will allow us to properly order the partitions. They are
+    // hopefully sent in order but we can't really guarantee that without
+    // sacrificing sender-side parallelism.
+    //
+    // second, we will be able to coalesce writes from multiple plan_id's and
+    // even multiple senders, as well as automatically tapping into the dirty
+    // memory throttling mechanism, guaranteeing we will not overload the
+    // server.
+    lw_shared_ptr<memtable_list> _streaming_memtables;
+
    // generation -> sstable. Ordered by key so we can easily get the most recent.
    lw_shared_ptr<sstable_list> _sstables;
+    // sstables that have been compacted (so don't look up in query) but
+    // have not been deleted yet, so must not GC any tombstones in other sstables
+    // that may delete data in these sstables:
+    std::vector<sstables::shared_sstable> _sstables_compacted_but_not_deleted;
+    // Control background fibers waiting for sstables to be deleted
+    seastar::gate _sstable_deletion_gate;
    // There are situations in which we need to stop writing sstables. Flushers will take
    // the read lock, and the ones that wish to stop that process will take the write lock.
    rwlock _sstables_lock;
@@ -171,11 +289,20 @@ private:
    int _compaction_disabled = 0;
    class memtable_flush_queue;
    std::unique_ptr<memtable_flush_queue> _flush_queue;
+    // Because streaming mutations bypass the commitlog, there is
+    // no need for the complications of the flush queue. Besides, it
+    // is easier to just use a common gate than it is to modify the flush_queue
+    // to work both with and without a replay position.
+    //
+    // Last but not least, we seldom need to guarantee any ordering here: as long
+    // as all data is waited for, we're good.
+    seastar::gate _streaming_flush_gate;
 private:
    void update_stats_for_new_sstable(uint64_t disk_space_used_by_sstable);
    void add_sstable(sstables::sstable&& sstable);
    void add_sstable(lw_shared_ptr<sstables::sstable> sstable);
-    void add_memtable();
+    lw_shared_ptr<memtable> new_memtable();
+    lw_shared_ptr<memtable> new_streaming_memtable();
    future<stop_iteration> try_flush_memtable_to_sstable(lw_shared_ptr<memtable> memt);
    future<> update_cache(memtable&, lw_shared_ptr<sstable_list> old_sstables);
    struct merge_comparator;
@@ -198,6 +325,7 @@ private:
    // Rebuild existing _sstables with new_sstables added to it and sstables_to_remove removed from it.
    void rebuild_sstable_list(const std::vector<sstables::shared_sstable>& new_sstables,
                              const std::vector<sstables::shared_sstable>& sstables_to_remove);
+    void rebuild_statistics();
 private:
    // Creates a mutation reader which covers sstables.
    // Caller needs to ensure that column_family remains live (FIXME: relax this).
@@ -251,7 +379,7 @@ public:
    // FIXME: in case a query is satisfied from a single memtable, avoid a copy
    using const_mutation_partition_ptr = std::unique_ptr<const mutation_partition>;
    using const_row_ptr = std::unique_ptr<const row>;
-    memtable& active_memtable() { return *_memtables->back(); }
+    memtable& active_memtable() { return _memtables->active_memtable(); }
    const row_cache& get_row_cache() const {
        return _cache;
    }
@@ -276,6 +404,7 @@ public:
    // The mutation is always upgraded to current schema.
    void apply(const frozen_mutation& m, const schema_ptr& m_schema, const db::replay_position& = db::replay_position());
    void apply(const mutation& m, const db::replay_position& = db::replay_position());
+    void apply_streaming_mutation(schema_ptr, const frozen_mutation&);

    // Returns at most "cmd.limit" rows
    future<lw_shared_ptr<query::result>> query(schema_ptr,
@@ -288,6 +417,7 @@ public:
    future<> stop();
    future<> flush();
    future<> flush(const db::replay_position&);
+    future<> flush_streaming_mutations(std::vector<query::partition_range> ranges = std::vector<query::partition_range>{});
    void clear(); // discards memtable(s) without flushing them to disk.
    future<db::replay_position> discard_sstables(db_clock::time_point);

@@ -298,7 +428,10 @@ public:
    future<int64_t> disable_sstable_write() {
        _sstable_writes_disabled_at = std::chrono::steady_clock::now();
        return _sstables_lock.write_lock().then([this] {
-            return make_ready_future<int64_t>((*_sstables->end()).first);
+            if (_sstables->empty()) {
+                return make_ready_future<int64_t>(0);
+            }
+            return make_ready_future<int64_t>((*_sstables->rbegin()).first);
        });
    }

@@ -321,9 +454,11 @@ public:
    // very dangerous to do that with live SSTables. This is meant to be used with SSTables
    // that are not yet managed by the system.
    //
+    // Parameter all_generations stores the generation of all SSTables in the system, so it
+    // will be easy to determine which SSTable is new.
    // An example usage would query all shards asking what is the highest SSTable number known
    // to them, and then pass that + 1 as "start".
-    future<std::vector<sstables::entry_descriptor>> reshuffle_sstables(int64_t start);
+    future<std::vector<sstables::entry_descriptor>> reshuffle_sstables(std::set<int64_t> all_generations, int64_t start);

    // FIXME: this is just an example, should be changed to something more
    // general. compact_all_sstables() starts a compaction of all sstables.
@@ -357,6 +492,7 @@ public:
    }

    lw_shared_ptr<sstable_list> get_sstables();
+    lw_shared_ptr<sstable_list> get_sstables_including_compacted_undeleted();
    size_t sstables_count();
    int64_t get_unleveled_sstables() const;

@@ -408,6 +544,31 @@ private:
    // synchronously flush data to disk.
    future<> seal_active_memtable();

+    // I am assuming here that the repair process will potentially send ranges containing
+    // few mutations, definitely not enough to fill a memtable. It wants to know whether or
+    // not each of those ranges individually succeeded or failed, so we need a future for
+    // each.
+    //
+    // One of the ways to fix that, is changing the repair itself to send more mutations at
+    // a single batch. But relying on that is a bad idea for two reasons:
+    //
+    // First, the goals of the SSTable writer and the repair sender are at odds. The SSTable
+    // writer wants to write as few SSTables as possible, while the repair sender wants to
+    // break down the range in pieces as small as it can and checksum them individually, so
+    // it doesn't have to send a lot of mutations for no reason.
+    //
+    // Second, even if the repair process wants to process larger ranges at once, some ranges
+    // themselves may be small. So while most ranges would be large, we would still have
+    // potentially some fairly small SSTables lying around.
+    //
+    // The best course of action in this case is to coalesce the incoming streams write-side.
+    // Repair can now choose whatever strategy - small or big ranges - it wants, resting assured
+    // that the incoming memtables will be coalesced together.
+    shared_promise<> _waiting_streaming_flushes;
+    timer<> _delayed_streaming_flush{[this] { seal_active_streaming_memtable(); }};
+    future<> seal_active_streaming_memtable();
+    future<> seal_active_streaming_memtable_delayed();
+
    // filter manifest.json files out
    static bool manifest_json_filter(const sstring& fname);

@@ -417,7 +578,6 @@ private:
    template <typename Func>
    future<bool> for_all_partitions(schema_ptr, Func&& func) const;
    future<sstables::entry_descriptor> probe_file(sstring sstdir, sstring fname);
-    void seal_on_overflow();
    void check_valid_rp(const db::replay_position&) const;
 public:
    // Iterate over all partitions.  Protocol is the same as std::all_of(),
@@ -521,6 +681,7 @@ public:
        bool enable_incremental_backups = false;
        size_t max_memtable_size = 5'000'000;
        logalloc::region_group* dirty_memory_region_group = nullptr;
+        logalloc::region_group* streaming_dirty_memory_region_group = nullptr;
        ::cf_stats* cf_stats = nullptr;
    };
 private:
@@ -582,6 +743,7 @@ public:
 class database {
    ::cf_stats _cf_stats;
    logalloc::region_group _dirty_memory_region_group;
+    logalloc::region_group _streaming_dirty_memory_region_group;
    std::unordered_map<sstring, keyspace> _keyspaces;
    std::unordered_map<utils::UUID, lw_shared_ptr<column_family>> _column_families;
    std::unordered_map<std::pair<sstring, sstring>, utils::UUID, utils::tuple_hash> _ks_cf_to_uuid;
@@ -592,8 +754,6 @@ class database {
    // compaction_manager object is referenced by all column families of a database.
    compaction_manager _compaction_manager;
    std::vector<scollectd::registration> _collectd;
-    timer<> _throttling_timer{[this] { unthrottle(); }};
-    circular_buffer<promise<>> _throttled_requests;
    bool _enable_incremental_backups = false;

    future<> init_commitlog();
@@ -608,9 +768,34 @@ private:
    void create_in_memory_keyspace(const lw_shared_ptr<keyspace_metadata>& ksm);
    friend void db::system_keyspace::make(database& db, bool durable, bool volatile_testing_only);
    void setup_collectd();
-    future<> throttle();
+
+    // Throttles requests while its region group uses more than _max_space,
+    // or while its optional parent throttle_state is itself throttling.
+    class throttle_state {
+        size_t _max_space;
+        logalloc::region_group& _region_group;
+        throttle_state* _parent; // nullptr when there is no parent
+
+        circular_buffer<promise<>> _throttled_requests;
+        timer<> _throttling_timer{[this] { unthrottle(); }};
+        // Timer callback: releases queued requests once should_throttle() is false.
+        void unthrottle();
+        // Checks this level's limit first, then defers to the parent chain.
+        bool should_throttle() const {
+            const bool over_limit = _region_group.memory_used() > _max_space;
+            return over_limit || (_parent != nullptr && _parent->should_throttle());
+        }
+    public:
+        throttle_state(size_t max_space, logalloc::region_group& region);
+        throttle_state(size_t max_space, logalloc::region_group& region, throttle_state& parent);
+        // Ready at once when not throttling and nothing is queued (see definition).
+        future<> throttle();
+    };
+
+    throttle_state _memtables_throttler;
+    throttle_state _streaming_throttler;
+
    future<> do_apply(schema_ptr, const frozen_mutation&);
-    void unthrottle();
 public:
    static utils::UUID empty_version;

@@ -678,6 +863,7 @@ public:
    future<lw_shared_ptr<query::result>> query(schema_ptr, const query::read_command& cmd, query::result_request request, const std::vector<query::partition_range>& ranges);
    future<reconcilable_result> query_mutations(schema_ptr, const query::read_command& cmd, const query::partition_range& range);
    future<> apply(schema_ptr, const frozen_mutation&);
+    future<> apply_streaming_mutation(schema_ptr, const frozen_mutation&);
    keyspace::config make_keyspace_config(const keyspace_metadata& ksm);
    const sstring& get_snitch_name() const;
    future<> clear_snapshot(sstring tag, std::vector<sstring> keyspace_names);
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -1043,7 +1043,9 @@ void db::commitlog::segment_manager::flush_segments(bool force) {

 future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::allocate_segment(bool active) {
    descriptor d(next_id());
-    return open_file_dma(cfg.commit_log_location + "/" + d.filename(), open_flags::wo | open_flags::create).then([this, d, active](file f) {
+    file_open_options opt;
+    opt.extent_allocation_size_hint = max_size;
+    return open_file_dma(cfg.commit_log_location + "/" + d.filename(), open_flags::wo | open_flags::create, opt).then([this, d, active](file f) {
        // xfs doesn't like files extended betond eof, so enlarge the file
        return f.truncate(max_size).then([this, d, active, f] () mutable {
            auto s = make_lw_shared<segment>(this->shared_from_this(), d, std::move(f), active);
--- a/db/config.hh
+++ b/db/config.hh
@@ -487,7 +487,7 @@ public:
    val(cas_contention_timeout_in_ms, uint32_t, 5000, Unused,     \
            "The time that the coordinator continues to retry a CAS (compare and set) operation that contends with other proposals for the same row."  \
    )   \
-    val(truncate_request_timeout_in_ms, uint32_t, 10000, Unused,     \
+    val(truncate_request_timeout_in_ms, uint32_t, 10000, Used,     \
            "The time that the coordinator waits for truncates (remove all data from a table) to complete. The long default value allows for a snapshot to be taken before removing the data. If auto_snapshot is disabled (not recommended), you can reduce this time."  \
    )   \
    val(write_request_timeout_in_ms, uint32_t, 2000, Used,     \
--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -663,7 +663,7 @@ future<std::set<sstring>> merge_keyspaces(distributed<service::storage_proxy>& p
    });
 }

-static void update_column_family(database& db, schema_ptr new_schema) {
+static future<> update_column_family(database& db, schema_ptr new_schema) {
    column_family& cfm = db.find_column_family(new_schema->id());

    bool columns_changed = !cfm.schema()->equal_columns(*new_schema);
@@ -672,7 +672,7 @@ static void update_column_family(database& db, schema_ptr new_schema) {
    s->registry_entry()->mark_synced();
    cfm.set_schema(std::move(s));

-    service::get_local_migration_manager().notify_update_column_family(cfm.schema(), columns_changed);
+    return service::get_local_migration_manager().notify_update_column_family(cfm.schema(), columns_changed);
 }

 // see the comments for merge_keyspaces()
@@ -713,15 +713,15 @@ static void merge_tables(distributed<service::storage_proxy>& proxy,
                    auto& cf = db.find_column_family(s);
                    cf.mark_ready_for_writes();
                    ks.make_directory_for_column_family(s->cf_name(), s->id()).get();
-                    service::get_local_migration_manager().notify_create_column_family(s);
+                    service::get_local_migration_manager().notify_create_column_family(s).get();
                }
                for (auto&& gs : altered) {
-                    update_column_family(db, gs.get());
+                    update_column_family(db, gs.get()).get();
                }
                parallel_for_each(dropped.begin(), dropped.end(), [&db, &tsf](auto&& gs) {
                    schema_ptr s = gs.get();
                    return db.drop_column_family(s->ks_name(), s->cf_name(), [&tsf] { return tsf.value(); }).then([s] {
-                        service::get_local_migration_manager().notify_drop_column_family(s);
+                        return service::get_local_migration_manager().notify_drop_column_family(s);
                    });
                }).get();
            });
--- a/dist/ami/build_ami.sh
+++ b/dist/ami/build_ami.sh
@@ -29,28 +29,74 @@ while [ $# -gt 0 ]; do
    esac
 done

+. /etc/os-release
+case "$ID" in
+    "centos")
+        AMI=ami-f3102499
+        REGION=us-east-1
+        SSH_USERNAME=centos
+        ;;
+    "ubuntu")
+        AMI=ami-ff427095
+        REGION=us-east-1
+        SSH_USERNAME=ubuntu
+        ;;
+    *)
+        echo "build_ami.sh does not supported this distribution."
+        exit 1
+        ;;
+esac
+
+
 if [ $LOCALRPM -eq 1 ]; then
-    rm -rf build/*
-    sudo yum -y install git
-    if [ ! -f dist/ami/files/scylla-server.x86_64.rpm ]; then
-        dist/redhat/build_rpm.sh
-        cp build/rpmbuild/RPMS/x86_64/scylla-server-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-server.x86_64.rpm
-    fi
-    if [ ! -f dist/ami/files/scylla-jmx.noarch.rpm ]; then
-        cd build
-        git clone --depth 1 https://github.com/scylladb/scylla-jmx.git
-        cd scylla-jmx
-        sh -x -e dist/redhat/build_rpm.sh $*
-        cd ../..
-        cp build/scylla-jmx/build/rpmbuild/RPMS/noarch/scylla-jmx-`cat build/scylla-jmx/build/SCYLLA-VERSION-FILE`-`cat build/scylla-jmx/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-jmx.noarch.rpm
-    fi
-    if [ ! -f dist/ami/files/scylla-tools.noarch.rpm ]; then
-        cd build
-        git clone --depth 1 https://github.com/scylladb/scylla-tools-java.git
-        cd scylla-tools-java
-        sh -x -e dist/redhat/build_rpm.sh
-        cd ../..
-        cp build/scylla-tools-java/build/rpmbuild/RPMS/noarch/scylla-tools-`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-tools.noarch.rpm
+    if [ "$ID" = "centos" ]; then
+        rm -rf build/*
+        sudo yum -y install git
+        if [ ! -f dist/ami/files/scylla-server.x86_64.rpm ]; then
+            dist/redhat/build_rpm.sh
+            cp build/rpmbuild/RPMS/x86_64/scylla-server-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-server.x86_64.rpm
+        fi
+        if [ ! -f dist/ami/files/scylla-jmx.noarch.rpm ]; then
+            cd build
+            git clone --depth 1 https://github.com/scylladb/scylla-jmx.git
+            cd scylla-jmx
+            sh -x -e dist/redhat/build_rpm.sh $*
+            cd ../..
+            cp build/scylla-jmx/build/rpmbuild/RPMS/noarch/scylla-jmx-`cat build/scylla-jmx/build/SCYLLA-VERSION-FILE`-`cat build/scylla-jmx/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-jmx.noarch.rpm
+        fi
+        if [ ! -f dist/ami/files/scylla-tools.noarch.rpm ]; then
+            cd build
+            git clone --depth 1 https://github.com/scylladb/scylla-tools-java.git
+            cd scylla-tools-java
+            sh -x -e dist/redhat/build_rpm.sh
+            cd ../..
+            cp build/scylla-tools-java/build/rpmbuild/RPMS/noarch/scylla-tools-`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-tools.noarch.rpm
+        fi
+    else
+        sudo apt-get install -y git
+        if [ ! -f dist/ami/files/scylla-server_amd64.deb ]; then
+            if [ ! -f ../scylla-server_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-ubuntu1_amd64.deb ]; then
+                echo "Build .deb before running build_ami.sh"
+                exit 1
+            fi
+            cp ../scylla-server_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-ubuntu1_amd64.deb dist/ami/files/scylla-server_amd64.deb
+        fi
+        if [ ! -f dist/ami/files/scylla-jmx_all.deb ]; then
+            cd build
+            git clone --depth 1 https://github.com/scylladb/scylla-jmx.git
+            cd scylla-jmx
+            sh -x -e dist/ubuntu/build_deb.sh $*
+            cd ../..
+            cp build/scylla-jmx_`cat build/scylla-jmx/build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/scylla-jmx/build/SCYLLA-RELEASE-FILE`-ubuntu1_all.deb dist/ami/files/scylla-jmx_all.deb
+        fi
+        if [ ! -f dist/ami/files/scylla-tools_all.deb ]; then
+            cd build
+            git clone --depth 1 https://github.com/scylladb/scylla-tools-java.git
+            cd scylla-tools-java
+            sh -x -e dist/ubuntu/build_deb.sh $*
+            cd ../..
+            cp build/scylla-tools_`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`-ubuntu1_all.deb dist/ami/files/scylla-tools_all.deb
+        fi
    fi
 fi

@@ -69,4 +115,4 @@ if [ ! -d packer ]; then
    cd -
 fi

-packer/packer build -var-file=variables.json -var install_args="$INSTALL_ARGS" scylla.json
+packer/packer build -var-file=variables.json -var install_args="$INSTALL_ARGS" -var region="$REGION" -var source_ami="$AMI" -var ssh_username="$SSH_USERNAME" scylla.json
--- a/dist/ami/files/.bash_profile
+++ b/dist/ami/files/.bash_profile
@@ -30,7 +30,21 @@ echo 'More documentation available at: '
 echo '	http://www.scylladb.com/doc/'
 echo

-if [ "`systemctl is-active scylla-server`" = "active" ]; then
+. /etc/os-release # defines $ID, used below to pick which init system to query
+if [ "$ID" = "ubuntu" ]; then
+	if [ "`initctl status scylla-server|grep "running, process"`" != "" ]; then # query the scylla-server job itself, not ssh
+		STARTED=1
+	else
+		STARTED=0
+	fi
+else
+	if [ "`systemctl is-active scylla-server`" = "active" ]; then
+		STARTED=1
+	else
+		STARTED=0
+	fi
+fi
+if [ $STARTED -eq 1 ]; then # STARTED: 1 when the scylla-server service is reported running, 0 otherwise
 	tput setaf 4
 	tput bold
 	echo "    ScyllaDB is active."
@@ -42,6 +56,13 @@ else
 	echo "    ScyllaDB is not started!"
 	tput sgr0
 	echo "Please wait for startup. To see status of ScyllaDB, run "
-	echo " 'systemctl status scylla-server'"
-	echo
+	if [ "$ID" = "ubuntu" ]; then
+		echo " 'initctl status scylla-server'"
+		echo "and"
+		echo " 'cat /var/log/upstart/scylla-server.log'"
+		echo
+	else
+		echo " 'systemctl status scylla-server'"
+		echo
+	fi
 fi
--- a/dist/ami/files/scylla-ami
+++ b/dist/ami/files/scylla-ami
--- a/dist/ami/scylla.json
+++ b/dist/ami/scylla.json
@@ -8,10 +8,10 @@
      "security_group_id": "{{user `security_group_id`}}",
      "region": "{{user `region`}}",
      "associate_public_ip_address": "{{user `associate_public_ip_address`}}",
-      "source_ami": "ami-f3102499",
+      "source_ami": "{{user `source_ami`}}",
      "user_data_file": "user_data.txt",
      "instance_type": "{{user `instance_type`}}",
-      "ssh_username": "centos",
+      "ssh_username": "{{user `ssh_username`}}",
      "ssh_timeout": "5m",
      "ami_name": "{{user `ami_prefix`}}scylla_{{isotime | clean_ami_name}}",
      "enhanced_networking": true,
@@ -62,17 +62,17 @@
    {
      "type": "file",
      "source": "files/",
-      "destination": "/home/centos/"
+      "destination": "/home/{{user `ssh_username`}}/"
    },
    {
      "type": "file",
      "source": "../../scripts/scylla_install_pkg",
-      "destination": "/home/centos/scylla_install_pkg"
+      "destination": "/home/{{user `ssh_username`}}/scylla_install_pkg"
    },
    {
      "type": "shell",
      "inline": [
-         "sudo /home/centos/scylla-ami/scylla_install_ami {{ user `install_args` }}"
+         "sudo /home/{{user `ssh_username`}}/scylla-ami/scylla_install_ami {{ user `install_args` }}"
       ]
    }
  ],
@@ -85,6 +85,8 @@
    "associate_public_ip_address": "",
    "instance_type": "",
    "install_args": "",
-    "ami_prefix": ""
+    "ami_prefix": "",
+    "source_ami": "",
+    "ssh_username": ""
  }
 }
--- a/dist/common/collectd.d/scylla.conf
+++ b/dist/common/collectd.d/scylla.conf
@@ -1,5 +1,12 @@
 LoadPlugin network
 LoadPlugin unixsock
+
+# dummy write_graphite to silence noisy warning
+LoadPlugin network
+<Plugin "network">
+        Server "127.0.0.1 65534"
+</Plugin>
+
 <Plugin network>
 	Listen "127.0.0.1" "25826"
 </Plugin>
--- a/dist/common/scripts/scylla_bootparam_setup
+++ b/dist/common/scripts/scylla_bootparam_setup
@@ -2,6 +2,25 @@
 #
 #  Copyright (C) 2015 ScyllaDB

+print_usage() {
+    echo "scylla_bootparam_setup --ami"
+    echo "  --ami				setup AMI instance"
+    exit 1
+}
+
+AMI_OPT=0
+while [ $# -gt 0 ]; do
+    case "$1" in
+        "--ami")
+            AMI_OPT=1
+            shift 1
+            ;;
+        *)
+            print_usage
+            ;;
+    esac
+done
+
 . /etc/os-release

 if [ ! -f /etc/default/grub ]; then
@@ -14,7 +33,11 @@ if [ "`grep hugepagesz /etc/default/grub`" != "" ] || [ "`grep hugepages /etc/de
    sed -e "s#hugepages=[0-9]* ##" /etc/default/grub > /tmp/grub
    mv /tmp/grub /etc/default/grub
 fi
-sed -e "s#^GRUB_CMDLINE_LINUX=\"#GRUB_CMDLINE_LINUX=\"hugepagesz=2M hugepages=$NR_HUGEPAGES #" /etc/default/grub > /tmp/grub
+if [ $AMI_OPT -eq 1 ]; then
+    sed -e "s#^GRUB_CMDLINE_LINUX=\"#GRUB_CMDLINE_LINUX=\"clocksource=tsc tsc=reliable hugepagesz=2M hugepages=$NR_HUGEPAGES #" /etc/default/grub > /tmp/grub
+else
+    sed -e "s#^GRUB_CMDLINE_LINUX=\"#GRUB_CMDLINE_LINUX=\"hugepagesz=2M hugepages=$NR_HUGEPAGES #" /etc/default/grub > /tmp/grub
+fi
 mv /tmp/grub /etc/default/grub
 if [ "$ID" = "ubuntu" ]; then
    grub-mkconfig -o /boot/grub/grub.cfg
--- a/dist/common/scripts/scylla_dev_mode_setup
+++ b/dist/common/scripts/scylla_dev_mode_setup
@@ -0,0 +1,31 @@
+#!/bin/sh -e
+#
+#  Copyright (C) 2015 ScyllaDB
+
+print_usage() { # print the accepted command line and exit with status 1
+    echo "scylla_dev_mode_setup --developer-mode [0|1]"
+    echo "  --developer-mode   enable/disable developer mode"
+    exit 1
+}
+
+DEV_MODE=
+while [ $# -gt 0 ]; do
+    case "$1" in
+        "--developer-mode")
+            DEV_MODE=$2
+            shift 2
+            ;;
+        *)
+            print_usage
+            ;;
+    esac
+done
+
+if [ "$DEV_MODE" = "" ]; then
+    print_usage
+fi
+if [ "$DEV_MODE" != "0" ] && [ "$DEV_MODE" != "1" ]; then
+    print_usage
+fi
+
+echo "DEV_MODE=--developer-mode=$DEV_MODE" > /etc/scylla.d/dev-mode.conf
--- a/dist/common/scripts/scylla_io_setup
+++ b/dist/common/scripts/scylla_io_setup
@@ -1,31 +1,53 @@
 #!/bin/sh

-is_ami() {
-    if [ "`dmidecode --string system-version | grep \.amazon`" != "" ] && \
-       [ "`curl http://169.254.169.254/latest/meta-data/ami-id | grep ami-`" != "" ]; then
-         echo 1
-    else
-         echo 0
-    fi
+print_usage() {
+    echo "scylla_io_setup --ami"
+    echo "  --ami				setup AMI instance"
+    exit 1
 }

-is_supported_instance_type() {
-    TYPE=`curl http://169.254.169.254/latest/meta-data/instance-type|cut -d . -f 1`
-    case $TYPE in
-        "m3"|"c3"|"i2") echo 1;;
-        *) echo 0;;
+AMI_OPT=0
+while [ $# -gt 0 ]; do
+    case "$1" in
+        "--ami")
+            AMI_OPT=1
+            shift 1
+            ;;
+        *)
+            print_usage
+            ;;
    esac
-}
+done
+

 is_developer_mode() {
-    echo $SCYLLA_ARGS|egrep -c "\-\-developer-mode(\s+|=)1"
+    cat /etc/scylla.d/dev-mode.conf|egrep -c "\-\-developer-mode(\s+|=)(1|true)"
 }

-if [ ! -f /etc/scylla/io_configured ] && [ `is_developer_mode` -eq 0 ]; then
-    if [ `is_ami` -eq 1 ] && [ `is_supported_instance_type` -eq 1 ]; then
-        NR_CPU=`cat /proc/cpuinfo |grep processor|wc -l`
-        NR_DISKS=`curl http://169.254.169.254/latest/meta-data/block-device-mapping/|grep ephemeral|wc -l`
+output_to_user()
+{ # Emit message $1 twice: on stdout and through logger.
+    echo "$1"
+    logger -p user.err "$1" # same text, sent via logger with priority user.err
+}

+. /etc/os-release
+if [ "$NAME" = "Ubuntu" ]; then
+   . /etc/default/scylla-server
+else
+   . /etc/sysconfig/scylla-server
+fi
+
+if [ `is_developer_mode` -eq 0 ]; then
+    SMP=`echo $SCYLLA_ARGS|grep smp|sed -e "s/^.*smp\(\s\+\|=\)\([0-9]*\).*$/\2/"`
+    CPUSET=`echo $SCYLLA_ARGS|grep cpuset|sed -e "s/^.*\(--cpuset\(\s\+\|=\)[0-9\-]*\).*$/\1/"`
+    if [ $AMI_OPT -eq 1 ]; then
+        NR_CPU=`cat /proc/cpuinfo |grep processor|wc -l`
+        NR_DISKS=`lsblk --list --nodeps --noheadings | grep -v xvda | grep xvd | wc -l`
+        TYPE=`curl http://169.254.169.254/latest/meta-data/instance-type|cut -d . -f 1`
+
+        if [ "$SMP" != "" ]; then
+            NR_CPU=$SMP
+        fi
        NR_SHARDS=$NR_CPU
        if [ $NR_CPU -ge 8 ] && [ "$SET_NIC" = "no" ]; then
            NR_SHARDS=$((NR_CPU - 1))
@@ -39,17 +61,20 @@ if [ ! -f /etc/scylla/io_configured ] && [ `is_developer_mode` -eq 0 ]; then
            NR_IO_QUEUES=$(($NR_REQS / 4))
        fi

+        NR_IO_QUEUES=$((NR_IO_QUEUES>NR_SHARDS?NR_SHARDS:NR_IO_QUEUES))
        NR_REQS=$(($(($NR_REQS / $NR_IO_QUEUES)) * $NR_IO_QUEUES))
+        if [ "$TYPE" = "i2" ]; then
+            NR_REQS=$(($NR_REQS * 2))
+        fi

        echo "SEASTAR_IO=\"--num-io-queues $NR_IO_QUEUES --max-io-requests $NR_REQS\"" > /etc/scylla.d/io.conf
    else
-        iotune --evaluation-directory /var/lib/scylla --format envfile --options-file /etc/scylla.d/io.conf
+        iotune --evaluation-directory /var/lib/scylla --format envfile --options-file /etc/scylla.d/io.conf $CPUSET
        if [ $? -ne 0 ]; then
-            logger -p user.err "/var/lib/scylla did not pass validation tests, it may not be on XFS and/or has limited disk space."
-            logger -p user.err "This is a non-supported setup, and performance is expected to be very bad."
-            logger -p user.err "For better performance, placing your data on XFS-formatted directories is required."
-            logger -p user.err " To override this error, see the developer_mode configuration option."
+            output_to_user "/var/lib/scylla did not pass validation tests, it may not be on XFS and/or has limited disk space."
+            output_to_user "This is a non-supported setup, and performance is expected to be very bad."
+            output_to_user "For better performance, placing your data on XFS-formatted directories is required."
+            output_to_user " To override this error, see the developer_mode configuration option."
        fi
    fi
-    touch /etc/scylla/io_configured
 fi
--- a/dist/common/scripts/scylla_raid_setup
+++ b/dist/common/scripts/scylla_raid_setup
@@ -49,7 +49,7 @@ fi

 . /etc/os-release
 if [ "$NAME" = "Ubuntu" ]; then
-    apt-get -y install mdadm xfsprogs
+    env DEBIAN_FRONTEND=noninteractive apt-get -y install mdadm xfsprogs
 else
    yum -y install mdadm xfsprogs
 fi
--- a/dist/common/scripts/scylla_setup
+++ b/dist/common/scripts/scylla_setup
@@ -8,11 +8,12 @@ if [ "`id -u`" -ne 0 ]; then
 fi

 print_usage() {
-    echo "scylla_setup --disks /dev/hda,/dev/hdb... --nic eth0 --ntp-domain centos --ami --no-enable-service --no-selinux-setup --no-bootparam-setup --no-ntp-setup --no-raid-setup --no-coredump-setup --no-sysconfig-setup"
+    echo "scylla_setup --disks /dev/hda,/dev/hdb... --nic eth0 --ntp-domain centos --ami --developer-mode --no-enable-service --no-selinux-setup --no-bootparam-setup --no-ntp-setup --no-raid-setup --no-coredump-setup --no-sysconfig-setup"
    echo "  --disks			specify disks for RAID"
    echo "  --nic				specify NIC"
    echo "  --ntp-domain			specify NTP domain"
    echo "  --ami				setup AMI instance"
+    echo "  --developer-mode			enable developer mode"
    echo "  --no-enable-service		skip enabling service"
    echo "  --no-selinux-setup		skip selinux setup"
    echo "  --no-bootparam-setup		skip bootparam setup"
@@ -20,6 +21,7 @@ print_usage() {
    echo "  --no-raid-setup		skip raid setup"
    echo "  --no-coredump-setup		skip coredump setup"
    echo "  --no-sysconfig-setup		skip sysconfig setup"
+    echo "  --no-io-setup		skip IO configuration setup"
    exit 1
 }

@@ -40,6 +42,7 @@ interactive_ask_service() {
 }

 AMI=0
+DEV_MODE=0
 ENABLE_SERVICE=1
 SELINUX_SETUP=1
 BOOTPARAM_SETUP=1
@@ -47,6 +50,7 @@ NTP_SETUP=1
 RAID_SETUP=1
 COREDUMP_SETUP=1
 SYSCONFIG_SETUP=1
+IO_SETUP=1

 if [ $# -ne 0 ]; then
    INTERACTIVE=0
@@ -72,6 +76,10 @@ while [ $# -gt 0 ]; do
            AMI=1
            shift 1
            ;;
+        "--developer-mode")
+            DEV_MODE=1
+            shift 1
+            ;;
        "--no-enable-service")
            ENABLE_SERVICE=0
            shift 1
@@ -100,6 +108,10 @@ while [ $# -gt 0 ]; do
            SYSCONFIG_SETUP=0
            shift 1
            ;;
+        "--no-io-setup")
+            IO_SETUP=0
+            shift 1
+            ;;
        "-h" | "--help")
            print_usage
            shift 1
@@ -122,9 +134,9 @@ if [ $INTERACTIVE -eq 1 ]; then
 fi
 if [ $ENABLE_SERVICE -eq 1 ]; then
    if [ "$ID" = "fedora" ] || [ "$ID" = "centos" ]; then
-        systemctl enable scylla-io-setup.service
        systemctl enable scylla-server.service
        systemctl enable scylla-jmx.service
+        systemctl enable collectd.service
    fi
 fi

@@ -162,21 +174,21 @@ if [ $INTERACTIVE -eq 1 ]; then
    if [ $RAID_SETUP -eq 1 ]; then
        echo "Please select disks from following list: "
        while true; do
-            lsblk -d -i -n -p -r|awk '{print $1}'|sed -e ':loop;N;$!b loop;s/\n/ /g'
+            lsblk -d -i -n -r|awk '{print $1}'|sed -e ':loop;N;$!b loop;s/\n/ /g'
            echo "type 'done' to finish selection. selected: $DISKS"
            echo -n "> "
            read dsk
            if [ "$dsk" = "done" ]; then
                break
            fi
-            if [ -e $dsk ]; then
+            if [ -e /dev/$dsk ]; then
                if [ "$DISKS" = "" ]; then
-                    DISKS=$dsk
+                    DISKS=/dev/$dsk
                else
-                    DISKS="$DISKS,$dsk"
+                    DISKS="$DISKS,/dev/$dsk"
                fi
            else
-                echo "$dsk not found"
+                echo "/dev/$dsk not found"
            fi
        done
    fi
@@ -212,6 +224,18 @@ if [ $INTERACTIVE -eq 1 ]; then
        done
    fi
 fi
+
+if [ $INTERACTIVE -eq 1 ]; then
+    interactive_ask_service "Do you want to setup IO configuration?" &&:
+    IO_SETUP=$?
+fi
+if [ $IO_SETUP -eq 1 ]; then
+    /usr/lib/scylla/scylla_io_setup
+fi
+
 if [ $SYSCONFIG_SETUP -eq 1 ]; then
    /usr/lib/scylla/scylla_sysconfig_setup --nic $NIC
 fi
+if [ $DEV_MODE -eq 1 ]; then
+    /usr/lib/scylla/scylla_dev_mode_setup --developer-mode 1
+fi
--- a/dist/common/scripts/scylla_sysconfig_setup
+++ b/dist/common/scripts/scylla_sysconfig_setup
@@ -76,7 +76,7 @@ echo Setting parameters on $SYSCONFIG/scylla-server
 ETHDRV=`/usr/lib/scylla/dpdk_nic_bind.py --status | grep if=$NIC | sed -e "s/^.*drv=//" -e "s/ .*$//"`
 ETHPCIID=`/usr/lib/scylla/dpdk_nic_bind.py --status | grep if=$NIC | awk '{print $1}'`
 NR_CPU=`cat /proc/cpuinfo |grep processor|wc -l`
-if [ $NR_CPU -ge 8 ] && [ "$SET_NIC" = "no" ]; then
+if [ "$AMI" = "yes" ] && [ $NR_CPU -ge 8 ] && [ "$SET_NIC" = "no" ]; then
    NR=$((NR_CPU - 1))
    SET_NIC="yes"
    SCYLLA_ARGS="$SCYLLA_ARGS --cpuset 1-$NR  --smp $NR"
--- a/dist/common/scylla.d/dev-mode.conf
+++ b/dist/common/scylla.d/dev-mode.conf
@@ -0,0 +1,4 @@
+# DO NOT EDIT
+# This file should be automatically configured by scylla_dev_mode_setup
+#
+# DEV_MODE=--developer-mode=0
--- a/dist/common/scylla.d/io.conf
+++ b/dist/common/scylla.d/io.conf
@@ -1,4 +1,4 @@
 # DO NO EDIT
-# This file should be automatically configure by scylla-io-setup.service
+# This file should be automatically configured by scylla_io_setup
 #
 # SEASTAR_IO="--max-io-requests=1 --num-io-queues=1"
--- a/dist/common/sudoers.d/scylla
+++ b/dist/common/sudoers.d/scylla
@@ -1 +1 @@
-scylla ALL=(ALL) NOPASSWD:SETENV: /usr/lib/scylla/scylla_prepare,/usr/lib/scylla/scylla_stop,/usr/lib/scylla/scylla_io_setup
+scylla ALL=(ALL) NOPASSWD:SETENV: /usr/lib/scylla/scylla_prepare,/usr/lib/scylla/scylla_stop,/usr/lib/scylla/scylla_io_setup,/usr/lib/scylla/scylla-ami/scylla_ami_setup
--- a/dist/docker/Dockerfile
+++ b/dist/docker/Dockerfile
@@ -4,6 +4,7 @@ MAINTAINER Avi Kivity <avi@cloudius-systems.com>

 RUN yum -y install epel-release
 ADD scylla.repo /etc/yum.repos.d/
+RUN yum -y clean expire-cache
 RUN yum -y update
 RUN yum -y remove boost-thread boost-system
 RUN yum -y install scylla-server hostname
--- a/dist/redhat/scylla-server.spec.in
+++ b/dist/redhat/scylla-server.spec.in
@@ -113,11 +113,9 @@ if [ -f /etc/systemd/coredump.conf ];then
    /usr/lib/scylla/scylla_coredump_setup
 fi
 %systemd_post scylla-server.service
-%systemd_post scylla-io-setup.service

 %preun
 %systemd_preun scylla-server.service
-%systemd_preun scylla-io-setup.service

 %postun
 %systemd_postun
@@ -151,7 +149,6 @@ rm -rf $RPM_BUILD_ROOT
 %{_docdir}/scylla/ORIGIN
 %{_docdir}/scylla/licenses/
 %{_unitdir}/scylla-server.service
-%{_unitdir}/scylla-io-setup.service
 %{_bindir}/scylla
 %{_bindir}/iotune
 %{_bindir}/scyllatop
@@ -165,6 +162,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_prefix}/lib/scylla/scylla_ntp_setup
 %{_prefix}/lib/scylla/scylla_selinux_setup
 %{_prefix}/lib/scylla/scylla_io_setup
+%{_prefix}/lib/scylla/scylla_dev_mode_setup
 %{_prefix}/lib/scylla/posix_net_conf.sh
 %{_prefix}/lib/scylla/dpdk_nic_bind.py
 %{_prefix}/lib/scylla/dpdk_nic_bind.pyc
--- a/dist/redhat/systemd/scylla-io-setup.service
+++ b/dist/redhat/systemd/scylla-io-setup.service
@@ -1,10 +0,0 @@
-[Unit]
-Description=Scylla IO Setup
-After=network.target
-
-[Service]
-Type=oneshot
-EnvironmentFile=/etc/sysconfig/scylla-server
-ExecStart=/usr/lib/scylla/scylla_io_setup
-RemainAfterExit=yes
-TimeoutStartSec=1800
--- a/dist/redhat/systemd/scylla-server.service
+++ b/dist/redhat/systemd/scylla-server.service
@@ -1,7 +1,5 @@
 [Unit]
 Description=Scylla Server
-After=scylla-io-setup.service
-Requires=scylla-io-setup.service

 [Service]
 Type=notify
@@ -14,7 +12,7 @@ Environment="HOME=/var/lib/scylla"
 EnvironmentFile=/etc/sysconfig/scylla-server
 EnvironmentFile=/etc/scylla.d/*.conf
 ExecStartPre=/usr/bin/sudo -E /usr/lib/scylla/scylla_prepare
-ExecStart=/usr/bin/scylla $SCYLLA_ARGS $SEASTAR_IO
+ExecStart=/usr/bin/scylla $SCYLLA_ARGS $SEASTAR_IO $DEV_MODE
 ExecStopPost=/usr/bin/sudo -E /usr/lib/scylla/scylla_stop
 TimeoutStartSec=900
 KillMode=process
--- a/dist/ubuntu/build_deb.sh
+++ b/dist/ubuntu/build_deb.sh
@@ -32,7 +32,7 @@ if [ `grep -c $RELEASE dist/ubuntu/supported_release` -lt 1 ]; then
 fi

 VERSION=$(./SCYLLA-VERSION-GEN)
-SCYLLA_VERSION=$(cat build/SCYLLA-VERSION-FILE)
+SCYLLA_VERSION=$(cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/')
 SCYLLA_RELEASE=$(cat build/SCYLLA-RELEASE-FILE)
 echo $VERSION > version
 ./scripts/git-archive-all --extra version --force-submodules --prefix scylla-server ../scylla-server_$SCYLLA_VERSION-$SCYLLA_RELEASE.orig.tar.gz 
--- a/dist/ubuntu/debian/scylla-server.init
+++ b/dist/ubuntu/debian/scylla-server.init
@@ -37,8 +37,10 @@ eval "`grep -v -e "^\s*#" -e "^$" /etc/scylla.d/*.conf|sed -e 's/^/export /'`"

 do_start()
 {
+	if [ "$AMI" = "yes" ]; then
+		/usr/lib/scylla/scylla-ami/scylla_ami_setup
+ 	fi
 	/usr/lib/scylla/scylla_prepare	
-        /usr/lib/scylla/scylla_io_setup
 	# Return
 	#   0 if daemon has been started
 	#   1 if daemon was already running
--- a/dist/ubuntu/debian/scylla-server.upstart
+++ b/dist/ubuntu/debian/scylla-server.upstart
@@ -26,19 +26,30 @@ env HOME=/var/lib/scylla

 pre-start script
    eval "`grep -v -e "^\s*#" -e "^$" /etc/default/scylla-server|sed -e 's/^/export /'`"
-    eval "`grep -v -e "^\s*#" -e "^$" /etc/scylla.d/*.conf|sed -e 's/^/export /'`"
+    . /etc/scylla.d/dev-mode.conf
+    . /etc/scylla.d/io.conf
+    export DEV_MODE
+    export SEASTAR_IO
+    if [ "$AMI" = "yes" ]; then
+        sudo /usr/lib/scylla/scylla-ami/scylla_ami_setup
+    fi
    sudo /usr/lib/scylla/scylla_prepare
-    sudo /usr/lib/scylla/scylla_io_setup
 end script

 script
    eval "`grep -v -e "^\s*#" -e "^$" /etc/default/scylla-server|sed -e 's/^/export /'`"
-    eval "`grep -v -e "^\s*#" -e "^$" /etc/scylla.d/*.conf|sed -e 's/^/export /'`"
-    exec /usr/bin/scylla $SCYLLA_ARGS $SEASTAR_IO
+    . /etc/scylla.d/dev-mode.conf
+    . /etc/scylla.d/io.conf
+    export DEV_MODE
+    export SEASTAR_IO
+    exec /usr/bin/scylla $SCYLLA_ARGS $SEASTAR_IO $DEV_MODE
 end script

 post-stop script
    eval "`grep -v -e "^\s*#" -e "^$" /etc/default/scylla-server|sed -e 's/^/export /'`"
-    eval "`grep -v -e "^\s*#" -e "^$" /etc/scylla.d/*.conf|sed -e 's/^/export /'`"
+    . /etc/scylla.d/dev-mode.conf
+    . /etc/scylla.d/io.conf
+    export DEV_MODE
+    export SEASTAR_IO
    sudo /usr/lib/scylla/scylla_stop
 end script
--- a/dist/ubuntu/rules.in
+++ b/dist/ubuntu/rules.in
@@ -35,7 +35,7 @@ override_dh_auto_install:
 	cp $(CURDIR)/dist/common/collectd.d/scylla.conf $(COLLECTD)

 	mkdir -p $(SCYLLAD) && \
-	cp $(CURDIR)/dist/common/scylla.d/io.conf $(SCYLLAD)
+	cp $(CURDIR)/dist/common/scylla.d/*.conf $(SCYLLAD)

 	mkdir -p $(CONF) && \
 	cp $(CURDIR)/conf/scylla.yaml $(CONF)
@@ -72,6 +72,9 @@ override_dh_auto_install:
 	mkdir -p $(CURDIR)/debian/scylla-server/var/lib/scylla/commitlog
 	mkdir -p $(CURDIR)/debian/scylla-server/var/lib/scylla/coredump

+override_dh_installinit:
+	dh_installinit --no-start
+
 override_dh_strip:
 	dh_strip --dbg-package=scylla-server-dbg
 %:
--- a/gms/application_state.cc
+++ b/gms/application_state.cc
@@ -62,7 +62,12 @@ static const std::map<application_state, sstring> application_state_names = {
 };

 std::ostream& operator<<(std::ostream& os, const application_state& m) {
-    os << application_state_names.at(m);
+    auto it = application_state_names.find(m); // single lookup; a missing entry must not throw
+    if (it != application_state_names.end()) {
+        os << it->second; // reuse the iterator instead of a second lookup through at()
+    } else {
+        os << "UNKNOWN"; // no name registered for this state value
+    }
     return os;
 }

--- a/idl/gossip_digest.idl.hh
+++ b/idl/gossip_digest.idl.hh
@@ -20,7 +20,8 @@
 */

 namespace gms {
-enum class application_state:int {STATUS = 0,
+enum class application_state:int {
+        STATUS = 0,
        LOAD,
        SCHEMA,
        DC,
@@ -29,6 +30,7 @@ enum class application_state:int {STATUS = 0,
        REMOVAL_COORDINATOR,
        INTERNAL_IP,
        RPC_ADDRESS,
+        X_11_PADDING,
        SEVERITY,
        NET_VERSION,
        HOST_ID,
--- a/main.cc
+++ b/main.cc
@@ -293,9 +293,19 @@ int main(int ac, char** av) {
            sstring broadcast_rpc_address = cfg->broadcast_rpc_address();

            if (!broadcast_address.empty()) {
-                utils::fb_utilities::set_broadcast_address(broadcast_address);
+                try {
+                    utils::fb_utilities::set_broadcast_address(broadcast_address);
+                } catch (...) {
+                    startlog.error("Bad configuration: invalid 'broadcast_address': {}: {}", broadcast_address, std::current_exception());
+                    throw bad_configuration_error();
+                }
            } else if (!listen_address.empty()) {
-                utils::fb_utilities::set_broadcast_address(listen_address);
+                try {
+                    utils::fb_utilities::set_broadcast_address(listen_address);
+                } catch (...) {
+                    startlog.error("Bad configuration: invalid 'listen_address': {}: {}", listen_address, std::current_exception());
+                    throw bad_configuration_error();
+                }
            } else {
                startlog.error("Bad configuration: neither listen_address nor broadcast_address are defined\n");
                throw bad_configuration_error();
@@ -352,11 +362,14 @@ int main(int ac, char** av) {
            print("Scylla API server listening on %s:%s ...\n", api_address, api_port);
            supervisor_notify("initializing storage service");
            init_storage_service(db).get();
-            api::set_server_storage_service(ctx).get();
            supervisor_notify("starting per-shard database core");
            // Note: changed from using a move here, because we want the config object intact.
            db.start(std::ref(*cfg)).get();
            engine().at_exit([&db] {
+                // A shared sstable must be compacted by all shards before it can be deleted.
+                // Since we're stopping, that's not going to happen.  Cancel those pending
+                // deletions to let anyone waiting on them continue.
+                sstables::cancel_atomic_deletions();
                // #293 - do not stop anything - not even db (for real)
                //return db.stop();
                // call stop on each db instance, but leave the shareded<database> pointers alive.
@@ -422,14 +435,11 @@ int main(int ac, char** av) {
                    , seed_provider
                    , cluster_name
                    , phi).get();
-            api::set_server_gossip(ctx).get();
            supervisor_notify("starting messaging service");
-            api::set_server_messaging_service(ctx).get();
            supervisor_notify("starting storage proxy");
            proxy.start(std::ref(db)).get();
            // #293 - do not stop anything
            // engine().at_exit([&proxy] { return proxy.stop(); });
-            api::set_server_storage_proxy(ctx).get();
            supervisor_notify("starting migration manager");
            mm.start().get();
            // #293 - do not stop anything
@@ -458,7 +468,6 @@ int main(int ac, char** av) {
                }
                return db.load_sstables(proxy);
            }).get();
-            api::set_server_load_sstable(ctx).get();
            supervisor_notify("setting up system keyspace");
            db::system_keyspace::setup(db, qp).get();
            supervisor_notify("starting commit log");
@@ -479,6 +488,11 @@ int main(int ac, char** av) {
                    }
                }
            }
+            api::set_server_storage_service(ctx).get();
+            api::set_server_gossip(ctx).get();
+            api::set_server_messaging_service(ctx).get();
+            api::set_server_storage_proxy(ctx).get();
+            api::set_server_load_sstable(ctx).get();
            supervisor_notify("initializing migration manager RPC verbs");
            service::get_migration_manager().invoke_on_all([] (auto& mm) {
                mm.init_messaging_service();
--- a/message/messaging_service.cc
+++ b/message/messaging_service.cc
@@ -360,6 +360,7 @@ void messaging_service::cache_preferred_ip(gms::inet_address ep, gms::inet_addre
 }

 shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::get_rpc_client(messaging_verb verb, msg_addr id) {
+    assert(!_stopping);
    auto idx = get_rpc_client_idx(verb);
    auto it = _clients[idx].find(id);

@@ -409,6 +410,13 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
 }

 void messaging_service::remove_rpc_client_one(clients_map& clients, msg_addr id, bool dead_only) {
+    if (_stopping) {
+        // If the messaging service is in the process of being stopped, there is no
+        // need to stop and remove connections here: they are already being stopped
+        // and we would just interfere.
+        return;
+    }
+
    auto it = clients.find(id);
    if (it != clients.end() && (!dead_only || it->second.rpc_client->error())) {
        auto client = std::move(it->second.rpc_client);
@@ -442,8 +450,12 @@ std::unique_ptr<messaging_service::rpc_protocol_wrapper>& messaging_service::rpc
 // Send a message for verb
 template <typename MsgIn, typename... MsgOut>
 auto send_message(messaging_service* ms, messaging_verb verb, msg_addr id, MsgOut&&... msg) {
-    auto rpc_client_ptr = ms->get_rpc_client(verb, id);
    auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
+    if (ms->is_stopping()) {
+        using futurator = futurize<std::result_of_t<decltype(rpc_handler)(rpc_protocol::client&, MsgOut...)>>;
+        return futurator::make_exception_future(rpc::closed_error());
+    }
+    auto rpc_client_ptr = ms->get_rpc_client(verb, id);
    auto& rpc_client = *rpc_client_ptr;
    return rpc_handler(rpc_client, std::forward<MsgOut>(msg)...).then_wrapped([ms = ms->shared_from_this(), id, verb, rpc_client_ptr = std::move(rpc_client_ptr)] (auto&& f) {
        try {
@@ -467,8 +479,12 @@ auto send_message(messaging_service* ms, messaging_verb verb, msg_addr id, MsgOu
 // TODO: Remove duplicated code in send_message
 template <typename MsgIn, typename Timeout, typename... MsgOut>
 auto send_message_timeout(messaging_service* ms, messaging_verb verb, msg_addr id, Timeout timeout, MsgOut&&... msg) {
-    auto rpc_client_ptr = ms->get_rpc_client(verb, id);
    auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
+    if (ms->is_stopping()) {
+        using futurator = futurize<std::result_of_t<decltype(rpc_handler)(rpc_protocol::client&, MsgOut...)>>;
+        return futurator::make_exception_future(rpc::closed_error());
+    }
+    auto rpc_client_ptr = ms->get_rpc_client(verb, id);
    auto& rpc_client = *rpc_client_ptr;
    return rpc_handler(rpc_client, timeout, std::forward<MsgOut>(msg)...).then_wrapped([ms = ms->shared_from_this(), id, verb, rpc_client_ptr = std::move(rpc_client_ptr)] (auto&& f) {
        try {
@@ -534,7 +550,7 @@ auto send_message_timeout_and_retry(messaging_service* ms, messaging_verb verb,
                    throw;
                }
            });
-        }).then([] (MsgInTuple result) {
+        }).then([ms = ms->shared_from_this()] (MsgInTuple result) {
            return futurize<MsgIn>::from_tuple(std::move(result));
        });
    });
--- a/mutation.cc
+++ b/mutation.cc
@@ -126,16 +126,37 @@ bool mutation::operator!=(const mutation& m) const {
    return !(*this == m);
 }

+void
+mutation::query(query::result::builder& builder,
+    const query::partition_slice& slice,
+    gc_clock::time_point now,
+    uint32_t row_limit) &&
+{
+    auto pb = builder.add_partition(*schema(), key());
+    auto is_reversed = slice.options.contains<query::partition_slice::option::reversed>();
+    mutation_partition& p = partition();
+    p.compact_for_query(*schema(), now, slice.row_ranges(*schema(), key()), is_reversed, row_limit);
+    p.query_compacted(pb, *schema(), row_limit);
+}
+
 query::result
-mutation::query(const query::partition_slice& slice, query::result_request request,
-    gc_clock::time_point now, uint32_t row_limit) const
+mutation::query(const query::partition_slice& slice,
+    query::result_request request,
+    gc_clock::time_point now, uint32_t row_limit) &&
 {
    query::result::builder builder(slice, request);
-    auto pb = builder.add_partition(*schema(), key());
-    partition().query(pb, *schema(), now, row_limit);
+    std::move(*this).query(builder, slice, now, row_limit);
    return builder.build();
 }

+query::result
+mutation::query(const query::partition_slice& slice,
+    query::result_request request,
+    gc_clock::time_point now, uint32_t row_limit) const&
+{
+    return mutation(*this).query(slice, request, now, row_limit);
+}
+
 size_t
 mutation::live_row_count(gc_clock::time_point query_time) const {
    return partition().live_row_count(*schema(), query_time);
@@ -186,3 +207,7 @@ void mutation::apply(mutation&& m) {
 void mutation::apply(const mutation& m) {
    partition().apply(*schema(), m.partition(), *m.schema());
 }
+
+mutation& mutation::operator=(const mutation& m) {
+    return *this = mutation(m);
+}
--- a/mutation.hh
+++ b/mutation.hh
@@ -60,9 +60,9 @@ public:
    mutation(const mutation& m)
        : _ptr(std::make_unique<data>(schema_ptr(m.schema()), dht::decorated_key(m.decorated_key()), m.partition()))
    { }
-
    mutation(mutation&&) = default;
    mutation& operator=(mutation&& x) = default;
+    mutation& operator=(const mutation& m);

    void set_static_cell(const column_definition& def, atomic_cell_or_collection&& value);
    void set_static_cell(const bytes& name, const data_value& value, api::timestamp_type timestamp, ttl_opt ttl = {});
@@ -104,8 +104,23 @@ public:
    bool operator!=(const mutation&) const;
 public:
    // The supplied partition_slice must be governed by this mutation's schema
-    query::result query(const query::partition_slice&, query::result_request request = query::result_request::only_result,
-        gc_clock::time_point now = gc_clock::now(), uint32_t row_limit = query::max_rows) const;
+    query::result query(const query::partition_slice&,
+        query::result_request request = query::result_request::only_result,
+        gc_clock::time_point now = gc_clock::now(),
+        uint32_t row_limit = query::max_rows) &&;
+
+    // The supplied partition_slice must be governed by this mutation's schema
+    // FIXME: Slower than the r-value version
+    query::result query(const query::partition_slice&,
+        query::result_request request = query::result_request::only_result,
+        gc_clock::time_point now = gc_clock::now(),
+        uint32_t row_limit = query::max_rows) const&;
+
+    // The supplied partition_slice must be governed by this mutation's schema
+    void query(query::result::builder& builder,
+        const query::partition_slice& slice,
+        gc_clock::time_point now = gc_clock::now(),
+        uint32_t row_limit = query::max_rows) &&;

    // See mutation_partition::live_row_count()
    size_t live_row_count(gc_clock::time_point query_time = gc_clock::time_point::min()) const;
--- a/mutation_partition.cc
+++ b/mutation_partition.cc
@@ -20,12 +20,14 @@
 */

 #include <boost/range/adaptor/reversed.hpp>
+#include <seastar/util/defer.hh>
 #include "mutation_partition.hh"
 #include "mutation_partition_applier.hh"
 #include "converting_mutation_partition_applier.hh"
 #include "partition_builder.hh"
 #include "query-result-writer.hh"
 #include "atomic_cell_hash.hh"
+#include "reversibly_mergeable.hh"

 template<bool reversed>
 struct reversal_traits;
@@ -57,6 +59,11 @@ struct reversal_traits<false> {
    {
        return r;
    }
+
+    template <typename Container>
+    static typename Container::iterator maybe_reverse(Container&, typename Container::iterator r) {
+        return r;
+    }
 };

 template<>
@@ -89,8 +96,116 @@ struct reversal_traits<true> {
        using reverse_iterator = typename Container::reverse_iterator;
        return boost::make_iterator_range(reverse_iterator(r.end()), reverse_iterator(r.begin()));
    }
+
+    template <typename Container>
+    static typename Container::reverse_iterator maybe_reverse(Container&, typename Container::iterator r) {
+        return typename Container::reverse_iterator(r);
+    }
 };

+
+//
+// apply_reversibly_intrusive_set() and revert_intrusive_set() implement ReversiblyMergeable
+// for a boost::intrusive_set<> container of ReversiblyMergeable entries.
+//
+// See reversibly_mergeable.hh
+//
+// Requirements:
+//  - entry has distinct key and value states
+//  - entries are ordered only by key in the container
+//  - entry can have an empty value
+//  - presence of an entry with an empty value doesn't affect equality of the containers
+//  - E::empty() returns true iff the value is empty
+//  - E(e.key()) creates an entry with empty value but the same key as that of e.
+//
+// Implementation of ReversiblyMergeable for the entry's value is provided via Apply and Revert functors.
+//
+// ReversiblyMergeable is constructed assuming the following properties of the 'apply' operation
+// on containers:
+//
+//  apply([{k1, v1}], [{k1, v2}]) = [{k1, apply(v1, v2)}]
+//  apply([{k1, v1}], [{k2, v2}]) = [{k1, v1}, {k2, v2}]
+//
+
+// Reverts apply_reversibly_intrusive_set() for the src entries in [start, end).
+template<typename Container, typename Revert = default_reverter<typename Container::value_type>>
+void revert_intrusive_set_range(Container& dst, Container& src,
+    typename Container::iterator start,
+    typename Container::iterator end,
+    Revert&& revert = Revert()) noexcept
+{
+    using value_type = typename Container::value_type;
+    auto deleter = current_deleter<value_type>();
+    while (start != end) {
+        auto& e = *start;
+        // find() can allocate if linearization is required, but that should already have
+        // been performed by the lower_bound() invocation in apply_reversibly_intrusive_set() and
+        // stored in the linearization context.
+        auto i = dst.find(e);
+        assert(i != dst.end());
+        value_type& dst_e = *i;
+
+        if (e.empty()) { // neutral placeholder: the dst entry was moved in from src, so move it back
+            dst.erase(i);
+            start = src.erase_and_dispose(start, deleter);
+            start = src.insert_before(start, dst_e);
+        } else {
+            revert(dst_e, e);
+        }
+
+        ++start;
+    }
+}
+
+template<typename Container, typename Revert = default_reverter<typename Container::value_type>>
+void revert_intrusive_set(Container& dst, Container& src, Revert&& revert = Revert()) noexcept {
+    revert_intrusive_set_range(dst, src, src.begin(), src.end(), std::forward<Revert>(revert));
+}
+
+// Applies src onto dst. See comment above revert_intrusive_set_range() for more details.
+//
+// Returns an object which, upon going out of scope and unless cancel() is called on it,
+// reverts the application by calling revert_intrusive_set(). The references to containers
+// must be stable as long as the returned object is live.
+template<typename Container,
+        typename Apply = default_reversible_applier<typename Container::value_type>,
+        typename Revert = default_reverter<typename Container::value_type>>
+auto apply_reversibly_intrusive_set(Container& dst, Container& src, Apply&& apply = Apply(), Revert&& revert = Revert()) {
+    using value_type = typename Container::value_type;
+    auto src_i = src.begin();
+    try {
+        while (src_i != src.end()) {
+            value_type& src_e = *src_i;
+
+            // neutral entries will be given special meaning for the purpose of revert, so
+            // get rid of empty rows from the input as if they were not there. This doesn't change
+            // the value of src.
+            if (src_e.empty()) {
+                src_i = src.erase_and_dispose(src_i, current_deleter<value_type>());
+                continue;
+            }
+
+            auto i = dst.lower_bound(src_e);
+            if (i == dst.end() || dst.key_comp()(src_e, *i)) {
+                // Construct neutral entry which will represent missing dst entry for revert.
+                value_type* empty_e = current_allocator().construct<value_type>(src_e.key());
+                [&] () noexcept {
+                    src_i = src.erase(src_i);
+                    src_i = src.insert_before(src_i, *empty_e);
+                    dst.insert_before(i, src_e);
+                }();
+            } else {
+                apply(*i, src_e);
+            }
+            ++src_i;
+        }
+        return defer([&dst, &src, revert] { revert_intrusive_set(dst, src, revert); });
+    } catch (...) {
+        revert_intrusive_set_range(dst, src, src.begin(), src_i, revert);
+        throw;
+    }
+}
+
 mutation_partition::mutation_partition(const mutation_partition& x)
        : _tombstone(x._tombstone)
        , _static_row(x._static_row)
@@ -134,29 +249,12 @@ mutation_partition::apply(const schema& s, const mutation_partition& p, const sc
    if (s.version() != p_schema.version()) {
        auto p2 = p;
        p2.upgrade(p_schema, s);
-        apply(s, std::move(p2), s);
+        apply(s, std::move(p2));
        return;
    }

-    _tombstone.apply(p._tombstone);
-
-    for (auto&& e : p._row_tombstones) {
-        apply_row_tombstone(s, e.prefix(), e.t());
-    }
-
-    _static_row.merge(s, column_kind::static_column, p._static_row);
-
-    for (auto&& entry : p._rows) {
-        auto i = _rows.find(entry);
-        if (i == _rows.end()) {
-            auto e = current_allocator().construct<rows_entry>(entry);
-            _rows.insert(i, *e);
-        } else {
-            i->row().apply(entry.row().deleted_at());
-            i->row().apply(entry.row().marker());
-            i->row().cells().merge(s, column_kind::regular_column, entry.row().cells());
-        }
-    }
+    mutation_partition tmp(p);
+    apply(s, std::move(tmp));
 }

 void
@@ -167,42 +265,42 @@ mutation_partition::apply(const schema& s, mutation_partition&& p, const schema&
        return;
    }

-    _tombstone.apply(p._tombstone);
+    apply(s, std::move(p));
+}

-    p._row_tombstones.clear_and_dispose([this, &s] (row_tombstones_entry* e) {
-        apply_row_tombstone(s, e);
+void
+mutation_partition::apply(const schema& s, mutation_partition&& p) {
+    auto revert_row_tombstones = apply_reversibly_intrusive_set(_row_tombstones, p._row_tombstones);
+
+    _static_row.apply_reversibly(s, column_kind::static_column, p._static_row);
+    auto revert_static_row = defer([&] {
+        _static_row.revert(s, column_kind::static_column, p._static_row);
    });

-    _static_row.merge(s, column_kind::static_column, std::move(p._static_row));
+    auto revert_rows = apply_reversibly_intrusive_set(_rows, p._rows,
+        [&s] (rows_entry& dst, rows_entry& src) { dst.apply_reversibly(s, src); },
+        [&s] (rows_entry& dst, rows_entry& src) noexcept { dst.revert(s, src); });

-    auto p_i = p._rows.begin();
-    auto p_end = p._rows.end();
-    while (p_i != p_end) {
-        rows_entry& entry = *p_i;
-        auto i = _rows.find(entry);
-        if (i == _rows.end()) {
-            p_i = p._rows.erase(p_i);
-            _rows.insert(i, entry);
-        } else {
-            i->row().apply(entry.row().deleted_at());
-            i->row().apply(entry.row().marker());
-            i->row().cells().merge(s, column_kind::regular_column, std::move(entry.row().cells()));
-            p_i = p._rows.erase_and_dispose(p_i, current_deleter<rows_entry>());
-        }
-    }
+    _tombstone.apply(p._tombstone); // noexcept
+
+    revert_rows.cancel();
+    revert_row_tombstones.cancel();
+    revert_static_row.cancel();
 }

 void
 mutation_partition::apply(const schema& s, mutation_partition_view p, const schema& p_schema) {
    if (p_schema.version() == s.version()) {
-        mutation_partition_applier applier(s, *this);
-        p.accept(s, applier);
+        mutation_partition p2(*this, copy_comparators_only{});
+        partition_builder b(s, p2);
+        p.accept(s, b);
+        apply(s, std::move(p2));
    } else {
        mutation_partition p2(*this, copy_comparators_only{});
        partition_builder b(p_schema, p2);
        p.accept(p_schema, b);
        p2.upgrade(p_schema, s);
-        apply(s, std::move(p2), s);
+        apply(s, std::move(p2));
    }
 }

@@ -350,16 +448,25 @@ mutation_partition::clustered_row(const schema& s, const clustering_key_view& ke
    return i->row();
 }

-boost::iterator_range<mutation_partition::rows_type::const_iterator>
-mutation_partition::range(const schema& schema, const query::range<clustering_key_prefix>& r) const {
+mutation_partition::rows_type::const_iterator
+mutation_partition::lower_bound(const schema& schema, const query::range<clustering_key_prefix>& r) const {
    auto cmp = rows_entry::key_comparator(clustering_key_prefix::prefix_equality_less_compare(schema));
-    auto i1 = r.start() ? (r.start()->is_inclusive()
+    return r.start() ? (r.start()->is_inclusive()
            ? _rows.lower_bound(r.start()->value(), cmp)
            : _rows.upper_bound(r.start()->value(), cmp)) : _rows.cbegin();
-    auto i2 = r.end() ? (r.end()->is_inclusive()
-            ? _rows.upper_bound(r.end()->value(), cmp)
-            : _rows.lower_bound(r.end()->value(), cmp)) : _rows.cend();
-    return boost::make_iterator_range(i1, i2);
+}
+
+mutation_partition::rows_type::const_iterator
+mutation_partition::upper_bound(const schema& schema, const query::range<clustering_key_prefix>& r) const {
+    auto cmp = rows_entry::key_comparator(clustering_key_prefix::prefix_equality_less_compare(schema));
+    return r.end() ? (r.end()->is_inclusive()
+                         ? _rows.upper_bound(r.end()->value(), cmp)
+                         : _rows.lower_bound(r.end()->value(), cmp)) : _rows.cend();
+}
+
+boost::iterator_range<mutation_partition::rows_type::const_iterator>
+mutation_partition::range(const schema& schema, const query::range<clustering_key_prefix>& r) const {
+    return boost::make_iterator_range(lower_bound(schema, r), upper_bound(schema, r));
 }

 template <typename Container>
@@ -371,11 +478,27 @@ unconst(Container& c, boost::iterator_range<typename Container::const_iterator>
    );
 }

+template <typename Container>
+typename Container::iterator
+unconst(Container& c, typename Container::const_iterator i) {
+    return c.erase(i, i);
+}
+
 boost::iterator_range<mutation_partition::rows_type::iterator>
 mutation_partition::range(const schema& schema, const query::range<clustering_key_prefix>& r) {
    return unconst(_rows, static_cast<const mutation_partition*>(this)->range(schema, r));
 }

+mutation_partition::rows_type::iterator
+mutation_partition::lower_bound(const schema& schema, const query::range<clustering_key_prefix>& r) {
+    return unconst(_rows, static_cast<const mutation_partition*>(this)->lower_bound(schema, r));
+}
+
+mutation_partition::rows_type::iterator
+mutation_partition::upper_bound(const schema& schema, const query::range<clustering_key_prefix>& r) {
+    return unconst(_rows, static_cast<const mutation_partition*>(this)->upper_bound(schema, r));
+}
+
 template<typename Func>
 void mutation_partition::for_each_row(const schema& schema, const query::range<clustering_key_prefix>& row_range, bool reversed, Func&& func) const
 {
@@ -450,13 +573,11 @@ static void hash_row_slice(md5_hasher& hasher,
 }

 template<typename RowWriter>
-static void get_row_slice(const schema& s,
+static void get_compacted_row_slice(const schema& s,
    const query::partition_slice& slice,
    column_kind kind,
    const row& cells,
    const std::vector<column_id>& columns,
-    tombstone tomb,
-    gc_clock::time_point now,
    RowWriter& writer)
 {
    for (auto id : columns) {
@@ -467,7 +588,7 @@ static void get_row_slice(const schema& s,
            auto&& def = s.column_at(kind, id);
            if (def.is_atomic()) {
                auto c = cell->as_atomic_cell();
-                if (!c.is_live(tomb, now)) {
+                if (!c.is_live()) {
                    writer.add().skip();
                } else {
                    write_cell(writer, slice, cell->as_atomic_cell());
@@ -475,21 +596,18 @@ static void get_row_slice(const schema& s,
            } else {
                auto&& mut = cell->as_collection_mutation();
                auto&& ctype = static_pointer_cast<const collection_type_impl>(def.type);
-                auto m_view = ctype->deserialize_mutation_form(mut);
-                m_view.tomb.apply(tomb);
-                // FIXME: Instead of this, write optimistically and retract if empty
-                auto m_ser = ctype->serialize_mutation_form_only_live(m_view, now);
-                if (ctype->is_empty(m_ser)) {
+                if (!ctype->is_any_live(mut)) {
                    writer.add().skip();
                } else {
-                    write_cell(writer, slice, def.type, m_ser);
+                    write_cell(writer, slice, def.type, mut);
                }
            }
        }
    }
 }

-bool has_any_live_data(const schema& s, column_kind kind, const row& cells, tombstone tomb, gc_clock::time_point now) {
+bool has_any_live_data(const schema& s, column_kind kind, const row& cells, tombstone tomb = tombstone(),
+                       gc_clock::time_point now = gc_clock::time_point::min()) {
    bool any_live = false;
    cells.for_each_cell_until([&] (column_id id, const atomic_cell_or_collection& cell_or_collection) {
        const column_definition& def = s.column_at(kind, id);
@@ -512,25 +630,27 @@ bool has_any_live_data(const schema& s, column_kind kind, const row& cells, tomb
    return any_live;
 }

-uint32_t
-mutation_partition::query(query::result::partition_writer& pw,
-    const schema& s,
-    gc_clock::time_point now,
-    uint32_t limit) const
-{
+static bool has_ck_selector(const query::clustering_row_ranges& ranges) {
+    // As with a PK range, an empty set of row ranges is an "exclude all" restriction; any non-full range also counts.
+    return ranges.empty() || std::any_of(ranges.begin(), ranges.end(), [](auto& r) {
+        return !r.is_full();
+    });
+}
+
+void
+mutation_partition::query_compacted(query::result::partition_writer& pw, const schema& s, uint32_t limit) const {
    const query::partition_slice& slice = pw.slice();

    if (limit == 0) {
        pw.retract();
-        return 0;
+        return;
    }

    auto static_cells_wr = pw.start().start_static_row().start_cells();

    if (!slice.static_columns.empty()) {
        if (pw.requested_result()) {
-            get_row_slice(s, slice, column_kind::static_column, static_row(), slice.static_columns, partition_tombstone(),
-                          now, static_cells_wr);
+            get_compacted_row_slice(s, slice, column_kind::static_column, static_row(), slice.static_columns, static_cells_wr);
        }
        if (pw.requested_digest()) {
            ::feed_hash(pw.digest(), partition_tombstone());
@@ -544,52 +664,37 @@ mutation_partition::query(query::result::partition_writer& pw,

    uint32_t row_count = 0;

-    // Like PK range, an empty row range, should be considered an "exclude all" restriction
-    bool has_ck_selector = pw.ranges().empty();
-
    auto is_reversed = slice.options.contains(query::partition_slice::option::reversed);
    auto send_ck = slice.options.contains(query::partition_slice::option::send_clustering_key);
-    for (auto&& row_range : pw.ranges()) {
-        if (limit == 0) {
-            break;
+    for_each_row(s, query::clustering_range::make_open_ended_both_sides(), is_reversed, [&] (const rows_entry& e) {
+        auto& row = e.row();
+        auto row_tombstone = tombstone_for_row(s, e);
+
+        if (pw.requested_digest()) {
+            e.key().feed_hash(pw.digest(), s);
+            ::feed_hash(pw.digest(), row_tombstone);
+            hash_row_slice(pw.digest(), s, column_kind::regular_column, row.cells(), slice.regular_columns);
        }

-        has_ck_selector |= !row_range.is_full();
-
-        // FIXME: Optimize for a full-tuple singular range. mutation_partition::range()
-        // does two lookups to form a range, even for singular range. We need
-        // only one lookup for a full-tuple singular range though.
-        for_each_row(s, row_range, is_reversed, [&] (const rows_entry& e) {
-            auto& row = e.row();
-            auto row_tombstone = tombstone_for_row(s, e);
-
-            if (pw.requested_digest()) {
-                e.key().feed_hash(pw.digest(), s);
-                ::feed_hash(pw.digest(), row_tombstone);
-                hash_row_slice(pw.digest(), s, column_kind::regular_column, row.cells(), slice.regular_columns);
+        if (row.is_live(s)) {
+            if (pw.requested_result()) {
+                auto cells_wr = [&] {
+                    if (send_ck) {
+                        return rows_wr.add().write_key(e.key()).start_cells().start_cells();
+                    } else {
+                        return rows_wr.add().skip_key().start_cells().start_cells();
+                    }
+                }();
+                get_compacted_row_slice(s, slice, column_kind::regular_column, row.cells(), slice.regular_columns, cells_wr);
+                std::move(cells_wr).end_cells().end_cells().end_qr_clustered_row();
            }
-
-            if (row.is_live(s, row_tombstone, now)) {
-                if (pw.requested_result()) {
-                    auto cells_wr = [&] {
-                        if (send_ck) {
-                            return rows_wr.add().write_key(e.key()).start_cells().start_cells();
-                        } else {
-                            return rows_wr.add().skip_key().start_cells().start_cells();
-                        }
-                    }();
-                    get_row_slice(s, slice, column_kind::regular_column, row.cells(), slice.regular_columns, row_tombstone,
-                                  now, cells_wr);
-                    std::move(cells_wr).end_cells().end_cells().end_qr_clustered_row();
-                }
-                ++row_count;
-                if (--limit == 0) {
-                    return stop_iteration::yes;
-                }
+            ++row_count;
+            if (--limit == 0) {
+                return stop_iteration::yes;
            }
-            return stop_iteration::no;
-        });
-    }
+        }
+        return stop_iteration::no;
+    });

    // If we got no rows, but have live static columns, we should only
    // give them back IFF we did not have any CK restrictions.
@@ -597,17 +702,11 @@ mutation_partition::query(query::result::partition_writer& pw,
    // If ck:s exist, and we do a restriction on them, we either have maching
    // rows, or return nothing, since cql does not allow "is null".
    if (row_count == 0
-			&& (has_ck_selector
-					|| !has_any_live_data(s, column_kind::static_column,
-							static_row(), _tombstone, now))) {
+			&& (has_ck_selector(pw.ranges())
+					|| !has_any_live_data(s, column_kind::static_column, static_row()))) {
 		pw.retract();
-        return 0;
 	} else {
        std::move(rows_wr).end_rows().end_qr_partition();
-
-        // The partition is live. If there are no clustered rows, there
-        // must be something live in the static row, which counts as one row.
-        return std::max<uint32_t>(row_count, 1);
 	}
 }

@@ -667,7 +766,7 @@ operator<<(std::ostream& os, const mutation_partition& mp) {
 constexpr gc_clock::duration row_marker::no_ttl;
 constexpr gc_clock::duration row_marker::dead;

-int compare_row_marker_for_merge(const row_marker& left, const row_marker& right) {
+int compare_row_marker_for_merge(const row_marker& left, const row_marker& right) noexcept {
    if (left.timestamp() != right.timestamp()) {
        return left.timestamp() > right.timestamp() ? 1 : -1;
    }
@@ -703,6 +802,18 @@ deletable_row::equal(column_kind kind, const schema& s, const deletable_row& oth
    return _cells.equal(kind, s, other._cells, other_schema);
 }

+void deletable_row::apply_reversibly(const schema& s, deletable_row& src) {
+    _cells.apply_reversibly(s, column_kind::regular_column, src._cells);
+    _deleted_at.apply_reversibly(src._deleted_at); // noexcept
+    _marker.apply_reversibly(src._marker); // noexcept
+}
+
+void deletable_row::revert(const schema& s, deletable_row& src) {
+    _cells.revert(s, column_kind::regular_column, src._cells);
+    _deleted_at.revert(src._deleted_at);
+    _marker.revert(src._marker);
+}
+
 bool
 rows_entry::equal(const schema& s, const rows_entry& other) const {
    return equal(s, other, s);
@@ -747,42 +858,123 @@ bool mutation_partition::equal(const schema& this_schema, const mutation_partiti
 }

 void
-merge_column(const column_definition& def,
-             atomic_cell_or_collection& old,
-             atomic_cell_or_collection&& neww) {
+apply_reversibly(const column_definition& def, atomic_cell_or_collection& dst,  atomic_cell_or_collection& src) { // merges src into dst; src is left holding what revert() needs
    // Must be run via with_linearized_managed_bytes() context, but assume it is
    // provided via an upper layer
    if (def.is_atomic()) {
-        if (compare_atomic_cell_for_merge(old.as_atomic_cell(), neww.as_atomic_cell()) < 0) {
-            old = std::move(neww);
+        auto&& src_ac = src.as_atomic_cell_ref();
+        if (compare_atomic_cell_for_merge(dst.as_atomic_cell(), src.as_atomic_cell()) < 0) {
+            std::swap(dst, src); // src wins: dst takes its value, src keeps the previous dst value
+            src_ac.set_revert(true); // revert() reads this flag back from src and swaps only when it is set
+        } else {
+            src_ac.set_revert(false); // dst wins: nothing was moved, so revert() must not swap
        }
    } else {
        auto ct = static_pointer_cast<const collection_type_impl>(def.type);
-        old = ct->merge(old.as_collection_mutation(), neww.as_collection_mutation());
+        src = ct->merge(dst.as_collection_mutation(), src.as_collection_mutation()); // build the merged collection in src first
+        std::swap(dst, src); // dst gets the merged value, src keeps the previous dst; revert() swaps back unconditionally
+    }
+}
+
+void
+revert(const column_definition& def, atomic_cell_or_collection& dst, atomic_cell_or_collection& src) noexcept { // undoes apply_reversibly(def, dst, src)
+    static_assert(std::is_nothrow_move_constructible<atomic_cell_or_collection>::value
+                  && std::is_nothrow_move_assignable<atomic_cell_or_collection>::value,
+                  "for std::swap() to be noexcept");
+    if (def.is_atomic()) {
+        auto&& ac = src.as_atomic_cell_ref();
+        if (ac.is_revert_set()) { // set by apply_reversibly() only when it swapped dst and src
+            ac.set_revert(false); // clear the transient flag before the cell goes back into dst
+            std::swap(dst, src);
+        }
+    } else {
+        std::swap(dst, src); // collections are always swapped by apply_reversibly(), so always swap back
    }
 }

 void
 row::apply(const column_definition& column, const atomic_cell_or_collection& value) {
-    // FIXME: Optimize
    atomic_cell_or_collection tmp(value);
    apply(column, std::move(tmp));
 }

 void
 row::apply(const column_definition& column, atomic_cell_or_collection&& value) {
+    apply_reversibly(column, value);
+}
+
+template<typename Func, typename Rollback>
+void row::for_each_cell(Func&& func, Rollback&& rollback) { // calls func on every cell; if func throws, rolls back the cells already visited and rethrows
+    static_assert(noexcept(rollback(std::declval<column_id>(), std::declval<atomic_cell_or_collection&>())),
+                           "rollback must be noexcept");
+
+    if (_type == storage_type::vector) {
+        unsigned i = 0;
+        try {
+            for (; i < _storage.vector.v.size(); i++) {
+                if (_storage.vector.present.test(i)) { // skip slots whose presence bit is not set
+                    func(i, _storage.vector.v[i]);
+                }
+            }
+        } catch (...) {
+            while (i) { // i is the slot at which func threw; roll back slots i-1 .. 0
+                --i;
+                if (_storage.vector.present.test(i)) {
+                    rollback(i, _storage.vector.v[i]);
+                }
+            }
+            throw; // propagate the original exception once rollback is done
+        }
+    } else {
+        auto i = _storage.set.begin();
+        try {
+            while (i != _storage.set.end()) {
+                func(i->id(), i->cell());
+                ++i;
+            }
+        } catch (...) {
+            while (i != _storage.set.begin()) { // i is the entry at which func threw; roll back the entries before it, last first
+                --i;
+                rollback(i->id(), i->cell());
+            }
+            throw; // propagate the original exception once rollback is done
+        }
+    }
+}
+
+template<typename Func>
+void row::for_each_cell(Func&& func) { // calls func(column_id, atomic_cell_or_collection&) on every cell of this row; no rollback
+    if (_type == storage_type::vector) {
+        for (auto i : bitsets::for_each_set(_storage.vector.present)) { // walks the presence bitmap rather than testing each cell
+            func(i, _storage.vector.v[i]);
+        }
+    } else {
+        for (auto& cell : _storage.set) { // 'cell' is a set entry here; the value itself is cell.cell()
+            func(cell.id(), cell.cell());
+        }
+    }
+}
+
+void
+row::apply_reversibly(const column_definition& column, atomic_cell_or_collection& value) {
+    static_assert(std::is_nothrow_move_constructible<atomic_cell_or_collection>::value
+                  && std::is_nothrow_move_assignable<atomic_cell_or_collection>::value,
+                  "noexcept required for atomicity");
+
    // our mutations are not yet immutable
    auto id = column.id;
    if (_type == storage_type::vector && id < max_vector_size) {
-        if (id >= _storage.vector.size()) {
-            _storage.vector.resize(id);
-            _storage.vector.emplace_back(std::move(value));
+        if (id >= _storage.vector.v.size()) {
+            _storage.vector.v.resize(id);
+            _storage.vector.v.emplace_back(std::move(value));
+            _storage.vector.present.set(id);
            _size++;
-        } else if (!bool(_storage.vector[id])) {
-            _storage.vector[id] = std::move(value);
+        } else if (!bool(_storage.vector.v[id])) {
+            _storage.vector.v[id] = std::move(value);
+            _storage.vector.present.set(id);
            _size++;
        } else {
-            merge_column(column, _storage.vector[id], std::move(value));
+            ::apply_reversibly(column, _storage.vector.v[id], value);
        }
    } else {
        if (_type == storage_type::vector) {
@@ -790,11 +982,37 @@ row::apply(const column_definition& column, atomic_cell_or_collection&& value) {
        }
        auto i = _storage.set.lower_bound(id, cell_entry::compare());
        if (i == _storage.set.end() || i->id() != id) {
-            auto e = current_allocator().construct<cell_entry>(id, std::move(value));
+            cell_entry* e = current_allocator().construct<cell_entry>(id);
+            std::swap(e->_cell, value);
            _storage.set.insert(i, *e);
            _size++;
        } else {
-            merge_column(column, i->cell(), std::move(value));
+            ::apply_reversibly(column, i->cell(), value);
+        }
+    }
+}
+
+void
+row::revert(const column_definition& column, atomic_cell_or_collection& src) noexcept { // undoes apply_reversibly(column, src)
+    auto id = column.id;
+    if (_type == storage_type::vector) {
+        auto& dst = _storage.vector.v[id];
+        if (!src) { // src is empty: apply_reversibly() moved its value into a slot that held no cell
+            std::swap(dst, src); // hand the value back to src
+            _storage.vector.present.reset(id); // the slot is absent again
+            --_size;
+        } else {
+            ::revert(column, dst, src); // the cell existed before the apply; undo the merge
+        }
+    } else {
+        auto i = _storage.set.find(id, cell_entry::compare()); // NOTE(review): i is not checked against end(); assumes the entry exists
+        auto& dst = i->cell();
+        if (!src) { // src is empty: apply_reversibly() inserted a new entry holding its value
+            std::swap(dst, src); // hand the value back to src
+            _storage.set.erase_and_dispose(i, current_deleter<cell_entry>()); // drop the entry that apply_reversibly() added
+            --_size;
+        } else {
+            ::revert(column, dst, src); // the cell existed before the apply; undo the merge
        }
    }
 }
@@ -802,8 +1020,9 @@ row::apply(const column_definition& column, atomic_cell_or_collection&& value) {
 void
 row::append_cell(column_id id, atomic_cell_or_collection value) {
    if (_type == storage_type::vector && id < max_vector_size) {
-        _storage.vector.resize(id);
-        _storage.vector.emplace_back(std::move(value));
+        _storage.vector.v.resize(id);
+        _storage.vector.v.emplace_back(std::move(value));
+        _storage.vector.present.set(id);
    } else {
        if (_type == storage_type::vector) {
            vector_to_set();
@@ -817,10 +1036,10 @@ row::append_cell(column_id id, atomic_cell_or_collection value) {
 const atomic_cell_or_collection*
 row::find_cell(column_id id) const {
    if (_type == storage_type::vector) {
-        if (id >= _storage.vector.size() || !bool(_storage.vector[id])) {
+        if (id >= _storage.vector.v.size() || !_storage.vector.present.test(id)) {
            return nullptr;
        }
-        return &_storage.vector[id];
+        return &_storage.vector.v[id];
    } else {
        auto i = _storage.set.find(id, cell_entry::compare());
        if (i == _storage.set.end()) {
@@ -841,15 +1060,24 @@ void mutation_partition::trim_rows(const schema& s,
    auto last = reversal_traits<reversed>::begin(_rows);
    auto deleter = current_deleter<rows_entry>();

+    auto range_begin = [this, &s] (const query::clustering_range& range) {
+        return reversed ? upper_bound(s, range) : lower_bound(s, range);
+    };
+
+    auto range_end = [this, &s] (const query::clustering_range& range) {
+        return reversed ? lower_bound(s, range) : upper_bound(s, range);
+    };
+
    for (auto&& row_range : row_ranges) {
        if (stop) {
            break;
        }

-        auto it_range = reversal_traits<reversed>::maybe_reverse(_rows, range(s, row_range));
-        last = reversal_traits<reversed>::erase_and_dispose(_rows, last, it_range.begin(), deleter);
+        last = reversal_traits<reversed>::erase_and_dispose(_rows, last,
+            reversal_traits<reversed>::maybe_reverse(_rows, range_begin(row_range)), deleter);

-        while (last != it_range.end()) {
+        auto end = reversal_traits<reversed>::maybe_reverse(_rows, range_end(row_range));
+        while (last != end) {
            rows_entry& e = *last;
            if (func(e) == stop_iteration::yes) {
                stop = true;
@@ -921,10 +1149,7 @@ uint32_t mutation_partition::do_compact(const schema& s,

    // #589 - Do not add extra row for statics unless we did a CK range-less query.
    // See comment in query
-    if (row_count == 0 && static_row_live
-            && std::any_of(row_ranges.begin(), row_ranges.end(), [](auto& r) {
-                return r.is_full();
-            })) {
+    if (row_count == 0 && static_row_live && !has_ck_selector(row_ranges)) {
        ++row_count;
    }

@@ -977,7 +1202,7 @@ bool mutation_partition::empty() const
 }

 bool
-deletable_row::is_live(const schema& s, tombstone base_tombstone, gc_clock::time_point query_time = gc_clock::time_point::min()) const {
+deletable_row::is_live(const schema& s, tombstone base_tombstone, gc_clock::time_point query_time) const {
    // _created_at corresponds to the row marker cell, present for rows
    // created with the 'insert' statement. If row marker is live, we know the
    // row is live. Otherwise, a row is considered live if it has any cell
@@ -1034,7 +1259,7 @@ row::row(const row& o)
    , _size(o._size)
 {
    if (_type == storage_type::vector) {
-        new (&_storage.vector) vector_type(o._storage.vector);
+        new (&_storage.vector) vector_storage(o._storage.vector);
    } else {
        auto cloner = [] (const auto& x) {
            return current_allocator().construct<std::remove_const_t<std::remove_reference_t<decltype(x)>>>(x);
@@ -1051,14 +1276,14 @@ row::row(const row& o)

 row::~row() {
    if (_type == storage_type::vector) {
-        _storage.vector.~vector_type();
+        _storage.vector.~vector_storage();
    } else {
        _storage.set.clear_and_dispose(current_deleter<cell_entry>());
        _storage.set.~map_type();
    }
 }

-row::cell_entry::cell_entry(const cell_entry& o) noexcept
+row::cell_entry::cell_entry(const cell_entry& o)
    : _id(o._id)
    , _cell(o._cell)
 { }
@@ -1085,15 +1310,20 @@ void row::vector_to_set()
 {
    assert(_type == storage_type::vector);
    map_type set;
-    for (unsigned i = 0; i < _storage.vector.size(); i++) {
-        auto& c = _storage.vector[i];
-        if (!bool(c)) {
-            continue;
-        }
+    try {
+    for (auto i : bitsets::for_each_set(_storage.vector.present)) {
+        auto& c = _storage.vector.v[i];
        auto e = current_allocator().construct<cell_entry>(i, std::move(c));
        set.insert(set.end(), *e);
    }
-    _storage.vector.~vector_type();
+    } catch (...) {
+        set.clear_and_dispose([this, del = current_deleter<cell_entry>()] (cell_entry* ce) noexcept {
+            _storage.vector.v[ce->id()] = std::move(ce->cell());
+            del(ce);
+        });
+        throw;
+    }
+    _storage.vector.~vector_storage();
    new (&_storage.set) map_type(std::move(set));
    _type = storage_type::set;
 }
@@ -1104,7 +1334,7 @@ void row::reserve(column_id last_column)
        if (last_column >= max_vector_size) {
            vector_to_set();
        } else {
-            _storage.vector.reserve(last_column);
+            _storage.vector.v.reserve(last_column);
        }
    }
 }
@@ -1157,13 +1387,13 @@ bool row::equal(column_kind kind, const schema& this_schema, const row& other, c
 }

 row::row() {
-    new (&_storage.vector) vector_type;
+    new (&_storage.vector) vector_storage;
 }

 row::row(row&& other)
    : _type(other._type), _size(other._size) {
    if (_type == storage_type::vector) {
-        new (&_storage.vector) vector_type(std::move(other._storage.vector));
+        new (&_storage.vector) vector_storage(std::move(other._storage.vector));
    } else {
        new (&_storage.set) map_type(std::move(other._storage.set));
    }
@@ -1177,27 +1407,25 @@ row& row::operator=(row&& other) {
    return *this;
 }

-void row::merge(const schema& s, column_kind kind, const row& other) {
+void row::apply_reversibly(const schema& s, column_kind kind, row& other) { // merges every cell of other into this row; on exception the cells already applied are reverted
+    if (other.empty()) { // nothing to merge; presumably also keeps size() - 1 / rbegin() below away from an empty row
+        return;
+    }
    if (other._type == storage_type::vector) {
-        reserve(other._storage.vector.size() - 1);
+        reserve(other._storage.vector.v.size() - 1); // index of the last slot in other's vector
    } else {
        reserve(other._storage.set.rbegin()->id());
    }
-    other.for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
-        apply(s.column_at(kind, id), cell);
+    other.for_each_cell([&] (column_id id, atomic_cell_or_collection& cell) {
+        apply_reversibly(s.column_at(kind, id), cell);
+    }, [&] (column_id id, atomic_cell_or_collection& cell) noexcept { // rollback, run by for_each_cell() on the cells applied before a failure
+        revert(s.column_at(kind, id), cell);
    });
 }

-void row::merge(const schema& s, column_kind kind, row&& other) {
-    if (other._type == storage_type::vector) {
-        reserve(other._storage.vector.size() - 1);
-    } else {
-        reserve(other._storage.set.rbegin()->id());
-    }
-    // FIXME: Optimize when 'other' is a set. We could move whole entries, not only cells.
-    other.for_each_cell_until([&] (column_id id, atomic_cell_or_collection& cell) {
-        apply(s.column_at(kind, id), std::move(cell));
-        return stop_iteration::no;
+void row::revert(const schema& s, column_kind kind, row& other) noexcept { // undoes apply_reversibly(s, kind, other)
+    other.for_each_cell([&] (column_id id, atomic_cell_or_collection& cell) noexcept { // iterates other's cells, the same ones apply_reversibly() walked
+        revert(s.column_at(kind, id), cell); // per-cell revert of this row against other's cell
    });
 }

@@ -1348,3 +1576,15 @@ mutation_partition::upgrade(const schema& old_schema, const schema& new_schema)
    accept(old_schema, v);
    *this = std::move(tmp);
 }
+
+void row_marker::apply_reversibly(row_marker& rm) noexcept { // merges rm into *this; undone with revert(rm)
+    if (compare_row_marker_for_merge(*this, rm) < 0) {
+        std::swap(*this, rm); // rm wins; rm now keeps the previous value of *this
+    } else {
+        rm = *this; // *this wins; rm becomes a copy so that the unconditional swap in revert() changes nothing
+    }
+}
+
+void row_marker::revert(row_marker& rm) noexcept { // undoes apply_reversibly(rm)
+    std::swap(*this, rm); // rm holds either the previous value of *this or a copy of it, see apply_reversibly()
+}
--- a/mutation_partition.hh
+++ b/mutation_partition.hh
@@ -28,6 +28,8 @@
 #include <boost/range/adaptor/indexed.hpp>
 #include <boost/range/adaptor/filtered.hpp>

+#include <seastar/core/bitset-iter.hh>
+
 #include "schema.hh"
 #include "tombstone.hh"
 #include "keys.hh"
@@ -58,8 +60,11 @@ class row {
            : _id(id)
            , _cell(std::move(cell))
        { }
+        cell_entry(column_id id)
+            : _id(id)
+        { }
        cell_entry(cell_entry&&) noexcept;
-        cell_entry(const cell_entry&) noexcept;
+        cell_entry(const cell_entry&);

        column_id id() const { return _id; }
        const atomic_cell_or_collection& cell() const { return _cell; }
@@ -96,11 +101,16 @@ public:
 private:
    using vector_type = managed_vector<atomic_cell_or_collection, internal_count, size_type>;

+    struct vector_storage {
+        std::bitset<max_vector_size> present;
+        vector_type v;
+    };
+
    union storage {
        storage() { }
        ~storage() { }
        map_type set;
-        vector_type vector;
+        vector_storage vector;
    } _storage;
 public:
    row();
@@ -109,6 +119,7 @@ public:
    row(row&& other);
    row& operator=(row&& other);
    size_t size() const { return _size; }
+    bool empty() const { return _size == 0; }

    void reserve(column_id);

@@ -120,13 +131,14 @@ private:
    template<typename Func>
    void remove_if(Func&& func) {
        if (_type == storage_type::vector) {
-            for (unsigned i = 0; i < _storage.vector.size(); i++) {
-                auto& c = _storage.vector[i];
-                if (!bool(c)) {
+            for (unsigned i = 0; i < _storage.vector.v.size(); i++) {
+                if (!_storage.vector.present.test(i)) {
                    continue;
                }
+                auto& c = _storage.vector.v[i];
                if (func(i, c)) {
                    c = atomic_cell_or_collection();
+                    _storage.vector.present.reset(i);
                    _size--;
                }
            }
@@ -146,11 +158,12 @@ private:

 private:
    auto get_range_vector() const {
-        auto range = boost::make_iterator_range(_storage.vector.begin(), _storage.vector.end());
-        return range | boost::adaptors::filtered([] (const atomic_cell_or_collection& c) { return bool(c); })
-               | boost::adaptors::transformed([this] (const atomic_cell_or_collection& c) {
-            auto id = &c - _storage.vector.data();
-            return std::pair<column_id, const atomic_cell_or_collection&>(id, std::cref(c));
+        auto id_range = boost::irange<column_id>(0, _storage.vector.v.size());
+        return boost::combine(id_range, _storage.vector.v)
+        | boost::adaptors::filtered([this] (const boost::tuple<const column_id&, const atomic_cell_or_collection&>& t) {
+            return _storage.vector.present.test(t.get<0>());
+        }) | boost::adaptors::transformed([] (const boost::tuple<const column_id&, const atomic_cell_or_collection&>& t) {
+            return std::pair<column_id, const atomic_cell_or_collection&>(t.get<0>(), t.get<1>());
        });
    }
    auto get_range_set() const {
@@ -163,7 +176,23 @@ private:
    auto with_both_ranges(const row& other, Func&& func) const;

    void vector_to_set();
+
+    // Calls Func(column_id, atomic_cell_or_collection&) for each cell in this row.
+    //
+    // Func() is allowed to modify the cell. A cell emptied by Func() is still
+    // visible to for_each_cell().
+    //
+    // In case of exception, calls Rollback(column_id, atomic_cell_or_collection&) on
+    // all cells on which Func() was successfully invoked in reverse order.
+    //
+    template<typename Func, typename Rollback>
+    void for_each_cell(Func&&, Rollback&&);
 public:
+    // Calls Func(column_id, atomic_cell_or_collection&) for each cell in this row.
+    // noexcept if Func doesn't throw.
+    template<typename Func>
+    void for_each_cell(Func&&);
+
    template<typename Func>
    void for_each_cell(Func&& func) const {
        for_each_cell_until([func = std::forward<Func>(func)] (column_id id, const atomic_cell_or_collection& c) {
@@ -175,11 +204,8 @@ public:
    template<typename Func>
    void for_each_cell_until(Func&& func) const {
        if (_type == storage_type::vector) {
-            for (unsigned i = 0; i < _storage.vector.size(); i++) {
-                auto& cell = _storage.vector[i];
-                if (!bool(cell)) {
-                    continue;
-                }
+            for (auto i : bitsets::for_each_set(_storage.vector.present)) {
+                auto& cell = _storage.vector.v[i];
                if (func(i, cell) == stop_iteration::yes) {
                    break;
                }
@@ -187,29 +213,7 @@ public:
        } else {
            for (auto& cell : _storage.set) {
                const auto& c = cell.cell();
-                if (c && func(cell.id(), c) == stop_iteration::yes) {
-                    break;
-                }
-            }
-        }
-    }
-
-    template<typename Func>
-    void for_each_cell_until(Func&& func) {
-        if (_type == storage_type::vector) {
-            for (unsigned i = 0; i < _storage.vector.size(); i++) {
-                auto& cell = _storage.vector[i];
-                if (!bool(cell)) {
-                    continue;
-                }
-                if (func(i, cell) == stop_iteration::yes) {
-                    break;
-                }
-            }
-        } else {
-            for (auto& cell : _storage.set) {
-                auto& c = cell.cell();
-                if (c && func(cell.id(), c) == stop_iteration::yes) {
+                if (func(cell.id(), c) == stop_iteration::yes) {
                    break;
                }
            }
@@ -222,21 +226,26 @@ public:
    //
    // Merges cell's value into the row.
    //
-    // In case of exception the current object and external object (moved-from)
-    // are both left in some valid states, such that they still will commute to
-    // a state the current object would have should the exception had not occurred.
+    // In case of exception the current object is left with a value equivalent to the original state.
+    //
+    // The external cell is left in a valid state, such that it will commute with the
+    // current object to the same value as if the exception had not occurred.
    //
    void apply(const column_definition& column, atomic_cell_or_collection&& cell);

+    // Equivalent to calling apply_reversibly() with a row containing only given cell.
+    // See reversibly_mergeable.hh
+    void apply_reversibly(const column_definition& column, atomic_cell_or_collection& cell);
+    // See reversibly_mergeable.hh
+    void revert(const column_definition& column, atomic_cell_or_collection& cell) noexcept;
+
    // Adds cell to the row. The column must not be already set.
    void append_cell(column_id id, atomic_cell_or_collection cell);

-    void merge(const schema& s, column_kind kind, const row& other);
-
-    // In case of exception the current object and external object (moved-from)
-    // are both left in some valid states, such that they still will commute to
-    // a state the current object would have should the exception had not occurred.
-    void merge(const schema& s, column_kind kind, row&& other);
+    // See reversibly_mergeable.hh
+    void apply_reversibly(const schema&, column_kind, row& src);
+    // See reversibly_mergeable.hh
+    void revert(const schema&, column_kind, row& src) noexcept;

    // Expires cells based on query_time. Expires tombstones based on gc_before
    // and max_purgeable. Removes cells covered by tomb.
@@ -258,7 +267,7 @@ public:
 std::ostream& operator<<(std::ostream& os, const std::pair<column_id, const atomic_cell_or_collection&>& c);

 class row_marker;
-int compare_row_marker_for_merge(const row_marker& left, const row_marker& right);
+int compare_row_marker_for_merge(const row_marker& left, const row_marker& right) noexcept;

 class row_marker {
    static constexpr gc_clock::duration no_ttl { 0 };
@@ -321,6 +330,10 @@ public:
            *this = rm;
        }
    }
+    // See reversibly_mergeable.hh
+    void apply_reversibly(row_marker& rm) noexcept;
+    // See reversibly_mergeable.hh
+    void revert(row_marker& rm) noexcept;
    // Expires cells and tombstones. Removes items covered by higher level
    // tombstones.
    // Returns true if row marker is live.
@@ -398,6 +411,11 @@ public:
    void remove_tombstone() {
        _deleted_at = tombstone();
    }
+
+    // See reversibly_mergeable.hh
+    void apply_reversibly(const schema& s, deletable_row& src);
+    // See reversibly_mergeable.hh
+    void revert(const schema& s, deletable_row& src);
 public:
    tombstone deleted_at() const { return _deleted_at; }
    api::timestamp_type created_at() const { return _marker.timestamp(); }
@@ -407,7 +425,7 @@ public:
    row& cells() { return _cells; }
    friend std::ostream& operator<<(std::ostream& os, const deletable_row& dr);
    bool equal(column_kind, const schema& s, const deletable_row& other, const schema& other_schema) const;
-    bool is_live(const schema& s, tombstone base_tombstone, gc_clock::time_point query_time) const;
+    bool is_live(const schema& s, tombstone base_tombstone = tombstone(), gc_clock::time_point query_time = gc_clock::time_point::min()) const;
    bool empty() const { return !_deleted_at && _marker.is_missing() && !_cells.size(); }
    deletable_row difference(const schema&, column_kind, const deletable_row& other) const;
 };
@@ -422,6 +440,9 @@ public:
        : _prefix(std::move(prefix))
        , _t(std::move(t))
    { }
+    row_tombstones_entry(const clustering_key_prefix& prefix)
+        : _prefix(prefix)
+    { }
    row_tombstones_entry(row_tombstones_entry&& o) noexcept;
    row_tombstones_entry(const row_tombstones_entry&) = default;
    clustering_key_prefix& prefix() {
@@ -430,6 +451,9 @@ public:
    const clustering_key_prefix& prefix() const {
        return _prefix;
    }
+    const clustering_key_prefix& key() const {
+        return _prefix;
+    }
    tombstone& t() {
        return _t;
    }
@@ -439,6 +463,14 @@ public:
    void apply(tombstone t) {
        _t.apply(t);
    }
+    // See reversibly_mergeable.hh
+    void apply_reversibly(row_tombstones_entry& e) {
+        _t.apply_reversibly(e._t);
+    }
+    // See reversibly_mergeable.hh
+    void revert(row_tombstones_entry& e) noexcept {
+        _t.revert(e._t);
+    }
    struct compare {
        clustering_key_prefix::less_compare _c;
        compare(const schema& s) : _c(s) {}
@@ -472,6 +504,9 @@ public:

    friend std::ostream& operator<<(std::ostream& os, const row_tombstones_entry& rte);
    bool equal(const schema& s, const row_tombstones_entry& other) const;
+    bool empty() const {
+        return !_t;
+    }
 };

 class rows_entry {
@@ -512,6 +547,14 @@ public:
    void apply(tombstone t) {
        _row.apply(t);
    }
+    // See reversibly_mergeable.hh
+    void apply_reversibly(const schema& s, rows_entry& e) {
+        _row.apply_reversibly(s, e._row);
+    }
+    // See reversibly_mergeable.hh
+    void revert(const schema& s, rows_entry& e) noexcept {
+        _row.revert(s, e._row);
+    }
    bool empty() const {
        return _row.empty();
    }
@@ -570,8 +613,8 @@ class mutation_partition final {
    using row_tombstones_type = boost::intrusive::set<row_tombstones_entry,
        boost::intrusive::member_hook<row_tombstones_entry, boost::intrusive::set_member_hook<>, &row_tombstones_entry::_link>,
        boost::intrusive::compare<row_tombstones_entry::compare>>;
-    friend rows_entry;
-    friend row_tombstones_entry;
+    friend class rows_entry;
+    friend class row_tombstones_entry;
    friend class size_calculator;
 private:
    tombstone _tombstone;
@@ -626,19 +669,21 @@ public:
    // Commutative when this_schema == p_schema. If schemas differ, data in p which
    // is not representable in this_schema is dropped, thus apply() loses commutativity.
    //
-    // Basic exception guarantees. If apply() throws after being called in
-    // some entry state p0, the object is left in some consistent state p1 and
-    // it's possible that p1 != p0 + p. It holds though that p1 + p = p0 + p.
-    //
-    // FIXME: make stronger exception guarantees (p1 = p0).
+    // Strong exception guarantees.
    void apply(const schema& this_schema, const mutation_partition& p, const schema& p_schema);
    //
-    // Same guarantees as for apply(const schema&, const mutation_partition&).
+    // Applies p to current object.
    //
-    // In case of exception the current object and external object (moved-from)
-    // are both left in some valid states, such that they still will commute to
-    // a state the current object would have should the exception had not occurred.
+    // Commutative when this_schema == p_schema. If schemas differ, data in p which
+    // is not representable in this_schema is dropped, thus apply() loses commutativity.
+    //
+    // If an exception is thrown, this object will be left in a state equivalent to the entry state
+    // and p will be left in a state which will commute with the current object to the same value
+    // as if the exception had not occurred.
    void apply(const schema& this_schema, mutation_partition&& p, const schema& p_schema);
+    // Use in case this instance and p share the same schema.
+    // Same guarantees as apply(const schema&, mutation_partition&&, const schema&);
+    void apply(const schema& s, mutation_partition&& p);
    // Same guarantees and constraints as for apply(const schema&, const mutation_partition&, const schema&).
    void apply(const schema& this_schema, mutation_partition_view p, const schema& p_schema);

@@ -717,9 +762,16 @@ public:
    tombstone tombstone_for_row(const schema& schema, const clustering_key& key) const;
    tombstone tombstone_for_row(const schema& schema, const rows_entry& e) const;
    boost::iterator_range<rows_type::const_iterator> range(const schema& schema, const query::range<clustering_key_prefix>& r) const;
+    rows_type::const_iterator lower_bound(const schema& schema, const query::range<clustering_key_prefix>& r) const;
+    rows_type::const_iterator upper_bound(const schema& schema, const query::range<clustering_key_prefix>& r) const;
+    rows_type::iterator lower_bound(const schema& schema, const query::range<clustering_key_prefix>& r);
+    rows_type::iterator upper_bound(const schema& schema, const query::range<clustering_key_prefix>& r);
    boost::iterator_range<rows_type::iterator> range(const schema& schema, const query::range<clustering_key_prefix>& r);
-    // Returns the number of live CQL rows written. No more than limit.
-    uint32_t query(query::result::partition_writer& pw, const schema& s, gc_clock::time_point now, uint32_t limit = query::max_rows) const;
+    // Writes this partition using supplied query result writer.
+    // The partition should be first compacted with compact_for_query(), otherwise
+    // results may include data which is deleted/expired.
+    // At most row_limit CQL rows will be written and digested.
+    void query_compacted(query::result::partition_writer& pw, const schema& s, uint32_t row_limit) const;
    void accept(const schema&, mutation_partition_visitor&) const;

    // Returns the number of live CQL rows in this partition.
--- a/mutation_query.cc
+++ b/mutation_query.cc
@@ -57,75 +57,96 @@ query::result
 to_data_query_result(const reconcilable_result& r, schema_ptr s, const query::partition_slice& slice) {
    query::result::builder builder(slice, query::result_request::only_result);
    for (const partition& p : r.partitions()) {
-        auto pb = builder.add_partition(*s, p._m.key(*s));
-        p.mut().unfreeze(s).partition().query(pb, *s, gc_clock::time_point::min(), query::max_rows);
+        p.mut().unfreeze(s).query(builder, slice, gc_clock::time_point::min(), query::max_rows);
    }
    return builder.build();
 }

+
+querying_reader::querying_reader(schema_ptr s,
+        const mutation_source& source,
+        const query::partition_range& range,
+        const query::partition_slice& slice,
+        uint32_t row_limit,
+        gc_clock::time_point query_time,
+        std::function<void(uint32_t, mutation&&)> consumer)
+    : _schema(std::move(s))
+    , _range(range)
+    , _slice(slice)
+    , _requested_limit(row_limit)
+    , _query_time(query_time)
+    , _limit(row_limit)
+    , _source(source)
+    , _consumer(std::move(consumer))
+{ }
+
+future<> querying_reader::read() {
+    _reader = _source(_schema, _range, service::get_local_sstable_query_read_priority());
+    return consume(*_reader, [this](mutation&& m) {
+        // FIXME: Make data sources respect row_ranges so that we don't have to filter them out here.
+        auto is_distinct = _slice.options.contains(query::partition_slice::option::distinct);
+        auto is_reversed = _slice.options.contains(query::partition_slice::option::reversed);
+        auto limit = !is_distinct ? _limit : 1;
+        auto rows_left = m.partition().compact_for_query(*m.schema(), _query_time,
+                                                         _slice.row_ranges(*m.schema(), m.key()),
+                                                         is_reversed, limit);
+        _limit -= rows_left;
+
+        if (rows_left || !m.partition().empty()) {
+            // NOTE: We must return all columns, regardless of what's in
+            // partition_slice, for the results to be reconcilable with tombstones.
+            // That's because row's presence depends on existence of any
+            // column in a row (See mutation_partition::query). We could
+            // optimize this case and only send cell timestamps, without data,
+            // for the cells which are not queried for (TODO).
+            _consumer(rows_left, std::move(m));
+        }
+
+        return _limit ? stop_iteration::no : stop_iteration::yes;
+    });
+}
+
+class reconcilable_result_builder {
+    querying_reader _reader;
+    std::vector<partition> _result;
+    uint32_t _total = 0;
+public:
+    reconcilable_result_builder(schema_ptr s,
+        const mutation_source& source,
+        const query::partition_range& range,
+        const query::partition_slice& slice,
+        uint32_t row_limit,
+        gc_clock::time_point query_time)
+            : _reader(std::move(s), source, range, slice, row_limit, query_time, [this] (uint32_t live_rows, mutation&& m) {
+                _result.emplace_back(partition{live_rows, freeze(m)});
+                _total += live_rows;
+            })
+    { }
+
+    reconcilable_result_builder(reconcilable_result_builder&&) = delete; // this captured
+
+    future<reconcilable_result> build() {
+        return _reader.read().then([this] {
+            return make_ready_future<reconcilable_result>(reconcilable_result(_total, std::move(_result)));
+        });
+    }
+};
+
 future<reconcilable_result>
 mutation_query(schema_ptr s,
-    const mutation_source& source,
-    const query::partition_range& range,
-    const query::partition_slice& slice,
-    uint32_t row_limit,
-    gc_clock::time_point query_time)
+               const mutation_source& source,
+               const query::partition_range& range,
+               const query::partition_slice& slice,
+               uint32_t row_limit,
+               gc_clock::time_point query_time)
 {
-    struct query_state {
-        const query::partition_range& range;
-        const query::partition_slice& slice;
-        uint32_t requested_limit;
-        gc_clock::time_point query_time;
-        uint32_t limit;
-        mutation_reader reader;
-        std::vector<partition> result;
-
-        query_state(
-            const query::partition_range& range,
-            const query::partition_slice& slice,
-            uint32_t requested_limit,
-            gc_clock::time_point query_time
-        )
-            : range(range)
-            , slice(slice)
-            , requested_limit(requested_limit)
-            , query_time(query_time)
-            , limit(requested_limit)
-        { }
-    };
-
    if (row_limit == 0) {
        return make_ready_future<reconcilable_result>(reconcilable_result());
    }

-    return do_with(query_state(range, slice, row_limit, query_time),
-                   [&source, s = std::move(s)] (query_state& state) -> future<reconcilable_result> {
-        state.reader = source(std::move(s), state.range, service::get_local_sstable_query_read_priority());
-        return consume(state.reader, [&state] (mutation&& m) {
-            // FIXME: Make data sources respect row_ranges so that we don't have to filter them out here.
-            auto is_distinct = state.slice.options.contains(query::partition_slice::option::distinct);
-            auto is_reversed = state.slice.options.contains(query::partition_slice::option::reversed);
-            auto limit = !is_distinct ? state.limit : 1;
-            auto rows_left = m.partition().compact_for_query(*m.schema(), state.query_time, state.slice.row_ranges(*m.schema(), m.key()),
-                is_reversed, limit);
-            state.limit -= rows_left;
-
-            if (rows_left || !m.partition().empty()) {
-                // NOTE: We must return all columns, regardless of what's in
-                // partition_slice, for the results to be reconcilable with tombstones.
-                // That's because row's presence depends on existence of any
-                // column in a row (See mutation_partition::query). We could
-                // optimize this case and only send cell timestamps, without data,
-                // for the cells which are not queried for (TODO).
-                state.result.emplace_back(partition{rows_left, freeze(m)});
-            }
-
-            return state.limit ? stop_iteration::no : stop_iteration::yes;
-        }).then([&state] {
-            return make_ready_future<reconcilable_result>(
-                reconcilable_result(state.requested_limit - state.limit, std::move(state.result)));
-        });
-    });
+    auto b_ptr = std::make_unique<reconcilable_result_builder>(std::move(s), source, range, slice, row_limit, query_time);
+    auto& b = *b_ptr;
+    return b.build().finally([keep = std::move(b_ptr)] {});
 }

 std::ostream& operator<<(std::ostream& out, const reconcilable_result::printer& pr) {
--- a/mutation_query.hh
+++ b/mutation_query.hh
@@ -114,3 +114,26 @@ future<reconcilable_result> mutation_query(
    const query::partition_slice& slice,
    uint32_t row_limit,
    gc_clock::time_point query_time);
+
+
+class querying_reader {
+    schema_ptr _schema;
+    const query::partition_range& _range;
+    const query::partition_slice& _slice;
+    uint32_t _requested_limit;
+    gc_clock::time_point _query_time;
+    uint32_t _limit;
+    const mutation_source& _source;
+    std::function<void(uint32_t, mutation&&)> _consumer;
+    std::experimental::optional<mutation_reader> _reader;
+public:
+    querying_reader(schema_ptr s,
+                    const mutation_source& source,
+                    const query::partition_range& range,
+                    const query::partition_slice& slice,
+                    uint32_t row_limit,
+                    gc_clock::time_point query_time,
+                    std::function<void(uint32_t, mutation&&)> consumer);
+
+    future<> read();
+};
--- a/partition_slice_builder.cc
+++ b/partition_slice_builder.cc
@@ -127,3 +127,15 @@ partition_slice_builder::reversed() {
    _options.set<query::partition_slice::option::reversed>();
    return *this;
 }
+
+partition_slice_builder&
+partition_slice_builder::without_partition_key_columns() {
+    _options.remove<query::partition_slice::option::send_partition_key>();
+    return *this;
+}
+
+partition_slice_builder&
+partition_slice_builder::without_clustering_key_columns() {
+    _options.remove<query::partition_slice::option::send_clustering_key>();
+    return *this;
+}
--- a/partition_slice_builder.hh
+++ b/partition_slice_builder.hh
@@ -50,6 +50,8 @@ public:
    partition_slice_builder& with_regular_column(bytes name);
    partition_slice_builder& with_no_regular_columns();
    partition_slice_builder& with_range(query::clustering_range range);
+    partition_slice_builder& without_partition_key_columns();
+    partition_slice_builder& without_clustering_key_columns();
    partition_slice_builder& reversed();

    query::partition_slice build();
--- a/query-result-set.cc
+++ b/query-result-set.cc
@@ -201,7 +201,7 @@ result_set::from_raw_result(schema_ptr s, const partition_slice& slice, const re

 result_set::result_set(const mutation& m) : result_set([&m] {
    auto slice = partition_slice_builder(*m.schema()).build();
-    auto qr = m.query(slice, result_request::only_result);
+    auto qr = mutation(m).query(slice, result_request::only_result);
    return result_set::from_raw_result(m.schema(), slice, qr);
 }())
 { }
--- a/query-result-set.hh
+++ b/query-result-set.hh
@@ -83,6 +83,7 @@ public:
        }
        throw null_column_value(column_name);
    }
+    const std::unordered_map<sstring, data_value>& cells() const { return _cells; }
    friend inline bool operator==(const result_set_row& x, const result_set_row& y);
    friend inline bool operator!=(const result_set_row& x, const result_set_row& y);
    friend std::ostream& operator<<(std::ostream& out, const result_set_row& row);
--- a/repair/repair.cc
+++ b/repair/repair.cc
@@ -33,6 +33,7 @@
 #include <boost/algorithm/string/classification.hpp>

 #include <cryptopp/sha.h>
+#include <seastar/core/gate.hh>

 static logging::logger logger("repair");

@@ -326,7 +327,7 @@ static future<partition_checksum> checksum_range_shard(database &db,
        const ::range<dht::token>& range) {
    auto& cf = db.find_column_family(keyspace_name, cf_name);
    return do_with(query::to_partition_range(range), [&cf] (const auto& partition_range) {
-        return do_with(cf.make_reader(cf.schema(), partition_range, service::get_local_mutation_stream_priority()), partition_checksum(),
+        return do_with(cf.make_reader(cf.schema(), partition_range, service::get_local_streaming_read_priority()), partition_checksum(),
            [] (auto& reader, auto& checksum) {
            return repeat([&reader, &checksum] () {
                return reader().then([&checksum] (auto mopt) {
@@ -415,6 +416,21 @@ static void split_and_add(std::vector<::range<dht::token>>& ranges,
    ranges.push_back(halves.first);
    ranges.push_back(halves.second);
 }
+// We don't need to wait for one checksum to finish before we start the
+// next, but doing too many of these operations in parallel also doesn't
+// make sense, so we limit the number of concurrent ongoing checksum
+// requests with a semaphore.
+//
+// FIXME: We shouldn't use a magic number here, but rather bind it to
+// some resource. Otherwise we'll be doing too little in some machines,
+// and too much in others.
+//
+// FIXME: This would be better off in a repair service, or even a per-shard
+// repair instance holding all repair state. However, since we are anyway
+// considering ditching those semaphores for a more fine-grained resource-based
+// solution, let's do the simplest thing here and change it later.
+constexpr int parallelism = 100;
+static thread_local semaphore parallelism_semaphore(parallelism);

 // Repair a single cf in a single local range.
 // Comparable to RepairJob in Origin.
@@ -461,21 +477,14 @@ static future<> repair_cf_range(seastar::sharded<database>& db,
        split_and_add(ranges, range, estimated_partitions, 100);
    }

-    // We don't need to wait for one checksum to finish before we start the
-    // next, but doing too many of these operations in parallel also doesn't
-    // make sense, so we limit the number of concurrent ongoing checksum
-    // requests with a semaphore.
-    //
-    // FIXME: We shouldn't use a magic number here, but rather bind it to
-    // some resource. Otherwise we'll be doing too little in some machines,
-    // and too much in others.
-    constexpr int parallelism = 10;
-    return do_with(semaphore(parallelism), true, std::move(keyspace), std::move(cf), std::move(ranges),
-        [&db, &neighbors, parallelism] (auto& sem, auto& success, const auto& keyspace, const auto& cf, const auto& ranges) {
-        return do_for_each(ranges, [&sem, &success, &db, &neighbors, &keyspace, &cf]
+    return do_with(seastar::gate(), true, std::move(keyspace), std::move(cf), std::move(ranges),
+        [&db, &neighbors] (auto& completion, auto& success, const auto& keyspace, const auto& cf, const auto& ranges) {
+        return do_for_each(ranges, [&completion, &success, &db, &neighbors, &keyspace, &cf]
                           (const auto& range) {
+
            check_in_shutdown();
-            return sem.wait(1).then([&sem, &success, &db, &neighbors, &keyspace, &cf, &range] {
+            return parallelism_semaphore.wait(1).then([&completion, &success, &db, &neighbors, &keyspace, &cf, &range] {
+
                // Ask this node, and all neighbors, to calculate checksums in
                // this range. When all are done, compare the results, and if
                // there are any differences, sync the content of this range.
@@ -487,6 +496,8 @@ static future<> repair_cf_range(seastar::sharded<database>& db,
                            net::get_local_messaging_service().send_repair_checksum_range(
                                    net::msg_addr{neighbor},keyspace, cf, range));
                }
+
+                completion.enter();
                when_all(checksums.begin(), checksums.end()).then(
                        [&db, &keyspace, &cf, &range, &neighbors, &success]
                        (std::vector<future<partition_checksum>> checksums) {
@@ -532,10 +543,13 @@ static future<> repair_cf_range(seastar::sharded<database>& db,
                    // tell the caller.
                    success = false;
                    logger.warn("Failed sync of range {}: {}", range, eptr);
-                }).finally([&sem] { sem.signal(1); });
+                }).finally([&completion] {
+                    parallelism_semaphore.signal(1);
+                    completion.leave(); // notify do_for_each that we're done
+                });
            });
-        }).finally([&sem, &success, parallelism] {
-            return sem.wait(parallelism).then([&success] {
+        }).finally([&success, &completion] {
+            return completion.close().then([&success] {
                return success ? make_ready_future<>() :
                        make_exception_future<>(std::runtime_error("Checksum or sync of partial range failed"));
            });
--- a/reversibly_mergeable.hh
+++ b/reversibly_mergeable.hh
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2016 Cloudius Systems, Ltd.
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "utils/allocation_strategy.hh"
+#include <seastar/util/defer.hh>
+
+//
+// ~~ Definitions ~~
+//
+// Mergeable type is a type which has an associated "apply" binary operation (T x T -> T)
+// which forms a commutative semigroup with instances of that type.
+//
+// ReversiblyMergeable type is a Mergeable type which has two binary operations associated,
+// "apply_reversibly" and "revert", both working on objects of that type (T x T -> T x T)
+// with the following properties:
+//
+//   apply_reversibly(x, y) = (x', y')
+//   revert(x', y') = (x'', y'')
+//
+//   x'  = apply(x, y)
+//   x'' = x
+//   apply(x'', y'') = apply(x, y)
+//
+// Note that it is not guaranteed that y'' = y and the state of y' is unspecified.
+//
+// ~~ API ~~
+//
+// "apply_reversibly" and "revert" are usually implemented as instance methods or functions
+// mutating both arguments to store the result of the operation in them.
+//
+// "revert" is not allowed to throw. If "apply_reversibly" throws the objects on which it operates
+// are left in valid states, with guarantees the same as if a successful apply_reversibly() was
+// followed by revert().
+//
+
+
+template<typename T>
+struct default_reversible_applier {
+    void operator()(T& dst, T& src) const {
+        dst.apply_reversibly(src);
+    }
+};
+
+template<typename T>
+struct default_reverter {
+    void operator()(T& dst, T& src) const noexcept {
+        dst.revert(src);
+    }
+};
--- a/scripts/scylla_install_pkg
+++ b/scripts/scylla_install_pkg
@@ -36,14 +36,29 @@ done
 . /etc/os-release

 if [ "$ID" = "ubuntu" ]; then
-    if [ "$LOCAL_PKG" = "" ]; then
+    echo "#!/bin/sh" >> /usr/sbin/policy-rc.d
+    echo "exit 101" >> /usr/sbin/policy-rc.d
+    chmod +x /usr/sbin/policy-rc.d
+    cp /etc/hosts /etc/hosts.orig
+    echo 127.0.0.1 `hostname` >> /etc/hosts
+    if [ $UNSTABLE -eq 0 ]; then
        echo "deb http://s3.amazonaws.com/downloads.scylladb.com/deb/ubuntu trusty/scylladb multiverse" > /etc/apt/sources.list.d/scylla.list
-        apt-get update
+    else
+        echo "deb https://s3.amazonaws.com/downloads.scylladb.com/deb/unstable/ubuntu/master/latest trusty/scylladb multiverse" > /etc/apt/sources.list.d/scylla.list
+    fi
+    apt-get update
+    if [ "$LOCAL_PKG" = "" ]; then
        apt-get install -y --force-yes scylla-server scylla-jmx scylla-tools
    else
-        apt-get install -y --force-yes gdebi-core
-        gdebi $LOCAL_PKG/scylla-server*.deb $LOCAL_PKG/scylla-jmx*.deb $LOCAL_PKG/scylla-tools*.deb
+        if [ ! -f /usr/bin/gdebi ]; then
+            apt-get install -y --force-yes gdebi-core
+        fi
+        echo Y | gdebi $LOCAL_PKG/scylla-server*.deb
+        echo Y | gdebi $LOCAL_PKG/scylla-jmx*.deb
+        echo Y | gdebi $LOCAL_PKG/scylla-tools*.deb
    fi
+    mv /etc/hosts.orig /etc/hosts
+    rm /usr/sbin/policy-rc.d
 else
    if [ "$ID" = "fedora" ]; then
        if [ $UNSTABLE -eq 0 ]; then
--- a/2
+++ b/2
--- a/service/priority_manager.hh
+++ b/service/priority_manager.hh
@@ -26,7 +26,8 @@ namespace service {
 class priority_manager {
    ::io_priority_class _commitlog_priority;
    ::io_priority_class _mt_flush_priority;
-    ::io_priority_class _mut_stream_priority;
+    ::io_priority_class _stream_read_priority;
+    ::io_priority_class _stream_write_priority;
    ::io_priority_class _sstable_query_read;
    ::io_priority_class _compaction_priority;

@@ -42,8 +43,13 @@ public:
    }

    const ::io_priority_class&
-    mutation_stream_priority() {
-        return _mut_stream_priority;
+    streaming_read_priority() {
+        return _stream_read_priority;
+    }
+
+    const ::io_priority_class&
+    streaming_write_priority() {
+        return _stream_write_priority;
    }

    const ::io_priority_class&
@@ -59,7 +65,8 @@ public:
    priority_manager()
        : _commitlog_priority(engine().register_one_priority_class("commitlog", 100))
        , _mt_flush_priority(engine().register_one_priority_class("memtable_flush", 100))
-        , _mut_stream_priority(engine().register_one_priority_class("streaming", 100))
+        , _stream_read_priority(engine().register_one_priority_class("streaming_read", 20))
+        , _stream_write_priority(engine().register_one_priority_class("streaming_write", 20))
        , _sstable_query_read(engine().register_one_priority_class("query", 100))
        , _compaction_priority(engine().register_one_priority_class("compaction", 100))

@@ -78,8 +85,13 @@ get_local_memtable_flush_priority() {
 }

 const inline ::io_priority_class&
-get_local_mutation_stream_priority() {
-    return get_local_priority_manager().mutation_stream_priority();
+get_local_streaming_read_priority() {
+    return get_local_priority_manager().streaming_read_priority();
+}
+
+const inline ::io_priority_class&
+get_local_streaming_write_priority() {
+    return get_local_priority_manager().streaming_write_priority();
 }

 const inline ::io_priority_class&
--- a/service/storage_proxy.cc
+++ b/service/storage_proxy.cc
@@ -835,6 +835,15 @@ storage_proxy::mutate_locally(std::vector<mutation> mutations) {
    });
 }

+future<>
+storage_proxy::mutate_streaming_mutation(const schema_ptr& s, const frozen_mutation& m) {
+    auto shard = _db.local().shard_of(m);
+    return _db.invoke_on(shard, [&m, gs = global_schema_ptr(s)] (database& db) mutable -> future<> {
+        return db.apply_streaming_mutation(gs, m);
+    });
+}
+
+
 /**
 * Helper for create_write_response_handler, shared across mutate/mutate_atomically.
 * Both methods do roughly the same thing, with the latter intermixing batch log ops
--- a/service/storage_proxy.hh
+++ b/service/storage_proxy.hh
@@ -181,6 +181,8 @@ public:
    future<> mutate_locally(const schema_ptr&, const frozen_mutation& m);
    future<> mutate_locally(std::vector<mutation> mutations);

+    future<> mutate_streaming_mutation(const schema_ptr&, const frozen_mutation& m);
+
    /**
    * Use this method to have these Mutations applied
    * across all replicas. This method will take care
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -749,14 +749,14 @@ void storage_service::on_join(gms::inet_address endpoint, gms::endpoint_state ep
        on_change(endpoint, e.first, e.second);
    }
    get_local_migration_manager().schedule_schema_pull(endpoint, ep_state).handle_exception([endpoint] (auto ep) {
-        logger.warn("Fail to pull schmea from {}: {}", endpoint, ep);
+        logger.warn("Fail to pull schema from {}: {}", endpoint, ep);
    });
 }

 void storage_service::on_alive(gms::inet_address endpoint, gms::endpoint_state state) {
    logger.debug("endpoint={} on_alive", endpoint);
    get_local_migration_manager().schedule_schema_pull(endpoint, state).handle_exception([endpoint] (auto ep) {
-        logger.warn("Fail to pull schmea from {}: {}", endpoint, ep);
+        logger.warn("Fail to pull schema from {}: {}", endpoint, ep);
    });
    if (_token_metadata.is_member(endpoint)) {
 #if 0
@@ -813,7 +813,7 @@ void storage_service::on_change(inet_address endpoint, application_state state,
            do_update_system_peers_table(endpoint, state, value);
            if (state == application_state::SCHEMA) {
                get_local_migration_manager().schedule_schema_pull(endpoint, *ep_state).handle_exception([endpoint] (auto ep) {
-                    logger.warn("Fail to pull schmea from {}: {}", endpoint, ep);
+                    logger.warn("Failed to pull schema from {}: {}", endpoint, ep);
                });
            }
        }
@@ -2481,7 +2481,7 @@ void storage_service::add_expire_time_if_found(inet_address endpoint, int64_t ex
 // in there.
 future<> storage_service::load_new_sstables(sstring ks_name, sstring cf_name) {
    class max_element {
-        int64_t _result = 1;
+        int64_t _result = 0;
    public:
        future<> operator()(int64_t value) {
            _result = std::max(value, _result);
@@ -2514,18 +2514,37 @@ future<> storage_service::load_new_sstables(sstring ks_name, sstring cf_name) {
        auto& cf = db.find_column_family(ks_name, cf_name);
        return cf.disable_sstable_write();
    }).then([this, cf_name, ks_name] (int64_t max_seen_sstable) {
-        logger.debug("Loading new sstables with generation numbers larger or equal than {}", max_seen_sstable);
        // Then, we will reshuffle the tables to make sure that the generation numbers don't go too high.
        // We will do all of it the same CPU, to make sure that we won't have two parallel shufflers stepping
        // onto each other.
-        //
-        // Note that this will reshuffle all tables, including existing ones. Figuring out which of the tables
-        // are new would require coordination between all shards, so it is simpler this way. Renaming an existing
-        // SSTable shouldn't be that bad, and we are assuming empty directory for normal operation anyway.
-        auto shard = std::hash<sstring>()(cf_name) % smp::count;
-        return _db.invoke_on(shard, [ks_name, cf_name, max_seen_sstable] (database& db) {
+
+        class all_generations {
+            std::set<int64_t> _result;
+        public:
+            future<> operator()(std::set<int64_t> value) {
+                _result.insert(value.begin(), value.end());
+                return make_ready_future<>();
+            }
+            std::set<int64_t> get() && {
+                return _result;
+            }
+        };
+
+        // We provide to reshuffle_sstables() the generation of all existing sstables, such that it will
+        // easily know which sstables are new.
+        return _db.map_reduce(all_generations(), [ks_name, cf_name] (database& db) {
            auto& cf = db.find_column_family(ks_name, cf_name);
-            return cf.reshuffle_sstables(max_seen_sstable);
+            std::set<int64_t> generations;
+            for (auto& p : *(cf.get_sstables())) {
+                generations.insert(p.second->generation());
+            }
+            return make_ready_future<std::set<int64_t>>(std::move(generations));
+        }).then([this, max_seen_sstable, ks_name, cf_name] (std::set<int64_t> all_generations) {
+            auto shard = std::hash<sstring>()(cf_name) % smp::count;
+            return _db.invoke_on(shard, [ks_name, cf_name, max_seen_sstable, all_generations = std::move(all_generations)] (database& db) {
+                auto& cf = db.find_column_family(ks_name, cf_name);
+                return cf.reshuffle_sstables(std::move(all_generations), max_seen_sstable + 1);
+            });
        });
    }).then_wrapped([this, ks_name, cf_name] (future<std::vector<sstables::entry_descriptor>> f) {
        std::vector<sstables::entry_descriptor> new_tables;
--- a/sstables/compaction.cc
+++ b/sstables/compaction.cc
@@ -139,7 +139,7 @@ compact_sstables(std::vector<shared_sstable> sstables, column_family& cf, std::f

    db::replay_position rp;

-    auto all_sstables = cf.get_sstables();
+    auto all_sstables = cf.get_sstables_including_compacted_undeleted();
    std::sort(sstables.begin(), sstables.end(), [] (const shared_sstable& x, const shared_sstable& y) {
        return x->generation() < y->generation();
    });
--- a/sstables/compress.cc
+++ b/sstables/compress.cc
@@ -229,9 +229,14 @@ public:
            : _compression_metadata(cm)
    {
        _beg_pos = pos;
-        if (pos >= _compression_metadata->data_len) {
+        if (pos > _compression_metadata->data_len) {
            throw std::runtime_error("attempt to uncompress beyond end");
        }
+        if (len == 0 || pos == _compression_metadata->data_len) {
+            // Nothing to read
+            _end_pos = _pos = _beg_pos;
+            return;
+        }
        if (len <= _compression_metadata->data_len - pos) {
            _end_pos = pos + len;
        } else {
--- a/sstables/index_reader.hh
+++ b/sstables/index_reader.hh
@@ -42,10 +42,16 @@ public:
    }
 };

-class index_consume_entry_context: public data_consumer::continuous_data_consumer<index_consume_entry_context> {
+// IndexConsumer is a concept that implements:
+//
+// bool should_continue();
+// void consume_entry(index_entry&& ie);
+template <class IndexConsumer>
+class index_consume_entry_context: public data_consumer::continuous_data_consumer<index_consume_entry_context<IndexConsumer>> {
    using proceed = data_consumer::proceed;
+    using continuous_data_consumer = data_consumer::continuous_data_consumer<index_consume_entry_context<IndexConsumer>>;
 private:
-    index_consumer& _consumer;
+    IndexConsumer& _consumer;

    enum class state {
        START,
@@ -66,7 +72,7 @@ public:

    bool non_consuming() const {
        return ((_state == state::CONSUME_ENTRY) || (_state == state::START) ||
-                ((_state == state::PROMOTED_BYTES) && (_prestate == prestate::NONE)));
+                ((_state == state::PROMOTED_BYTES) && (continuous_data_consumer::_prestate == continuous_data_consumer::prestate::NONE)));
    }

    proceed process_state(temporary_buffer<char>& data) {
@@ -79,32 +85,32 @@ public:
            _state = state::KEY_SIZE;
            break;
        case state::KEY_SIZE:
-            if (read_16(data) != read_status::ready) {
+            if (this->read_16(data) != continuous_data_consumer::read_status::ready) {
                _state = state::KEY_BYTES;
                break;
            }
        case state::KEY_BYTES:
-            if (read_bytes(data, _u16, _key) != read_status::ready) {
+            if (this->read_bytes(data, this->_u16, _key) != continuous_data_consumer::read_status::ready) {
                _state = state::POSITION;
                break;
            }
        case state::POSITION:
-            if (read_64(data) != read_status::ready) {
+            if (this->read_64(data) != continuous_data_consumer::read_status::ready) {
                _state = state::PROMOTED_SIZE;
                break;
            }
        case state::PROMOTED_SIZE:
-            if (read_32(data) != read_status::ready) {
+            if (this->read_32(data) != continuous_data_consumer::read_status::ready) {
                _state = state::PROMOTED_BYTES;
                break;
            }
        case state::PROMOTED_BYTES:
-            if (read_bytes(data, _u32, _promoted) != read_status::ready) {
+            if (this->read_bytes(data, this->_u32, _promoted) != continuous_data_consumer::read_status::ready) {
                _state = state::CONSUME_ENTRY;
                break;
            }
        case state::CONSUME_ENTRY:
-            _consumer.consume_entry(index_entry(std::move(_key), _u64, std::move(_promoted)));
+            _consumer.consume_entry(index_entry(std::move(_key), this->_u64, std::move(_promoted)));
            _state = state::START;
            break;
        default:
@@ -113,7 +119,7 @@ public:
        return proceed::yes;
    }

-    index_consume_entry_context(index_consumer& consumer,
+    index_consume_entry_context(IndexConsumer& consumer,
            input_stream<char>&& input, uint64_t maxlen)
        : continuous_data_consumer(std::move(input), maxlen)
        , _consumer(consumer)
--- a/sstables/key.hh
+++ b/sstables/key.hh
@@ -57,21 +57,18 @@ enum class composite_marker : bytes::value_type {
    end_range = 1,
 };

-inline void check_marker(bytes_view component, composite_marker expected) {
+// Sanity-checks the last byte of a serialized composite component: it must be
+// one of the known composite_marker values (none, start_range, end_range).
+// Throws runtime_exception otherwise. Which of the three markers is present is
+// deliberately not checked here.
+// NOTE(review): component.back() assumes a non-empty component - confirm callers.
+inline void check_marker(bytes_view component) {
    auto found = composite_marker(component.back());
-    if (found != expected) {
-        throw runtime_exception(sprint("Unexpected marker. Found %d, expected %d\n", uint8_t(found), uint8_t(expected)));
+    switch (found) {
+    case composite_marker::none:
+    case composite_marker::start_range:
+    case composite_marker::end_range:
+        break;
+    default:
+        // Exactly one value is formatted, so the format string carries exactly one
+        // directive (a second, argument-less "%d" would make formatting fail).
+        throw runtime_exception(sprint("Unexpected marker. Found %d\n", uint16_t(uint8_t(found))));
    }
 }

-inline void check_marker(bytes_view component, composite_marker expected, composite_marker alternative) {
-    auto found = composite_marker(component.back());
-    if ((found == expected) || (found == alternative)) {
-        return;
-    }
-    throw runtime_exception(sprint("Unexpected marker. Found %d, expected %d or %d\n", uint8_t(found), uint8_t(expected)));
-}
-
 // Our internal representation differs slightly (in the way it serializes) from Origin.
 // In order to be able to achieve read and write compatibility for sstables - so they can
 // be imported and exported - we need to always convert a key to this representation.
--- a/sstables/partition.cc
+++ b/sstables/partition.cc
@@ -249,6 +249,139 @@ class mp_row_consumer : public row_consumer {
            _pending_collection = {};
        }
    }
+
+    // Bookkeeping for one range tombstone handled by update_range_tombstone_merger():
+    // the serialized start bound (_data), the serialized end bound (_end) and the
+    // tombstone's deletion time. Every comparison below ignores the last byte of
+    // a bound, so two bounds that differ only in that byte compare equal.
+    // An empty bound never compares equal to, prefixes, or matches anything;
+    // this also keeps remove_suffix(1) from being applied to an empty view.
+    class range_merger {
+        bytes _data;
+        bytes _end;
+        sstables::deletion_time _deletion_time;
+    public:
+        // Returns the start bound as an rvalue reference so the caller can move it out.
+        bytes&& data() {
+            return std::move(_data);
+        }
+        // True when a start bound is present.
+        explicit operator bool() const noexcept {
+            return !_data.empty();
+        }
+        // Human-readable form used when building error messages.
+        explicit operator sstring() const {
+            if (*this) {
+                return to_hex(_data) + sprint(" deletion (%x,%lx)", _deletion_time.local_deletion_time, _deletion_time.marked_for_delete_at);
+            } else {
+                return sstring("(null)");
+            }
+        }
+        // View of the start bound, including its last byte.
+        explicit operator bytes_view() const {
+            return _data;
+        }
+
+        // Equal when both start bounds are present, match up to (excluding) their
+        // last byte, and the deletion times are equal.
+        bool operator==(const range_merger& candidate) {
+            if (!*this || !candidate) {
+                return false;
+            }
+            bytes_view a(_data);
+            bytes_view b(candidate._data);
+            a.remove_suffix(1);
+            b.remove_suffix(1);
+            return ((a == b) && (_deletion_time == candidate._deletion_time));
+        }
+
+        bool operator!=(const range_merger& candidate) {
+            return !(*this == candidate);
+        }
+
+        // True when this start bound, minus its last byte, is a prefix of the
+        // candidate's start bound minus its last byte. Deletion times are not compared.
+        bool is_prefix_of(const range_merger& candidate) {
+            if (!*this || !candidate) {
+                return false;
+            }
+            bytes_view a(_data);
+            bytes_view b(candidate._data);
+            a.remove_suffix(1);
+            b.remove_suffix(1);
+            return b.compare(0, a.size(), a) == 0;
+        }
+
+        // True when `deltime` equals our deletion time and `candidate` equals our
+        // end bound once the last byte of each is dropped. A merger without a
+        // recorded end (e.g. a default-constructed one) never matches.
+        bool end_matches(bytes_view candidate, sstables::deletion_time deltime) {
+            if (_deletion_time != deltime) {
+                return false;
+            }
+            if (_end.empty() || candidate.empty()) {
+                return false;
+            }
+            bytes_view my_end(_end);
+            my_end.remove_suffix(1);
+            candidate.remove_suffix(1);
+            return my_end == candidate;
+        }
+
+        // Replaces the recorded end bound with a copy of `end`.
+        void set_end(bytes_view end) {
+            _end = to_bytes(end);
+        }
+
+        range_merger(bytes_view start, bytes_view end, sstables::deletion_time d)
+            : _data(to_bytes(start))
+            , _end(to_bytes(end))
+            , _deletion_time(d)
+        {}
+        // Empty merger: no bounds, value-initialized deletion time.
+        range_merger() : _data(), _end(), _deletion_time() {}
+    };
+
+    // Stack used for tracking tombstone merging in consume_range_tombstone().
+    // Each entry holds serialized composites (the start and end of a range).
+    std::stack<range_merger> _starts;
+
+    void reset_range_tombstone_merger() {
+        // Called at the start and end of any row. Finding unfinished merges here
+        // is an error: throwing is what keeps us from falsely reporting a real
+        // range tombstone as a merger.
+        if (_starts.empty()) {
+            return;
+        }
+        sstring msg("RANGE DELETE not implemented. Tried to merge, but row finished before we could finish the merge. Starts found: (");
+        // Drain the stack, listing every pending start; entries are separated by " , ".
+        for (;;) {
+            msg += sstring(_starts.top());
+            _starts.pop();
+            if (_starts.empty()) {
+                break;
+            }
+            msg += sstring(" , ");
+        }
+        msg += sstring(")");
+        throw malformed_sstable_exception(msg);
+    }
+
+    bytes close_merger_range() {
+        // We closed a larger enclosing row: take the start bound of the range on
+        // top of the stack and drop that entry. _starts must not be empty here.
+        bytes closed(_starts.top().data());
+        _starts.pop();
+        return closed;
+    }
+
+    bytes update_range_tombstone_merger(bytes_view _start, bytes_view end,
+                                        sstables::deletion_time deltime) { // Feeds one range tombstone to the merger; returns the start bound of a range that got closed, or empty bytes if none was.
+        range_merger start(_start, end, deltime);
+        range_merger empty; // placeholder used as "previous range" when the stack is empty
+
+        // If we're already processing a range (_starts is not empty), it's fine to
+        // start processing another, but only so long as we're nesting: the range
+        // currently being processed must be a prefix of the new one.
+        if (!_starts.empty() && !_starts.top().is_prefix_of(start)) {
+            auto msg = sstring("RANGE DELETE not implemented. Tried to merge, but existing range not a prefix of new one. Current range: ");
+            msg += sstring(_starts.top());
+            msg += ". new range: " + sstring(start);
+            throw malformed_sstable_exception(msg);
+        }
+
+        range_merger& prev = empty;
+        if (!_starts.empty()) {
+            prev = _starts.top(); // NOTE(review): a reference cannot be reseated; this copy-assigns top() into `empty`, so set_end() below never reaches the stack entry - confirm intent
+        }
+        _starts.push(start);
+
+        if (prev.end_matches(bytes_view(start), deltime)) {
+            // The new start continues where prev ended, with the same deletion time:
+            // we're in the middle of trying to merge several contiguous range
+            // tombstones. If there's a gap, we cannot represent this range in Scylla.
+            prev.set_end(end);
+            // We pop what we have just inserted, because that's not starting the
+            // processing of any new range.
+            _starts.pop();
+        }
+        if (_starts.top().end_matches(end, deltime)) { // NOTE(review): if top() is the `start` pushed above, its end and deletion time were built from these same arguments - verify this test is not trivially true
+            return close_merger_range();
+        }
+        return {}; // nothing closed yet
+    }
 public:
    mutation_opt mut;

@@ -366,39 +499,77 @@ public:
        }
    }
    virtual proceed consume_row_end() override {
+        reset_range_tombstone_merger(); // throws malformed_sstable_exception if a tombstone merge is still pending at the end of the row
        if (mut) {
            flush_pending_collection(*_schema, *mut);
        }
        return proceed::no;
    }

+    // Partial support for range tombstones read from sstables:
+    //
+    // Currently, Scylla does not support generic range tombstones: Only
+    // ranges which are a complete clustering-key prefix are supported because
+    // our in-memory data structure only allows deleted rows (prefixes).
+    // In principle, this is good enough because in Cassandra 2 (whose
+    // sstables we support) and using only CQL, there is no way to delete a
+    // generic range, because the DELETE and UPDATE statement's "WHERE" only
+    // takes the "=" operator, leading to a deletion of entire rows.
+    //
+    // However, in one important case the sstable written by Cassandra does
+    // have a generic range tombstone, which we can and must handle:
+    // Consider two tombstones, one deleting a bigger prefix than the other:
+    //
+    //     create table tab (pk text, ck1 text, ck2 text, data text, primary key(pk, ck1, ck2));
+    //     delete from tab where pk = 'pk' and ck1 = 'aaa';
+    //     delete from tab where pk = 'pk' and ck1 = 'aaa' and ck2 = 'bbb';
+    //
+    // The first deletion covers the second, but nevertheless we cannot drop the
+    // smaller one because the two deletions have different timestamps.
+    // Currently in Scylla, we simply keep both tombstones separately.
+    // But Cassandra does something different: Cassandra does not want to have
+    // overlapping range tombstones, so it converts them into non-overlapping
+    // range tombstones (see RangeTombstoneList.java). In the above example,
+    // the resulting sstable is (sstable2json format)
+    //
+    //     [{"key": "pk",
+    //       "cells": [["aaa:_","aaa:bbb:_",1459334681228103,"t",1459334681],
+    //                 ["aaa:bbb:_","aaa:bbb:!",1459334681244989,"t",1459334681],
+    //                 ["aaa:bbb:!","aaa:!",1459334681228103,"t",1459334681]]}
+    //     ]
+    //
+    // In this sstable, the first and third tombstones look like "generic" ranges,
+    // not covering an entire prefix, so we cannot represent these three
+    // tombstones in our in-memory data structure. Instead, we need to convert the
+    // three non-overlapping tombstones to two overlapping whole-prefix tombstones,
+    // the two we started with in the "delete" commands above.
+    // This is what the code below does. If after trying to recombine split
+    // tombstones we are still left with a generic range we cannot represent,
+    // we fail the read.
+
    virtual void consume_range_tombstone(
            bytes_view start_col, bytes_view end_col,
            sstables::deletion_time deltime) override {
-        check_marker(end_col, composite_marker::end_range);
-        // Some versions of Cassandra will write a 0 to mark the start of the range.
-        // CASSANDRA-7593 discusses that.
-        check_marker(start_col, composite_marker::start_range, composite_marker::none);
+        // We used to check that start_col has composite_marker::start_range
+        // and end_col has composite_marker::end_range. But this check is
+        // incorrect. start_col may have composite_marker::none in sstables
+        // from older versions of Cassandra (see CASSANDRA-7593) and we also
+        // saw composite_marker::none in end_col. Also, when a larger range
+        // tombstone was split (see explanation above), we can have a
+        // start_range in end_col or end_range in start_col.
+        // So we don't check the markers' content at all here, only if they
+        // are sane.
+        check_marker(start_col);
+        check_marker(end_col);

-        // FIXME: CASSANDRA-6237 says support will be added to things like this.
-        //
-        // The check below represents a range with a different start and end
-        // clustering key.  Cassandra-generated files (to the moment) will
-        // generate multi-row deletes, but they always have the same clustering
-        // key. This is basically because one can't (yet) write delete
-        // statements in which the WHERE clause looks like WHERE clustering_key >= x.
-        //
-        // We don't really have it in our model ATM, so let's just mark this unimplemented.
-        //
-        // The only expected difference between them, is the final marker. We
-        // will remove it from end_col to ease the comparison, but will leave
-        // start_col untouched to make sure explode() still works.
-        end_col.remove_suffix(1);
-        if (start_col.compare(0, end_col.size(), end_col)) {
-            fail(unimplemented::cause::RANGE_DELETES);
+        bytes new_start = {};
+        new_start = update_range_tombstone_merger(start_col, end_col, deltime);
+        if (new_start.empty()) {
+            return;
        }
-
+        start_col = bytes_view(new_start);
        auto start = composite_view(column::fix_static_name(start_col)).explode();
+
        // Note how this is slightly different from the check in is_collection. Collection tombstones
        // do not have extra data.
        //
@@ -466,6 +637,13 @@ sstables::sstable::read_row(schema_ptr schema, const sstables::key& key, const i
    auto token = partitioner.get_token(key_view(key));

    auto& summary = _summary;
+
+    if (token < partitioner.get_token(key_view(summary.first_key.value))
+            || token > partitioner.get_token(key_view(summary.last_key.value))) {
+        _filter_tracker.add_false_positive();
+        return make_ready_future<mutation_opt>();
+    }
+
    auto summary_idx = adjust_binary_search_index(binary_search(summary.entries, key, token));
    if (summary_idx < 0) {
        _filter_tracker.add_false_positive();
@@ -495,52 +673,59 @@ class mutation_reader::impl {
 private:
    mp_row_consumer _consumer;
    std::experimental::optional<data_consume_context> _context;
-    std::experimental::optional<future<data_consume_context>> _context_future;
+    std::function<future<data_consume_context> ()> _get_context;
 public:
    impl(sstable& sst, schema_ptr schema, uint64_t start, uint64_t end,
         const io_priority_class &pc)
        : _consumer(schema, pc)
-        , _context(sst.data_consume_rows(_consumer, start, end)) { }
+        , _get_context([&sst, this, start, end] {
+            return make_ready_future<data_consume_context>(sst.data_consume_rows(_consumer, start, end));
+        }) { }
    impl(sstable& sst, schema_ptr schema,
         const io_priority_class &pc)
        : _consumer(schema, pc)
-        , _context(sst.data_consume_rows(_consumer)) { }
-    impl(sstable& sst, schema_ptr schema, future<uint64_t> start, future<uint64_t> end, const io_priority_class& pc)
+        , _get_context([this, &sst] {
+            return make_ready_future<data_consume_context>(sst.data_consume_rows(_consumer));
+        }) { }
+    impl(sstable& sst, schema_ptr schema, std::function<future<uint64_t>()> start, std::function<future<uint64_t>()> end, const io_priority_class& pc)
        : _consumer(schema, pc)
-        , _context_future(start.then([this, &sst, end = std::move(end)] (uint64_t start) mutable {
-                      return end.then([this, &sst, start] (uint64_t end) mutable {
-                          return sst.data_consume_rows(_consumer, start, end);
-                      });
-                    })) { }
-    impl() : _consumer() { }
+        , _get_context([this, &sst, start = std::move(start), end = std::move(end)] () {
+            return start().then([this, &sst, end = std::move(end)] (uint64_t start) {
+                return end().then([this, &sst, start] (uint64_t end) {
+                    return make_ready_future<data_consume_context>(sst.data_consume_rows(_consumer, start, end));
+                });
+            });
+        }) { }
+    impl() : _consumer(), _get_context() { }

    // Reference to _consumer is passed to data_consume_rows() in the constructor so we must not allow move/copy
    impl(impl&&) = delete;
    impl(const impl&) = delete;

    future<mutation_opt> read() {
-        if (_context) {
-            return _context->read().then([this] {
-                // We want after returning a mutation that _consumer.mut()
-                // will be left in unengaged state (so on EOF we return an
-                // unengaged optional). Moving _consumer.mut is *not* enough.
-                auto ret = std::move(_consumer.mut);
-                _consumer.mut = {};
-                return std::move(ret);
-            });
-        } else if (_context_future) {
-            return _context_future->then([this] (auto context) {
-                _context = std::move(context);
-                return _context->read().then([this] {
-                    auto ret = std::move(_consumer.mut);
-                    _consumer.mut = {};
-                    return std::move(ret);
-                });
-            });
-        } else {
+        if (!_get_context) { // a default-constructed impl has no context factory
            // empty mutation reader returns EOF immediately
            return make_ready_future<mutation_opt>();
        }
+
+        if (_context) { // context already created by an earlier read()
+            return do_read();
+        }
+        return (_get_context)().then([this] (data_consume_context context) { // first read: create the context lazily, cache it, then read
+            _context = std::move(context);
+            return do_read();
+        });
+    }
+private:
+    future<mutation_opt> do_read() { // precondition: _context is engaged (it is dereferenced unconditionally below)
+        return _context->read().then([this] {
+            // After returning a mutation we want _consumer.mut to be left in
+            // the unengaged state (so on EOF we return an unengaged
+            // optional). Moving from _consumer.mut is *not* enough.
+            auto ret = std::move(_consumer.mut);
+            _consumer.mut = {};
+            return std::move(ret);
+        });
    }
 };

@@ -649,17 +834,19 @@ sstable::read_range_rows(schema_ptr schema, const query::partition_range& range,
        fail(unimplemented::cause::WRAP_AROUND);
    }

-    future<uint64_t> start = range.start()
-        ? (range.start()->is_inclusive()
+    auto start = [this, range, schema, &pc] {
+        return range.start() ? (range.start()->is_inclusive()
                 ? lower_bound(schema, range.start()->value(), pc)
                 : upper_bound(schema, range.start()->value(), pc))
        : make_ready_future<uint64_t>(0);
+    };

-    future<uint64_t> end = range.end()
-        ? (range.end()->is_inclusive()
+    auto end = [this, range, schema, &pc] {
+        return range.end() ? (range.end()->is_inclusive()
                 ? upper_bound(schema, range.end()->value(), pc)
                 : lower_bound(schema, range.end()->value(), pc))
        : make_ready_future<uint64_t>(data_size());
+    };

    return std::make_unique<mutation_reader::impl>(
        *this, std::move(schema), std::move(start), std::move(end), pc);
--- a/sstables/sstables.cc
+++ b/sstables/sstables.cc
@@ -30,6 +30,7 @@
 #include "core/shared_ptr.hh"
 #include "core/do_with.hh"
 #include "core/thread.hh"
+#include <seastar/core/shared_future.hh>
 #include <iterator>

 #include "types.hh"
@@ -44,6 +45,9 @@
 #include <boost/filesystem/operations.hpp>
 #include <boost/algorithm/string.hpp>
 #include <boost/range/adaptor/map.hpp>
+#include <boost/range/algorithm_ext/insert.hpp>
+#include <boost/range/algorithm_ext/push_back.hpp>
+#include <boost/range/algorithm/set_algorithm.hpp>
 #include <regex>
 #include <core/align.hh>
 #include "utils/phased_barrier.hh"
@@ -59,7 +63,12 @@ future<file> new_sstable_component_file(sstring name, open_flags flags) {
    });
 }

-thread_local std::unordered_map<sstring, std::unordered_set<unsigned>> sstable::_shards_agreeing_to_remove_sstable;
+// Overload that forwards `options` to open_file_dma(). A failure to open is
+// logged and then propagated to the caller as a failed future.
+future<file> new_sstable_component_file(sstring name, open_flags flags, file_open_options options) {
+    auto report_and_rethrow = [name] (auto ep) {
+        sstlog.error("Could not create SSTable component {}. Found exception: {}", name, ep);
+        return make_exception_future<file>(ep);
+    };
+    return open_file_dma(name, flags, options).handle_exception(std::move(report_and_rethrow));
+}

 static utils::phased_barrier& background_jobs() {
    static thread_local utils::phased_barrier gate;
@@ -682,6 +691,10 @@ inline void write(file_writer& out, estimated_histogram& eh) {
 // This is small enough, and well-defined. Easier to just read it all
 // at once
 future<> sstable::read_toc() {
+    if (_components.size()) {
+        return make_ready_future<>();
+    }
+
    auto file_path = filename(sstable::component_type::TOC);

    sstlog.debug("Reading TOC file {} ", file_path);
@@ -712,6 +725,7 @@ future<> sstable::read_toc() {
                try {
                   _components.insert(reverse_map(c, _component_map));
                } catch (std::out_of_range& oor) {
+                    _components.clear(); // so subsequent read_toc will be forced to fail again
                    throw malformed_sstable_exception("Unrecognized TOC component: " + c);
                }
            }
@@ -862,7 +876,7 @@ future<index_list> sstable::read_indexes(uint64_t summary_idx, const io_priority
        auto stream = make_file_input_stream(this->_index_file, position, end - position, std::move(options));
        // TODO: it's redundant to constrain the consumer here to stop at
        // index_size()-position, the input stream is already constrained.
-        auto ctx = make_lw_shared<index_consume_entry_context>(ic, std::move(stream), this->index_size() - position);
+        auto ctx = make_lw_shared<index_consume_entry_context<index_consumer>>(ic, std::move(stream), this->index_size() - position);
        return ctx->consume_input(*ctx).then([ctx, &ic] {
            return make_ready_future<index_list>(std::move(ic.indexes));
        });
@@ -934,6 +948,25 @@ void sstable::write_statistics(const io_priority_class& pc) {
    write_simple<component_type::Statistics>(_statistics, pc);
 }

+// Makes sure _summary is loaded. Returns immediately when it already is;
+// otherwise reads the TOC and then either loads the Summary component or,
+// when that component is missing or cannot be read, regenerates it with
+// generate_summary().
+// NOTE(review): `pc` is captured by reference in the continuations, so the
+// caller must keep it alive until the returned future resolves - confirm.
+future<> sstable::read_summary(const io_priority_class& pc) {
+    if (_summary) {
+        return make_ready_future<>();
+    }
+
+    return read_toc().then([this, &pc] {
+        // We'll try to keep the main code path exception free, but if an exception does happen
+        // we can try to regenerate the Summary.
+        if (has_component(sstable::component_type::Summary)) {
+            return read_simple<component_type::Summary>(_summary, pc).handle_exception([this, &pc] (auto ep) {
+                // The logger takes "{}" placeholders, like every other sstlog call in this file.
+                sstlog.warn("Couldn't read summary file {}: {}. Recreating it.", this->filename(component_type::Summary), ep);
+                return this->generate_summary(pc);
+            });
+        } else {
+            return generate_summary(pc);
+        }
+    });
+}
+
 future<> sstable::open_data() {
    return when_all(open_file_dma(filename(component_type::Index), open_flags::ro),
                    open_file_dma(filename(component_type::Data), open_flags::ro)).then([this] (auto files) {
@@ -964,8 +997,10 @@ future<> sstable::open_data() {

 future<> sstable::create_data() {
    auto oflags = open_flags::wo | open_flags::create | open_flags::exclusive;
+    file_open_options opt;
+    opt.extent_allocation_size_hint = 32 << 20;
    return when_all(new_sstable_component_file(filename(component_type::Index), oflags),
-                    new_sstable_component_file(filename(component_type::Data), oflags)).then([this] (auto files) {
+                    new_sstable_component_file(filename(component_type::Data), oflags, opt)).then([this] (auto files) {
        // FIXME: If both files could not be created, the first get below will
        // throw an exception, and second get() will not be attempted, and
        // we'll get a warning about the second future being destructed
@@ -1202,10 +1237,9 @@ static void write_index_entry(file_writer& out, disk_string_view<uint16_t>& key,
    write(out, key, pos, promoted_index_size);
 }

-static void prepare_summary(summary& s, uint64_t expected_partition_count, const schema& schema) {
+static void prepare_summary(summary& s, uint64_t expected_partition_count, uint32_t min_index_interval) {
    assert(expected_partition_count >= 1);

-    auto min_index_interval = schema.min_index_interval();
    s.header.min_index_interval = min_index_interval;
    s.header.sampling_level = downsampling::BASE_SAMPLING_LEVEL;
    uint64_t max_expected_entries =
@@ -1222,8 +1256,7 @@ static void prepare_summary(summary& s, uint64_t expected_partition_count, const

 static void seal_summary(summary& s,
        std::experimental::optional<key>&& first_key,
-        std::experimental::optional<key>&& last_key,
-        const schema& schema) {
+        std::experimental::optional<key>&& last_key) {
    s.header.size = s.entries.size();
    s.header.size_at_full_sampling = s.header.size;

@@ -1312,7 +1345,7 @@ void sstable::do_write_components(::mutation_reader mr,
    auto filter_fp_chance = schema->bloom_filter_fp_chance();
    _filter = utils::i_filter::get_filter(estimated_partitions, filter_fp_chance);

-    prepare_summary(_summary, estimated_partitions, *schema);
+    prepare_summary(_summary, estimated_partitions, schema->min_index_interval());

    // FIXME: we may need to set repaired_at stats at this point.

@@ -1392,7 +1425,7 @@ void sstable::do_write_components(::mutation_reader mr,
        }

    }
-    seal_summary(_summary, std::move(first_key), std::move(last_key), *schema);
+    seal_summary(_summary, std::move(first_key), std::move(last_key));

    index->close().get();
    _index_file = file(); // index->close() closed _index_file
@@ -1465,6 +1498,60 @@ future<> sstable::write_components(::mutation_reader mr,
    });
 }

+// Rebuilds _summary by scanning the whole Index component. Does nothing when a
+// summary is already present. The index file is opened read-only for the scan
+// and closed again whether or not the scan succeeds.
+// NOTE(review): `pc` is captured by reference in the continuations, so the
+// caller must keep it alive until the returned future resolves - confirm.
+future<> sstable::generate_summary(const io_priority_class& pc) {
+    if (_summary) {
+        return make_ready_future<>();
+    }
+
+    sstlog.info("Summary file {} not found. Generating Summary...", filename(sstable::component_type::Summary));
+    // Index consumer: offers every index entry to the summary and remembers
+    // the first key seen and the most recent later key.
+    class summary_generator {
+        summary& _summary;
+    public:
+        std::experimental::optional<key> first_key, last_key;
+
+        summary_generator(summary& s) : _summary(s) {}
+        bool should_continue() {
+            return true;
+        }
+        void consume_entry(index_entry&& ie) {
+            maybe_add_summary_entry(_summary, ie.get_key_bytes(), ie.position());
+            // With a single entry only first_key gets set and last_key stays disengaged.
+            // NOTE(review): confirm seal_summary() copes with a disengaged last_key.
+            if (!first_key) {
+                first_key = key(to_bytes(ie.get_key_bytes()));
+            } else {
+                last_key = key(to_bytes(ie.get_key_bytes()));
+            }
+        }
+    };
+
+    return open_file_dma(filename(component_type::Index), open_flags::ro).then([this, &pc] (file index_file) {
+        return do_with(std::move(index_file), [this, &pc] (file index_file) {
+            return index_file.size().then([this, &pc, index_file] (auto size) {
+                // an upper bound. Surely to be less than this.
+                auto estimated_partitions = size / sizeof(uint64_t);
+                // Since we don't have a summary, use a default min_index_interval, and if needed we'll resample
+                // later.
+                prepare_summary(_summary, estimated_partitions, 0x80);
+
+                file_input_stream_options options;
+                options.buffer_size = sstable_buffer_size;
+                options.io_priority_class = pc;
+                auto stream = make_file_input_stream(index_file, 0, size, std::move(options));
+                return do_with(summary_generator(_summary), [this, &pc, stream = std::move(stream), size] (summary_generator& s) mutable {
+                    auto ctx = make_lw_shared<index_consume_entry_context<summary_generator>>(s, std::move(stream), size);
+                    return ctx->consume_input(*ctx).then([this, ctx, &s] {
+                        seal_summary(_summary, std::move(s.first_key), std::move(s.last_key));
+                    });
+                });
+            }).finally([index_file] () mutable {
+                // finally (not then): close the index file on the error path too.
+                return index_file.close().handle_exception([] (auto ep) {
+                    sstlog.warn("sstable close index_file failed: {}", ep);
+                    return make_exception_future<>(std::move(ep));
+                });
+            });
+        });
+    });
+}
+
 uint64_t sstable::data_size() const {
    if (has_component(sstable::component_type::CompressionInfo)) {
        return _compression.data_len;
@@ -1730,7 +1817,7 @@ sstable::~sstable() {
        // clean up unused sstables, and because we'll never reuse the same
        // generation number anyway.
        try {
-            shared_remove_by_toc_name(filename(component_type::TOC), _shared).handle_exception(
+            delete_atomically({sstable_to_delete(filename(component_type::TOC), _shared)}).handle_exception(
                        [op = background_jobs().start()] (std::exception_ptr eptr) {
                            sstlog.warn("Exception when deleting sstable file: {}", eptr);
                        });
@@ -1746,26 +1833,6 @@ dirname(sstring fname) {
    return boost::filesystem::canonical(std::string(fname)).parent_path().string();
 }

-future<>
-sstable::shared_remove_by_toc_name(sstring toc_name, bool shared) {
-    if (!shared) {
-        return remove_by_toc_name(toc_name);
-    } else {
-        auto shard = std::hash<sstring>()(toc_name) % smp::count;
-        return smp::submit_to(shard, [toc_name, src_shard = engine().cpu_id()] {
-            auto& remove_set = _shards_agreeing_to_remove_sstable[toc_name];
-            remove_set.insert(src_shard);
-            auto counter = remove_set.size();
-            if (counter == smp::count) {
-                _shards_agreeing_to_remove_sstable.erase(toc_name);
-                return remove_by_toc_name(toc_name);
-            } else {
-                return make_ready_future<>();
-            }
-        });
-    }
-}
-
 future<>
 fsync_directory(sstring fname) {
    return open_directory(dirname(fname)).then([] (file f) {
@@ -1778,16 +1845,23 @@ fsync_directory(sstring fname) {
 future<>
 remove_by_toc_name(sstring sstable_toc_name) {
    return seastar::async([sstable_toc_name] {
-        auto dir = dirname(sstable_toc_name);
-        auto toc_file = open_file_dma(sstable_toc_name, open_flags::ro).get0();
+        sstring prefix = sstable_toc_name.substr(0, sstable_toc_name.size() - TOC_SUFFIX.size());
+        auto new_toc_name = prefix + TEMPORARY_TOC_SUFFIX;
+        sstring dir;
+
+        if (file_exists(sstable_toc_name).get0()) {
+            dir = dirname(sstable_toc_name);
+            rename_file(sstable_toc_name, new_toc_name).get();
+            fsync_directory(dir).get();
+        } else {
+            dir = dirname(new_toc_name);
+        }
+
+        auto toc_file = open_file_dma(new_toc_name, open_flags::ro).get0();
        auto in = make_file_input_stream(toc_file);
        auto size = toc_file.size().get0();
        auto text = in.read_exactly(size).get0();
        in.close().get();
-        sstring prefix = sstable_toc_name.substr(0, sstable_toc_name.size() - TOC_SUFFIX.size());
-        auto new_toc_name = prefix + TEMPORARY_TOC_SUFFIX;
-        rename_file(sstable_toc_name, new_toc_name).get();
-        fsync_directory(dir).get();
        std::vector<sstring> components;
        sstring all(text.begin(), text.end());
        boost::split(components, all, boost::is_any_of("\n"));
@@ -1800,13 +1874,58 @@ remove_by_toc_name(sstring sstable_toc_name) {
                // already deleted
                return make_ready_future<>();
            }
-            return remove_file(prefix + component);
+            auto fname = prefix + component;
+            return remove_file(prefix + component).then_wrapped([fname = std::move(fname)] (future<> f) {
+                // forgive ENOENT, since the component may not have been written;
+                try {
+                    f.get();
+                } catch (std::system_error& e) {
+                    if (e.code() != std::error_code(ENOENT, std::system_category())) {
+                        throw;
+                    }
+                    sstlog.debug("Forgiving ENOENT when deleting file {}", fname);
+                }
+                return make_ready_future<>();
+            });
        }).get();
        fsync_directory(dir).get();
        remove_file(new_toc_name).get();
    });
 }

+// Marks this sstable for deletion and renames its TOC file to the
+// temporary-TOC name on disk, then syncs the containing directory.
+// The rename runs on the shard selected by hashing the TOC name (presumably
+// so that all requests for the same sstable meet on one shard). On that shard
+// a thread-local set records renames in flight; a request arriving while one
+// is in flight resolves immediately without waiting for it.
+future<>
+sstable::mark_for_deletion_on_disk() {
+    mark_for_deletion();
+
+    auto toc_name = filename(component_type::TOC);
+    auto shard = std::hash<sstring>()(toc_name) % smp::count;
+
+    return smp::submit_to(shard, [toc_name] {
+        static thread_local std::unordered_set<sstring> renaming;
+
+        if (renaming.count(toc_name) > 0) {
+            return make_ready_future<>();
+        }
+
+        renaming.emplace(toc_name);
+
+        return seastar::async([toc_name] {
+            if (!file_exists(toc_name).get0()) {
+                return; // already gone
+            }
+
+            // The TOC is only renamed, never read, so it is not opened here
+            // (an opened-but-unused file handle would never be closed).
+            auto dir = dirname(toc_name);
+            sstring prefix = toc_name.substr(0, toc_name.size() - TOC_SUFFIX.size());
+            auto new_toc_name = prefix + TEMPORARY_TOC_SUFFIX;
+            rename_file(toc_name, new_toc_name).get();
+            fsync_directory(dir).get();
+        }).finally([toc_name] {
+            renaming.erase(toc_name);
+        });
+    });
+}
+
 future<>
 sstable::remove_sstable_with_temp_toc(sstring ks, sstring cf, sstring dir, int64_t generation, version_types v, format_types f) {
    return seastar::async([ks, cf, dir, generation, v, f] {
@@ -1849,12 +1968,11 @@ sstable::remove_sstable_with_temp_toc(sstring ks, sstring cf, sstring dir, int64
 }

+// Reads the summary component, then builds a range from this sstable's first
+// and last partition keys.
+//
+// The continuation captures `this` and `s` by reference, so both the sstable
+// and the schema must stay alive until the returned future resolves.
 future<range<partition_key>>
-sstable::get_sstable_key_range(const schema& s, sstring ks, sstring cf, sstring dir, int64_t generation, version_types v, format_types f) {
-    auto sst = std::make_unique<sstable>(ks, cf, dir, generation, v, f);
-    auto fut = sst->read_summary(default_priority_class());
-    return std::move(fut).then([sst = std::move(sst), &s] () mutable {
-        auto first = sst->get_first_partition_key(s);
-        auto last = sst->get_last_partition_key(s);
+sstable::get_sstable_key_range(const schema& s) {
+    auto fut = read_summary(default_priority_class());
+    return std::move(fut).then([this, &s] () mutable {
+        auto first = get_first_partition_key(s);
+        auto last = get_last_partition_key(s);
         return make_ready_future<range<partition_key>>(range<partition_key>::make(first, last));
     });
  }
@@ -1864,4 +1982,170 @@ void sstable::mark_sstable_for_deletion(sstring ks, sstring cf, sstring dir, int
    sst.mark_for_deletion();
 }

+// Prints an sstable_to_delete as "<name>(shared)" or "<name>(unshared)".
+std::ostream&
+operator<<(std::ostream& os, const sstable_to_delete& s) {
+    return os << s.name << "(" << (s.shared ? "shared" : "unshared") << ")";
+}
+
+// Set of shards that have asked for a particular sstable to be deleted.
+using shards_agreeing_to_delete_sstable_type = std::unordered_set<shard_id>;
+// Names of sstables that must be deleted together.
+using sstables_to_delete_atomically_type = std::set<sstring>;
+// One group of sstables waiting to be deleted together, plus one promise for
+// every delete request that has been folded into the group.
+struct pending_deletion {
+    sstables_to_delete_atomically_type names;
+    std::vector<lw_shared_ptr<promise<>>> completions;
+};
+
+// The three variables below are thread-local; do_delete_atomically(), which
+// maintains them, is documented as running on shard 0 only.
+
+// Set by cancel_atomic_deletions(); once true, do_delete_atomically() throws
+// instead of queueing new requests.
+static thread_local bool g_atomic_deletions_cancelled = false;
+// Groups that are still waiting for all shards to agree.
+static thread_local std::list<lw_shared_ptr<pending_deletion>> g_atomic_deletion_sets;
+// sstable name -> shards that have agreed to its deletion so far.
+static thread_local std::unordered_map<sstring, shards_agreeing_to_delete_sstable_type> g_shards_agreeing_to_delete_sstable;
+
+// Logger used by the atomic deletion code in this file.
+static logging::logger deletion_logger("sstable-deletion");
+
+// Records that deleting_shard wants every sstable in atomic_deletion_set
+// deleted and, once every shard has agreed on every sstable of the resulting
+// (possibly merged) set, starts removing the files.
+//
+// An sstable flagged as not shared is treated as agreed to by all shards.
+//
+// Returns a future tied to this request. It is returned unresolved when the
+// deletion has to be deferred, and it receives the outcome of the removal
+// once the set is actually deleted. Throws std::runtime_error when atomic
+// deletions have been cancelled.
+static
+future<>
+do_delete_atomically(std::vector<sstable_to_delete> atomic_deletion_set, unsigned deleting_shard) {
+    // runs on shard 0 only
+    deletion_logger.debug("shard {} atomically deleting {}", deleting_shard, atomic_deletion_set);
+
+    if (g_atomic_deletions_cancelled) {
+        deletion_logger.debug("atomic deletions disabled, erroring out");
+        throw std::runtime_error(sprint("atomic deletions disabled; not deleting %s", atomic_deletion_set));
+    }
+
+    // Insert atomic_deletion_set into the list of sets pending deletion.  If the new set
+    // overlaps with an existing set, merge them (the merged set will be deleted atomically).
+    std::list<lw_shared_ptr<pending_deletion>> new_atomic_deletion_sets;
+    auto merged_set = make_lw_shared(pending_deletion());
+    for (auto&& sst_to_delete : atomic_deletion_set) {
+        merged_set->names.insert(sst_to_delete.name);
+        if (!sst_to_delete.shared) {
+            // Unshared sstable: record agreement from every shard up front.
+            for (auto shard : boost::irange<shard_id>(0, smp::count)) {
+                g_shards_agreeing_to_delete_sstable[sst_to_delete.name].insert(shard);
+            }
+        }
+    }
+    // The promise for this particular request; its future is what we return.
+    merged_set->completions.push_back(make_lw_shared<promise<>>());
+    auto ret = merged_set->completions.back()->get_future();
+    for (auto&& old_set : g_atomic_deletion_sets) {
+         auto intersection = sstables_to_delete_atomically_type();
+         boost::set_intersection(merged_set->names, old_set->names, std::inserter(intersection, intersection.end()));
+         if (intersection.empty()) {
+             // We copy old_set to avoid corrupting g_atomic_deletion_sets if we fail
+             // further on.
+             new_atomic_deletion_sets.push_back(old_set);
+         } else {
+             // Overlapping set: absorb its names and its waiters.
+             deletion_logger.debug("merging with {}", old_set->names);
+             merged_set->names.insert(old_set->names.begin(), old_set->names.end());
+             boost::push_back(merged_set->completions, old_set->completions);
+         }
+    }
+    deletion_logger.debug("new atomic set: {}", merged_set->names);
+    new_atomic_deletion_sets.push_back(merged_set);
+    // can now exception-safely commit:
+    g_atomic_deletion_sets = std::move(new_atomic_deletion_sets);
+
+    // Mark each sstable as being deleted from deleting_shard.  We have to do
+    // this in a separate pass, so the consideration whether we can delete or not
+    // sees all the data from this pass.
+    for (auto&& sst : atomic_deletion_set) {
+        g_shards_agreeing_to_delete_sstable[sst.name].insert(deleting_shard);
+    }
+
+    // Figure out if the (possibly merged) set can be deleted
+    for (auto&& sst : merged_set->names) {
+        if (g_shards_agreeing_to_delete_sstable[sst].size() != smp::count) {
+            // Not everyone agrees, leave the set pending
+            deletion_logger.debug("deferring deletion until all shards agree");
+            return ret;
+        }
+    }
+
+    // Cannot recover from a failed deletion
+    // (merged_set was pushed last above, so pop_back() removes exactly it)
+    g_atomic_deletion_sets.pop_back();
+    for (auto&& name : merged_set->names) {
+        g_shards_agreeing_to_delete_sstable.erase(name);
+    }
+
+    // Everyone agrees, let's delete
+    // FIXME: this needs to be done atomically (using a log file of sstables we intend to delete)
+    // The removal is not waited for here; its result (success or failure) is
+    // forwarded to every completion promise of the merged set.
+    parallel_for_each(merged_set->names, [] (sstring name) {
+        deletion_logger.debug("deleting {}", name);
+        return remove_by_toc_name(name);
+    }).then_wrapped([merged_set] (future<> result) {
+        deletion_logger.debug("atomic deletion completed: {}", merged_set->names);
+        shared_future<> sf(std::move(result));
+        for (auto&& comp : merged_set->completions) {
+            sf.get_future().forward_to(std::move(*comp));
+        }
+    });
+
+    return ret;
+}
+
+// Bookkeeping for the delete requests a shard has forwarded to shard 0 and
+// that have not been acknowledged yet.
+struct pending_shard_deletes {
+    // Request id -> promise that acknowledge() resolves.
+    std::unordered_map<int, promise<>> pending_deletes;
+    // Next request id to hand out.
+    int idgen = 0;
+    // Registers a request, forwards it to shard 0 and returns the future of
+    // the promise stored for it.
+    future<> delete_atomically(std::vector<sstable_to_delete> atomic_deletion_set);
+    // Resolves the promise of request `id` (with `ex` if it is set, with
+    // success otherwise) and forgets the request.
+    void acknowledge(int id, std::exception_ptr ex);
+};
+
+// One instance per thread (thread_local).
+static thread_local pending_shard_deletes this_shard_deletes;
+
+// Registers a new delete request under a fresh id, hands it to shard 0 for
+// processing by do_delete_atomically(), and returns the future of the promise
+// stored for that id. When shard 0 is done (successfully or not) it calls
+// acknowledge() back on the requesting shard, which resolves that promise.
+future<>
+pending_shard_deletes::delete_atomically(std::vector<sstable_to_delete> atomic_deletion_set) {
+    auto request_id = idgen++;
+    auto slot = pending_deletes.emplace(request_id, promise<>()).first;
+    auto done = slot->second.get_future();
+    auto origin = engine().cpu_id();
+    // The cross-shard call is not waited for; completion is reported through
+    // the acknowledge() call below.
+    smp::submit_to(0, [atomic_deletion_set, origin, request_id] {
+        futurize<void>::apply(do_delete_atomically, atomic_deletion_set, origin).then_wrapped([origin, request_id] (future<> outcome) {
+            // An empty exception_ptr stands for success.
+            std::exception_ptr error = outcome.failed() ? outcome.get_exception() : std::exception_ptr();
+            return smp::submit_to(origin, [request_id, error] () mutable {
+                this_shard_deletes.acknowledge(request_id, error);
+            });
+        });
+    });
+    return done;
+}
+
+// Completes the delete request registered under `idx`: its promise is failed
+// with `ex` when `ex` is set and fulfilled otherwise, then the request is
+// dropped from pending_deletes.
+//
+// An id that is not registered is logged and ignored.
+void
+pending_shard_deletes::acknowledge(int idx, std::exception_ptr ex) {
+    auto i = pending_deletes.find(idx);
+    if (i == pending_deletes.end()) {
+        // Dereferencing end() would be undefined behaviour; report instead.
+        deletion_logger.error("acknowledgement for unknown atomic deletion request {}", idx);
+        return;
+    }
+    auto& pr = i->second;
+    if (ex) {
+        pr.set_exception(ex);
+    } else {
+        pr.set_value();
+    }
+    pending_deletes.erase(i);
+}
+
+// Requests deletion of the given sstables as one unit, on behalf of the
+// calling shard. The returned future is the one handed out by this shard's
+// pending_shard_deletes for the request.
+future<>
+delete_atomically(std::vector<sstable_to_delete> ssts) {
+    return this_shard_deletes.delete_atomically(std::move(ssts));
+}
+
+// Overload for live sstable objects: converts each one to an
+// sstable_to_delete (TOC file name + shared flag) and forwards to the
+// overload above.
+future<>
+delete_atomically(std::vector<shared_sstable> ssts) {
+    std::vector<sstable_to_delete> sstables_to_delete_atomically;
+    // The final size is known, so allocate once.
+    sstables_to_delete_atomically.reserve(ssts.size());
+    for (auto&& sst : ssts) {
+        sstables_to_delete_atomically.emplace_back(sst->toc_filename(), sst->is_shared());
+    }
+    return delete_atomically(std::move(sstables_to_delete_atomically));
+}
+
+// Disables atomic deletions on the calling thread: every request still
+// pending in g_atomic_deletion_sets is failed with std::runtime_error, and
+// the pending sets and the per-sstable agreement records are discarded.
+void
+cancel_atomic_deletions() {
+    g_atomic_deletions_cancelled = true;
+    for (auto&& pending : g_atomic_deletion_sets) {
+        auto& waiters = pending->completions;
+        for (auto&& waiter : waiters) {
+            auto message = sprint("Atomic sstable deletions cancelled; not deleting %s", pending->names);
+            waiter->set_exception(std::runtime_error(message));
+        }
+    }
+    g_shards_agreeing_to_delete_sstable.clear();
+    g_atomic_deletion_sets.clear();
+}
+
 }
--- a/sstables/sstables.hh
+++ b/sstables/sstables.hh
@@ -245,6 +245,8 @@ public:
        _marked_for_deletion = true;
    }

+    future<> mark_for_deletion_on_disk();
+
    bool marked_for_deletion() const {
        return _marked_for_deletion;
    }
@@ -339,11 +341,9 @@ private:
    void prepare_write_components(::mutation_reader mr,
            uint64_t estimated_partitions, schema_ptr schema, uint64_t max_sstable_size,
            const io_priority_class& pc);
-    static future<> shared_remove_by_toc_name(sstring toc_name, bool shared);
    static std::unordered_map<version_types, sstring, enum_hash<version_types>> _version_string;
    static std::unordered_map<format_types, sstring, enum_hash<format_types>> _format_string;
    static std::unordered_map<component_type, sstring, enum_hash<component_type>> _component_map;
-    static thread_local std::unordered_map<sstring, std::unordered_set<unsigned>> _shards_agreeing_to_remove_sstable;

    std::unordered_set<component_type, enum_hash<component_type>> _components;

@@ -397,13 +397,16 @@ private:

    void write_filter(const io_priority_class& pc);

-    future<> read_summary(const io_priority_class& pc) {
-        return read_simple<component_type::Summary>(_summary, pc);
-    }
+    future<> read_summary(const io_priority_class& pc);
+
    void write_summary(const io_priority_class& pc) {
        write_simple<component_type::Summary>(_summary, pc);
    }

+    // To be called when we try to load an SSTable that lacks a Summary. Could
+    // happen if old tools are being used.
+    future<> generate_summary(const io_priority_class& pc);
+
    future<> read_statistics(const io_priority_class& pc);
    void write_statistics(const io_priority_class& pc);

@@ -533,8 +536,8 @@ public:
    }

    // Return sstable key range as range<partition_key> reading only the summary component.
-    static future<range<partition_key>>
-    get_sstable_key_range(const schema& s, sstring ks, sstring cf, sstring dir, int64_t generation, version_types v, format_types f);
+    future<range<partition_key>>
+    get_sstable_key_range(const schema& s);

    // Used to mark a sstable for deletion that is not relevant to the current shard.
    // It doesn't mean that the sstable will be deleted, but that the sstable is not
@@ -581,4 +584,31 @@ future<> await_background_jobs();
 // Invokes await_background_jobs() on all shards
 future<> await_background_jobs_on_all_shards();

+// One entry of an atomic deletion request: which sstable to delete and
+// whether it is shared between shards.
+struct sstable_to_delete {
+    sstable_to_delete(sstring name, bool shared) : name(std::move(name)), shared(shared) {}
+    // Name identifying the sstable; delete_atomically(std::vector<shared_sstable>)
+    // fills it with the sstable's TOC file name.
+    sstring name;
+    // True when the sstable is shared between shards.
+    bool shared = false;
+    // Prints "<name>(shared)" or "<name>(unshared)".
+    friend std::ostream& operator<<(std::ostream& os, const sstable_to_delete& s);
+};
+
+
+// When we compact sstables, we have to atomically instantiate the new
+// sstable and delete the old ones.  Otherwise, if we compact A+B into C,
+// and if A contained some data that was tombstoned by B, and if B was
+// deleted but A survived, then data from A will be resurrected.
+//
+// There are two violators of the requirement to atomically delete
+// sstables: first, sstable instantiation and deletion on disk are atomic
+// only with respect to the sstable itself, not other sstables; and second,
+// an sstable may be shared among shards, so its actual on-disk deletion is
+// deferred until all shards agree it can be deleted.
+//
+// This function only solves the second problem for now.
+future<> delete_atomically(std::vector<shared_sstable> ssts);
+future<> delete_atomically(std::vector<sstable_to_delete> ssts);
+
+// Cancel any deletions scheduled by delete_atomically() that are still
+// pending, and make their futures complete with an exception.
+void cancel_atomic_deletions();
+
 }
--- a/sstables/types.hh
+++ b/sstables/types.hh
@@ -144,6 +144,10 @@ struct summary_ka {
    uint64_t memory_footprint() const {
        return sizeof(summary_entry) * entries.size() + sizeof(uint32_t) * positions.size() + sizeof(*this);
    }
+
+    // True when the summary holds at least one entry.
+    explicit operator bool() const {
+        return entries.size() != 0;
+    }
 };
 using summary = summary_ka;

@@ -262,6 +266,13 @@ struct deletion_time {
               (marked_for_delete_at == std::numeric_limits<int64_t>::min());
    }

+    // Two deletion times are equal when both the local deletion time and the
+    // marked-for-delete timestamp match. Both operators are const so that
+    // const objects can be compared as well.
+    bool operator==(const deletion_time& d) const {
+        return local_deletion_time == d.local_deletion_time &&
+               marked_for_delete_at == d.marked_for_delete_at;
+    }
+    bool operator!=(const deletion_time& d) const {
+        return !(*this == d);
+    }
    explicit operator tombstone() {
        return tombstone(marked_for_delete_at, gc_clock::time_point(gc_clock::duration(local_deletion_time)));
    }
--- a/streaming/stream_session.cc
+++ b/streaming/stream_session.cc
@@ -103,8 +103,6 @@ void stream_session::init_messaging_service_handler() {
            auto session = get_session(plan_id, from, "PREPARE_MESSAGE");
            session->init(sr);
            session->dst_cpu_id = src_cpu_id;
-            sslog.debug("[Stream #{}] GOT PREPARE_MESSAGE from {}: get session peer={}, dst_cpu_id={}",
-                session->plan_id(), from, session->peer, session->dst_cpu_id);
            return session->prepare(std::move(msg.requests), std::move(msg.summaries));
        });
    });
@@ -123,13 +121,27 @@ void stream_session::init_messaging_service_handler() {
            get_local_stream_manager().update_progress(plan_id, from.addr, progress_info::direction::IN, fm_size);
            return service::get_schema_for_write(fm.schema_version(), from).then([plan_id, from, &fm] (schema_ptr s) {
                auto cf_id = fm.column_family_id();
+                sslog.debug("[Stream #{}] GOT STREAM_MUTATION from {}: cf_id={}", plan_id, from.addr, cf_id);
+
                auto& db = service::get_local_storage_proxy().get_db().local();
                if (!db.column_family_exists(cf_id)) {
-                    sslog.debug("[Stream #{}] STREAM_MUTATION from {}: cf_id={} is missing, assume the table is dropped",
+                    sslog.warn("[Stream #{}] STREAM_MUTATION from {}: cf_id={} is missing, assume the table is dropped",
                                plan_id, from.addr, cf_id);
                    return make_ready_future<>();
                }
-                return service::get_storage_proxy().local().mutate_locally(std::move(s), fm);
+                return service::get_storage_proxy().local().mutate_streaming_mutation(std::move(s), fm).then_wrapped([plan_id, cf_id, from] (auto&& f) {
+                    try {
+                        f.get();
+                        return make_ready_future<>();
+                    } catch (no_such_column_family) {
+                        sslog.warn("[Stream #{}] STREAM_MUTATION from {}: cf_id={} is missing, assume the table is dropped",
+                                plan_id, from.addr, cf_id);
+                        return make_ready_future<>();
+                    } catch (...) {
+                        throw;
+                    }
+                    return make_ready_future<>();
+                });
            });
        });
    });
@@ -137,18 +149,29 @@ void stream_session::init_messaging_service_handler() {
        const auto& from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
        return smp::submit_to(dst_cpu_id, [ranges = std::move(ranges), plan_id, cf_id, from] () mutable {
            auto session = get_session(plan_id, from, "STREAM_MUTATION_DONE", cf_id);
-            session->receive_task_completed(cf_id);
            return session->get_db().invoke_on_all([ranges = std::move(ranges), plan_id, from, cf_id] (database& db) {
                if (!db.column_family_exists(cf_id)) {
-                    sslog.debug("[Stream #{}] STREAM_MUTATION_DONE from {}: cf_id={} is missing, assume the table is dropped",
+                    sslog.warn("[Stream #{}] STREAM_MUTATION_DONE from {}: cf_id={} is missing, assume the table is dropped",
                                plan_id, from, cf_id);
                    return make_ready_future<>();
                }
-                auto& cf = db.find_column_family(cf_id);
-                for (auto& range : ranges) {
-                    cf.get_row_cache().invalidate(query::to_partition_range(range));
+                std::vector<query::partition_range> query_ranges;
+                try {
+                    auto& cf = db.find_column_family(cf_id);
+                    query_ranges.reserve(ranges.size());
+                    for (auto& range : ranges) {
+                        query_ranges.push_back(query::to_partition_range(range));
+                    }
+                    return cf.flush_streaming_mutations(std::move(query_ranges));
+                } catch (no_such_column_family) {
+                    sslog.warn("[Stream #{}] STREAM_MUTATION_DONE from {}: cf_id={} is missing, assume the table is dropped",
+                                plan_id, from, cf_id);
+                    return make_ready_future<>();
+                } catch (...) {
+                    throw;
                }
-                return make_ready_future<>();
+            }).then([session, cf_id] {
+                session->receive_task_completed(cf_id);
            });
        });
    });
--- a/streaming/stream_transfer_task.cc
+++ b/streaming/stream_transfer_task.cc
@@ -109,7 +109,7 @@ future<stop_iteration> do_send_mutations(auto si, auto fm) {

 future<> send_mutations(auto si) {
    auto& cf = si->db.find_column_family(si->cf_id);
-    auto& priority = service::get_local_mutation_stream_priority();
+    auto& priority = service::get_local_streaming_read_priority();
    return do_with(cf.make_reader(cf.schema(), si->pr, priority), [si] (auto& reader) {
        return repeat([si, &reader] () {
            return reader().then([si] (auto mopt) {
--- a/tests/cql_query_test.cc
+++ b/tests/cql_query_test.cc
@@ -2098,6 +2098,24 @@ SEASTAR_TEST_CASE(test_alter_table) {
        });
    });
 }
+
+// Inserts a single-entry map, checks that it reads back, deletes that entry
+// and checks that the map column then reads back as an empty value.
+SEASTAR_TEST_CASE(test_map_query) {
+    return do_with_cql_env([] (auto& e) {
+        return seastar::async([&e] {
+            // Runs the same select used before and after the delete.
+            auto select_map = [&e] {
+                return e.execute_cql("select m from xx where k = 0;").get0();
+            };
+
+            e.execute_cql("CREATE TABLE xx (k int PRIMARY KEY, m map<text, int>);").get();
+            e.execute_cql("insert into xx (k, m) values (0, {'v2': 1});").get();
+
+            auto m_type = map_type_impl::get_instance(utf8_type, int32_type, true);
+            assert_that(select_map())
+                    .is_rows().with_rows({
+                        { make_map_value(m_type, map_type_impl::native_type({{sstring("v2"), 1}})).serialize() }
+                    });
+
+            e.execute_cql("delete m['v2'] from xx where k = 0;").get();
+            assert_that(select_map())
+                    .is_rows().with_rows({{{}}});
+        });
+    });
+}
+
 SEASTAR_TEST_CASE(test_drop_table) {
    return do_with_cql_env([] (auto& e) {
        return seastar::async([&e] {
@@ -2109,6 +2127,40 @@ SEASTAR_TEST_CASE(test_drop_table) {
    });
 }

+// Fills one partition with 16 clustering rows (b = 0..15) and checks that a
+// reversed, limited slice lying entirely before the first row is empty, while
+// unrestricted reads still return all 16 rows.
+SEASTAR_TEST_CASE(test_reversed_slice_with_empty_range_before_all_rows) {
+    return do_with_cql_env([] (auto& e) {
+        return seastar::async([&e] {
+            e.execute_cql("CREATE TABLE test (a int, b int, c int, s1 int static, s2 int static, PRIMARY KEY (a, b));").get();
+
+            // Executed one after another, in this order.
+            const char* const inserts[] = {
+                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 0, 0, 17, 42);",
+                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 1, 1, 17, 42);",
+                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 2, 2, 17, 42);",
+                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 3, 3, 17, 42);",
+                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 4, 4, 17, 42);",
+                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 5, 5, 17, 42);",
+                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 6, 6, 17, 42);",
+                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 7, 7, 17, 42);",
+                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 8, 8, 17, 42);",
+                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 9, 9, 17, 42);",
+                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 10, 10, 17, 42);",
+                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 11, 11, 17, 42);",
+                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 12, 12, 17, 42);",
+                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 13, 13, 17, 42);",
+                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 14, 14, 17, 42);",
+                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 15, 15, 17, 42);",
+            };
+            for (auto statement : inserts) {
+                e.execute_cql(statement).get();
+            }
+
+            auto before_all_rows = e.execute_cql("select * from test WHERE a = 99 and b < 0 ORDER BY b DESC limit 2;").get0();
+            assert_that(before_all_rows)
+                .is_rows().is_empty();
+
+            auto reversed_partition = e.execute_cql("select * from test WHERE a = 99 order by b desc;").get0();
+            assert_that(reversed_partition)
+                .is_rows().with_size(16);
+
+            auto whole_table = e.execute_cql("select * from test;").get0();
+            assert_that(whole_table)
+                .is_rows().with_size(16);
+        });
+    });
+}
+
 SEASTAR_TEST_CASE(test_alter_table_validation) {
    return do_with_cql_env([] (auto& e) {
        return e.execute_cql("create table tatv (p1 int, c1 int, c2 int, r1 int, r2 set<int>, PRIMARY KEY (p1, c1, c2));").discard_result().then_wrapped([&e] (auto f) {
--- a/tests/cql_test_env.cc
+++ b/tests/cql_test_env.cc
@@ -333,7 +333,6 @@ public:

            gms::get_gossiper().stop().get();
            gms::get_failure_detector().stop().get();
-            net::get_messaging_service().stop().get();

            _db->stop().get();

@@ -343,6 +342,8 @@ public:

            sstables::await_background_jobs_on_all_shards().get();

+            net::get_messaging_service().stop().get();
+
            bool old_active = true;
            assert(active.compare_exchange_strong(old_active, false));
        });
--- a/tests/gossip_test.cc
+++ b/tests/gossip_test.cc
@@ -46,9 +46,9 @@ SEASTAR_TEST_CASE(test_boot_shutdown){
        gms::get_gossiper().start().get();
        gms::get_gossiper().stop().get();
        gms::get_failure_detector().stop().get();
-        net::get_messaging_service().stop().get();
        db.stop().get();
        service::get_storage_service().stop().get();
+        net::get_messaging_service().stop().get();
        locator::i_endpoint_snitch::stop_snitch().get();
    });
 }
--- a/tests/logalloc_test.cc
+++ b/tests/logalloc_test.cc
@@ -108,7 +108,7 @@ SEASTAR_TEST_CASE(test_compaction_with_multiple_regions) {
            }
        });

-        size_t quarter = shard_tracker().occupancy().total_space() / 4;
+        size_t quarter = shard_tracker().region_occupancy().total_space() / 4;

        shard_tracker().reclaim_all_free_segments();

--- a/tests/mutation_source_test.cc
+++ b/tests/mutation_source_test.cc
@@ -298,6 +298,15 @@ static mutation_sets generate_mutation_sets() {
        }
    }

+    {
+        random_mutation_generator gen;
+        for (int i = 0; i < 10; ++i) {
+            auto m = gen();
+            result.unequal.emplace_back(mutations{m, gen()}); // collision unlikely
+            result.equal.emplace_back(mutations{m, m});
+        }
+    }
+
    return result;
 }

@@ -341,3 +350,145 @@ void for_each_mutation(std::function<void(const mutation&)> callback) {
        }
    }
 }
+
+// Returns a blob of blob_size pseudo-random bytes.
+//
+// The byte source is a thread-local engine that is default-constructed and
+// never explicitly seeded here; it keeps its state across calls.
+bytes make_blob(size_t blob_size) {
+    static thread_local std::independent_bits_engine<std::default_random_engine, 8, uint8_t> random_bytes;
+    // Allocate without initializing; every byte is overwritten below.
+    bytes big_blob(bytes::initialized_later(), blob_size);
+    for (auto&& b : big_blob) {
+        b = random_bytes();
+    }
+    return big_blob;
+}
+
+// Implementation behind random_mutation_generator: owns the schema, a pool
+// of pre-generated blobs and the random engine, and builds one random
+// mutation per call to operator().
+class random_mutation_generator::impl {
+    friend class random_mutation_generator;
+    const size_t _external_blob_size = 128; // Should be enough to force use of external bytes storage
+    // Number of regular columns, and also of static columns, in the schema.
+    const column_id column_count = row::max_vector_size * 2;
+    std::mt19937 _gen;
+    schema_ptr _schema;
+    // Pool of random blobs used for keys and cell values.
+    std::vector<bytes> _blobs;
+
+    // Returns the clock's epoch plus 0, 1 or 2 seconds, chosen using gen.
+    static gc_clock::time_point expiry_dist(auto& gen) {
+        static thread_local std::uniform_int_distribution<int> dist(0, 2);
+        return gc_clock::time_point() + std::chrono::seconds(dist(gen));
+    }
+
+public:
+    // Builds schema ks.cf with partition key "pk", clustering keys "ck1" and
+    // "ck2", plus column_count regular columns "v<i>" and column_count
+    // static columns "s<i>"; every column is of bytes type.
+    schema_ptr make_schema() {
+        auto builder = schema_builder("ks", "cf")
+                .with_column("pk", bytes_type, column_kind::partition_key)
+                .with_column("ck1", bytes_type, column_kind::clustering_key)
+                .with_column("ck2", bytes_type, column_kind::clustering_key);
+
+        // Create enough columns so that row can overflow its vector storage
+        for (column_id i = 0; i < column_count; ++i) {
+            {
+                auto column_name = sprint("v%d", i);
+                builder.with_column(to_bytes(column_name), bytes_type, column_kind::regular_column);
+            }
+            {
+                auto column_name = sprint("s%d", i);
+                builder.with_column(to_bytes(column_name), bytes_type, column_kind::static_column);
+            }
+        }
+
+        return builder.build();
+    }
+
+    // Creates the schema, fills the pool with 1024 blobs of
+    // _external_blob_size bytes each, and seeds _gen from std::random_device.
+    // The seed is reported through BOOST_TEST_MESSAGE.
+    impl() {
+        _schema = make_schema();
+
+        for (int i = 0; i < 1024; ++i) {
+            _blobs.emplace_back(make_blob(_external_blob_size));
+        }
+
+        std::random_device rd;
+        // In case of errors, replace the seed with a fixed value to get a deterministic run.
+        auto seed = rd();
+        BOOST_TEST_MESSAGE(sprint("Random seed: %s", seed));
+        _gen = std::mt19937(seed);
+    }
+
+    // Generates one random mutation. Every mutation uses the same partition
+    // key (_blobs[0]).
+    mutation operator()() {
+        std::uniform_int_distribution<column_id> column_count_dist(1, column_count);
+        std::uniform_int_distribution<column_id> column_id_dist(0, column_count - 1);
+        // Cell values come from the first three blobs only.
+        std::uniform_int_distribution<size_t> value_blob_index_dist(0, 2);
+        // Clustering key blobs are picked around the middle of the pool.
+        std::normal_distribution<> ck_index_dist(_blobs.size() / 2, 1.5);
+        std::uniform_int_distribution<int> bool_dist(0, 1);
+
+        std::uniform_int_distribution<api::timestamp_type> timestamp_dist(api::min_timestamp, api::min_timestamp + 2); // 3 values
+
+        auto pkey = partition_key::from_single_value(*_schema, _blobs[0]);
+        mutation m(pkey, _schema);
+
+        // Applies a random number (1..column_count) of random live or dead
+        // cells to r; the same column may be picked more than once.
+        auto set_random_cells = [&] (row& r, column_kind kind) {
+            auto columns_to_set = column_count_dist(_gen);
+            for (column_id i = 0; i < columns_to_set; ++i) {
+                // FIXME: generate expiring cells
+                auto cell = bool_dist(_gen)
+                            ? atomic_cell::make_live(timestamp_dist(_gen), _blobs[value_blob_index_dist(_gen)])
+                            : atomic_cell::make_dead(timestamp_dist(_gen), expiry_dist(_gen));
+                r.apply(_schema->column_at(kind, column_id_dist(_gen)), std::move(cell));
+            }
+        };
+
+        auto random_tombstone = [&] {
+            return tombstone(timestamp_dist(_gen), expiry_dist(_gen));
+        };
+
+        // Picks one of four row_marker constructions at random.
+        auto random_row_marker = [&] {
+            static thread_local std::uniform_int_distribution<int> dist(0, 3);
+            switch (dist(_gen)) {
+                case 0: return row_marker();
+                case 1: return row_marker(random_tombstone());
+                case 2: return row_marker(timestamp_dist(_gen));
+                case 3: return row_marker(timestamp_dist(_gen), std::chrono::seconds(1), expiry_dist(_gen));
+                default: assert(0);
+            }
+        };
+
+        // Half of the mutations get a partition tombstone.
+        if (bool_dist(_gen)) {
+            m.partition().apply(random_tombstone());
+        }
+
+        set_random_cells(m.partition().static_row(), column_kind::static_column);
+
+        // Draws a blob index from ck_index_dist, clamped to the valid range.
+        auto random_blob = [&] {
+            return _blobs[std::min(_blobs.size() - 1, static_cast<size_t>(std::max(0.0, ck_index_dist(_gen))))];
+        };
+
+        // Draws a count from a normal distribution (mean 32, stddev 1.5),
+        // clamped to [0, 100].
+        auto row_count_dist = [&] (auto& gen) {
+            static thread_local std::normal_distribution<> dist(32, 1.5);
+            return static_cast<size_t>(std::min(100.0, std::max(0.0, dist(gen))));
+        };
+
+        size_t row_count = row_count_dist(_gen);
+        for (size_t i = 0; i < row_count; ++i) {
+            auto ckey = clustering_key::from_exploded(*_schema, {random_blob(), random_blob()});
+            deletable_row& row = m.partition().clustered_row(ckey);
+            set_random_cells(row.cells(), column_kind::regular_column);
+            row.marker() = random_row_marker();
+        }
+
+        // Row tombstones are keyed by a single clustering component.
+        size_t range_tombstone_count = row_count_dist(_gen);
+        for (size_t i = 0; i < range_tombstone_count; ++i) {
+            auto key = clustering_key::from_exploded(*_schema, {random_blob()});
+            m.partition().apply_row_tombstone(*_schema, key, random_tombstone());
+        }
+        return m;
+    }
+};
+
+// Defined here, after impl is complete, so that the unique_ptr<impl> member
+// can be destroyed.
+random_mutation_generator::~random_mutation_generator() = default;
+
+random_mutation_generator::random_mutation_generator()
+    : _impl(std::make_unique<impl>())
+{ }
+
+// Produces the next random mutation by delegating to the implementation.
+mutation random_mutation_generator::operator()() {
+    return _impl->operator()();
+}
+
+// Schema shared by all mutations this generator produces.
+schema_ptr random_mutation_generator::schema() const {
+    auto& implementation = *_impl;
+    return implementation._schema;
+}
--- a/tests/mutation_source_test.hh
+++ b/tests/mutation_source_test.hh
@@ -36,3 +36,15 @@ void for_each_mutation_pair(std::function<void(const mutation&, const mutation&,

 // Calls the provided function on mutations. Is supposed to exercise as many differences as possible.
 void for_each_mutation(std::function<void(const mutation&)>);
+
+// Produces random mutations for tests. The implementation lives in the
+// nested class impl, which is only declared here and is held through _impl.
+class random_mutation_generator {
+    class impl;
+    std::unique_ptr<impl> _impl;
+public:
+    random_mutation_generator();
+    ~random_mutation_generator();
+    // Returns a newly generated mutation.
+    mutation operator()();
+    // Returns the schema held by the implementation.
+    schema_ptr schema() const;
+};
+
+// Returns a blob of blob_size bytes.
+bytes make_blob(size_t blob_size);
--- a/tests/mutation_test.cc
+++ b/tests/mutation_test.cc
@@ -25,6 +25,7 @@
 #include <boost/range/adaptor/transformed.hpp>
 #include <boost/range/algorithm/copy.hpp>
 #include <boost/range/algorithm_ext/push_back.hpp>
+#include "mutation_query.hh"
 #include "md5_hasher.hh"

 #include "core/sstring.hh"
@@ -270,6 +271,7 @@ SEASTAR_TEST_CASE(test_list_mutations) {
 }

 SEASTAR_TEST_CASE(test_multiple_memtables_one_partition) {
+    return seastar::async([] {
    auto s = make_lw_shared(schema({}, some_keyspace, some_column_family,
        {{"p1", utf8_type}}, {{"c1", int32_type}}, {{"r1", int32_type}}, {}, utf8_type));

@@ -280,7 +282,7 @@ SEASTAR_TEST_CASE(test_multiple_memtables_one_partition) {
    cfg.enable_incremental_backups = false;
    cfg.cf_stats = &*cf_stats;

-    return with_column_family(s, cfg, [s] (column_family& cf) {
+    with_column_family(s, cfg, [s] (column_family& cf) {
        const column_definition& r1_col = *s->get_column_definition("r1");
        auto key = partition_key::from_exploded(*s, {to_bytes("key1")});

@@ -291,26 +293,30 @@ SEASTAR_TEST_CASE(test_multiple_memtables_one_partition) {
            cf.apply(std::move(m));
            return cf.flush();
        };
-        return when_all(
-                insert_row(1001, 2001),
-                insert_row(1002, 2002),
-                insert_row(1003, 2003)).discard_result().then([s, &r1_col, &cf, key] {
+        insert_row(1001, 2001).get();
+        insert_row(1002, 2002).get();
+        insert_row(1003, 2003).get();
+        {
            auto verify_row = [&] (int32_t c1, int32_t r1) {
                auto c_key = clustering_key::from_exploded(*s, {int32_type->decompose(c1)});
-                return cf.find_row(cf.schema(), dht::global_partitioner().decorate_key(*s, key), std::move(c_key)).then([r1, r1_col] (auto r) {
+                auto p_key = dht::global_partitioner().decorate_key(*s, key);
+                auto r = cf.find_row(cf.schema(), p_key, c_key).get0();
+                {
                    BOOST_REQUIRE(r);
                    auto i = r->find_cell(r1_col.id);
                    BOOST_REQUIRE(i);
                    auto cell = i->as_atomic_cell();
                    BOOST_REQUIRE(cell.is_live());
                    BOOST_REQUIRE(int32_type->equal(cell.value(), int32_type->decompose(r1)));
-                });
+                }
            };
            verify_row(1001, 2001);
            verify_row(1002, 2002);
            verify_row(1003, 2003);
-        });
-    }).then([cf_stats] {});
+        }
+        return make_ready_future<>();
+    }).get();
+    });
 }

 SEASTAR_TEST_CASE(test_flush_in_the_middle_of_a_scan) {
@@ -690,6 +696,165 @@ SEASTAR_TEST_CASE(test_row_counting) {
    });
 }

+SEASTAR_TEST_CASE(test_tombstone_apply) { // Applying m2 onto m1 must carry m2's partition tombstone over to m1.
+    auto s = schema_builder("ks", "cf")
+            .with_column("pk", bytes_type, column_kind::partition_key)
+            .with_column("v", bytes_type, column_kind::regular_column)
+            .build();
+
+    auto pkey = partition_key::from_single_value(*s, "key1");
+
+    mutation m1(pkey, s);
+
+    BOOST_REQUIRE_EQUAL(m1.partition().partition_tombstone(), tombstone()); // precondition: a fresh mutation compares equal to a default-constructed tombstone
+
+    mutation m2(pkey, s);
+    auto tomb = tombstone(api::new_timestamp(), gc_clock::now());
+    m2.partition().apply(tomb);
+    BOOST_REQUIRE_EQUAL(m2.partition().partition_tombstone(), tomb);
+
+    m1.apply(m2); // the operation under test
+
+    BOOST_REQUIRE_EQUAL(m1.partition().partition_tombstone(), tomb);
+
+    return make_ready_future<>(); // the body is fully synchronous, so hand back an already-resolved future
+}
+
+SEASTAR_TEST_CASE(test_marker_apply) { // Applying a mutation that carries a row marker must reproduce that marker on the target's clustered row.
+    auto s = schema_builder("ks", "cf")
+            .with_column("pk", bytes_type, column_kind::partition_key)
+            .with_column("ck", bytes_type, column_kind::clustering_key)
+            .with_column("v", bytes_type, column_kind::regular_column)
+            .build();
+
+    auto pkey = partition_key::from_single_value(*s, "pk1");
+    auto ckey = clustering_key::from_single_value(*s, "ck1");
+
+    auto mutation_with_marker = [&] (row_marker rm) { // builds a mutation whose only content is marker `rm` on row ckey
+        mutation m(pkey, s);
+        m.partition().clustered_row(ckey).marker() = rm;
+        return m;
+    };
+
+    { // case 1: marker built from a timestamp only; the source mutation is applied as a named object
+        mutation m(pkey, s);
+        auto marker = row_marker(api::new_timestamp());
+        auto mm = mutation_with_marker(marker);
+        m.apply(mm);
+        BOOST_REQUIRE_EQUAL(m.partition().clustered_row(ckey).marker(), marker);
+    }
+
+    { // case 2: marker built with a one-second duration and the current gc_clock time; the source mutation is applied as a temporary
+        mutation m(pkey, s);
+        auto marker = row_marker(api::new_timestamp(), std::chrono::seconds(1), gc_clock::now());
+        m.apply(mutation_with_marker(marker));
+        BOOST_REQUIRE_EQUAL(m.partition().clustered_row(ckey).marker(), marker);
+    }
+
+    return make_ready_future<>();
+}
+
+class failure_injecting_allocation_strategy : public allocation_strategy { // Forwards to a delegate strategy, but can be armed to throw std::bad_alloc at a chosen allocation.
+    allocation_strategy& _delegate;
+    uint64_t _alloc_count = 0; // must start at zero: alloc() and fail_after() read it before anything else writes it
+    uint64_t _fail_at = std::numeric_limits<uint64_t>::max(); // value of _alloc_count at which alloc() throws; max() means "not armed"
+public:
+    failure_injecting_allocation_strategy(allocation_strategy& delegate) : _delegate(delegate) {}
+
+    virtual void* alloc(migrate_fn mf, size_t size, size_t alignment) override {
+        if (_alloc_count >= _fail_at) {
+            stop_failing(); // disarm first, so each fail_after() request produces exactly one failure
+            throw std::bad_alloc();
+        }
+        ++_alloc_count;
+        return _delegate.alloc(mf, size, alignment);
+    }
+
+    virtual void free(void* ptr) override {
+        _delegate.free(ptr);
+    }
+
+    // Counts allocation attempts which were not failed due to fail_after().
+    uint64_t alloc_count() const {
+        return _alloc_count;
+    }
+
+    void fail_after(uint64_t count) { // lets `count` more allocations succeed; the one after that throws
+        _fail_at = _alloc_count + count;
+    }
+
+    void stop_failing() { // disarms any pending injected failure
+        _fail_at = std::numeric_limits<uint64_t>::max();
+    }
+};
+
+SEASTAR_TEST_CASE(test_apply_is_atomic_in_case_of_allocation_failures) { // A failed apply() must leave the target unchanged; a successful one must equal the expected merge.
+    random_mutation_generator gen;
+
+    failure_injecting_allocation_strategy alloc(standard_allocator());
+    with_allocator(alloc, [&] {
+        auto target = gen();
+
+        BOOST_TEST_MESSAGE(sprint("Target: %s", target));
+
+        for (int i = 0; i < 10; ++i) {
+            auto second = gen();
+
+            BOOST_TEST_MESSAGE(sprint("Second: %s", second));
+
+            auto expected_apply_result = target; // reference result, computed while no failure is armed
+            expected_apply_result.apply(second);
+
+            BOOST_TEST_MESSAGE(sprint("Expected: %s", expected_apply_result));
+
+            // Test the apply(const mutation&) variant
+            {
+                auto m = target; // the same m is retried after every injected failure
+
+                // Try to fail at every possible allocation point during apply()
+                size_t fail_offset = 0;
+                while (true) {
+                    BOOST_TEST_MESSAGE(sprint("Failing allocation at %d", fail_offset));
+                    alloc.fail_after(fail_offset++);
+                    try {
+                        m.apply(second);
+                        alloc.stop_failing(); // disarm before the comparison below runs
+                        BOOST_TEST_MESSAGE("Checking that apply has expected result");
+                        assert_that(m).is_equal_to(expected_apply_result);
+                        break; // we exhausted all allocation points
+                    } catch (const std::bad_alloc&) {
+                        BOOST_TEST_MESSAGE("Checking that apply was reverted");
+                        assert_that(m).is_equal_to(target);
+                    }
+                }
+            }
+
+            // Test the apply(mutation&&) variant
+            {
+                size_t fail_offset = 0;
+                while (true) {
+                    auto copy_of_second = second; // fresh copies on every attempt, since this variant may consume its argument
+                    auto m = target;
+                    alloc.fail_after(fail_offset++);
+                    try {
+                        m.apply(std::move(copy_of_second));
+                        alloc.stop_failing();
+                        assert_that(m).is_equal_to(expected_apply_result);
+                        break; // we exhausted all allocation points
+                    } catch (const std::bad_alloc&) {
+                        assert_that(m).is_equal_to(target);
+                        // they should still commute
+                        m.apply(copy_of_second); // re-applying the argument of the failed call must still yield the full merge
+                        assert_that(m).is_equal_to(expected_apply_result);
+                    }
+                }
+            }
+        }
+    });
+
+    return make_ready_future<>();
+}
+
 SEASTAR_TEST_CASE(test_mutation_diff) {
    return seastar::async([] {
        auto my_set_type = set_type_impl::get_instance(int32_type, true);
@@ -805,15 +970,6 @@ SEASTAR_TEST_CASE(test_large_blobs) {

        auto mt = make_lw_shared<memtable>(s);

-        auto make_blob = [] (size_t blob_size) -> bytes {
-            bytes big_blob(bytes::initialized_later(), blob_size);
-            std::independent_bits_engine<std::default_random_engine, 8, uint8_t> random_bytes;
-            for (auto&& b : big_blob) {
-                b = random_bytes();
-            }
-            return big_blob;
-        };
-
        auto blob1 = make_blob(1234567);
        auto blob2 = make_blob(2345678);

@@ -884,6 +1040,55 @@ SEASTAR_TEST_CASE(test_mutation_hash) {
    });
 }

+static mutation compacted(const mutation& m) { // Returns a copy of m whose partition has been run through compact_for_compaction(); m itself is untouched.
+    auto result = m;
+    result.partition().compact_for_compaction(*result.schema(), api::max_timestamp, gc_clock::now()); // called with api::max_timestamp and the current gc_clock time
+    return result;
+}
+
+SEASTAR_TEST_CASE(test_query_digest) { // Query digests must agree for equal mutations (compacted or not) and for unequal ones after exchanging diffs.
+    return seastar::async([] {
+        auto check_digests_equal = [] (const mutation& m1, const mutation& m2) { // fails the test when the digest-only query results differ
+            auto ps1 = partition_slice_builder(*m1.schema()).build();
+            auto ps2 = partition_slice_builder(*m2.schema()).build();
+            auto digest1 = *m1.query(ps1, query::result_request::only_digest).digest();
+            auto digest2 = *m2.query(ps2, query::result_request::only_digest).digest();
+            if (digest1 != digest2) {
+                BOOST_FAIL(sprint("Digest should be the same for %s and %s", m1, m2));
+            }
+        };
+
+        for_each_mutation_pair([&] (const mutation& m1, const mutation& m2, are_equal eq) {
+            if (m1.schema()->version() != m2.schema()->version()) {
+                return; // pairs with different schema versions are not checked by this test
+            }
+
+            if (eq) {
+                check_digests_equal(compacted(m1), m2); // compacting either side must not change the digest
+                check_digests_equal(m1, compacted(m2));
+            } else {
+                BOOST_TEST_MESSAGE("If not equal, they should become so after applying diffs mutually");
+
+                schema_ptr s = m1.schema();
+
+                auto m3 = m2; // m3 = m2 plus the result of m1.difference(m2)
+                {
+                    auto diff = m1.partition().difference(s, m2.partition());
+                    m3.partition().apply(*m3.schema(), std::move(diff));
+                }
+
+                auto m4 = m1; // m4 = m1 plus the result of m2.difference(m1)
+                {
+                    auto diff = m2.partition().difference(s, m1.partition());
+                    m4.partition().apply(*m4.schema(), std::move(diff));
+                }
+
+                check_digests_equal(m3, m4);
+            }
+        });
+    });
+}
+
 SEASTAR_TEST_CASE(test_mutation_upgrade_of_equal_mutations) {
    return seastar::async([] {
        for_each_mutation_pair([](auto&& m1, auto&& m2, are_equal eq) {
@@ -995,6 +1200,95 @@ SEASTAR_TEST_CASE(test_mutation_upgrade) {
    });
 }

+SEASTAR_TEST_CASE(test_querying_expired_cells) { // Queries one mutation at successive time points and checks which TTL'd cells are still returned.
+    return seastar::async([] {
+        auto s = schema_builder("ks", "cf")
+                .with_column("pk", bytes_type, column_kind::partition_key)
+                .with_column("ck", bytes_type, column_kind::clustering_key)
+                .with_column("s1", bytes_type, column_kind::static_column)
+                .with_column("s2", bytes_type, column_kind::static_column)
+                .with_column("s3", bytes_type, column_kind::static_column)
+                .with_column("v1", bytes_type)
+                .with_column("v2", bytes_type)
+                .with_column("v3", bytes_type)
+                .build();
+
+        auto pk = partition_key::from_singular(*s, data_value(bytes("key1")));
+        auto ckey1 = clustering_key::from_singular(*s, data_value(bytes("A")));
+
+        auto ttl = std::chrono::seconds(1);
+        auto t1 = gc_clock::now(); // t1..t4 are four time points spaced one second apart
+        auto t2 = t1 + std::chrono::seconds(1);
+        auto t3 = t2 + std::chrono::seconds(1);
+        auto t4 = t3 + std::chrono::seconds(1);
+
+        auto v1 = data_value(bytes("1"));
+        auto v2 = data_value(bytes("2"));
+        auto v3 = data_value(bytes("3"));
+
+        auto results_at_time = [s] (const mutation& m, gc_clock::time_point t) { // result set of querying m with `t` passed as the query time
+            auto slice = partition_slice_builder(*s)
+                    .with_regular_column("v1")
+                    .with_regular_column("v2")
+                    .with_regular_column("v3")
+                    .with_static_column("s1")
+                    .with_static_column("s2")
+                    .with_static_column("s3")
+                    .without_clustering_key_columns()
+                    .without_partition_key_columns()
+                    .build();
+            return query::result_set::from_raw_result(s, slice, m.query(slice, query::result_request::result_and_digest, t));
+        };
+
+        { // scenario 1: vN and sN are created with time point tN; each must be returned at tN but no longer at t(N+1)
+            mutation m(pk, s);
+            m.set_clustered_cell(ckey1, *s->get_column_definition("v1"), atomic_cell::make_live(api::new_timestamp(), v1.serialize(), t1, ttl));
+            m.set_clustered_cell(ckey1, *s->get_column_definition("v2"), atomic_cell::make_live(api::new_timestamp(), v2.serialize(), t2, ttl));
+            m.set_clustered_cell(ckey1, *s->get_column_definition("v3"), atomic_cell::make_live(api::new_timestamp(), v3.serialize(), t3, ttl));
+            m.set_static_cell(*s->get_column_definition("s1"), atomic_cell::make_live(api::new_timestamp(), v1.serialize(), t1, ttl));
+            m.set_static_cell(*s->get_column_definition("s2"), atomic_cell::make_live(api::new_timestamp(), v2.serialize(), t2, ttl));
+            m.set_static_cell(*s->get_column_definition("s3"), atomic_cell::make_live(api::new_timestamp(), v3.serialize(), t3, ttl));
+
+            assert_that(results_at_time(m, t1)) // at t1 every cell is still visible
+                    .has_only(a_row()
+                         .with_column("s1", v1)
+                         .with_column("s2", v2)
+                         .with_column("s3", v3)
+                         .with_column("v1", v1)
+                         .with_column("v2", v2)
+                         .with_column("v3", v3)
+                         .and_only_that());
+
+            assert_that(results_at_time(m, t2)) // at t2 the t1 cells (s1, v1) are gone
+                    .has_only(a_row()
+                         .with_column("s2", v2)
+                         .with_column("s3", v3)
+                         .with_column("v2", v2)
+                         .with_column("v3", v3)
+                         .and_only_that());
+
+            assert_that(results_at_time(m, t3)) // at t3 only the t3 cells remain
+                    .has_only(a_row()
+                         .with_column("s3", v3)
+                         .with_column("v3", v3)
+                         .and_only_that());
+
+            assert_that(results_at_time(m, t4)).is_empty(); // at t4 nothing is left, so no row is returned at all
+        }
+
+        { // scenario 2: the only clustered cell (t1) disappears before the only static cell (t3)
+            mutation m(pk, s);
+            m.set_clustered_cell(ckey1, *s->get_column_definition("v1"), atomic_cell::make_live(api::new_timestamp(), v1.serialize(), t1, ttl));
+            m.set_static_cell(*s->get_column_definition("s1"), atomic_cell::make_live(api::new_timestamp(), v1.serialize(), t3, ttl));
+
+            assert_that(results_at_time(m, t2)) // the static cell alone must still produce a row
+                    .has_only(a_row().with_column("s1", v1).and_only_that());
+
+            assert_that(results_at_time(m, t4)).is_empty();
+        }
+    });
+}
+
 SEASTAR_TEST_CASE(test_tombstone_purge) {
    auto builder = schema_builder("tests", "tombstone_purge")
        .with_column("id", utf8_type, column_kind::partition_key)
--- a/tests/perf/perf_simple_query.cc
+++ b/tests/perf/perf_simple_query.cc
@@ -50,6 +50,7 @@ struct test_config {
    unsigned partitions;
    unsigned concurrency;
    bool query_single_key;
+    unsigned duration_in_seconds;
 };

 std::ostream& operator<<(std::ostream& os, const test_config::run_mode& m) {
@@ -79,7 +80,7 @@ future<> test_read(cql_test_env& env, test_config& cfg) {
        return time_parallel([&env, &cfg, id] {
            bytes key = make_key(cfg.query_single_key ? 0 : std::rand() % cfg.partitions);
            return env.execute_prepared(id, {{std::move(key)}}).discard_result();
-        }, cfg.concurrency);
+        }, cfg.concurrency, cfg.duration_in_seconds);
    });
 }

@@ -95,7 +96,7 @@ future<> test_write(cql_test_env& env, test_config& cfg) {
            return time_parallel([&env, &cfg, id] {
                bytes key = make_key(cfg.query_single_key ? 0 : std::rand() % cfg.partitions);
                return env.execute_prepared(id, {{std::move(key)}}).discard_result();
-            }, cfg.concurrency);
+            }, cfg.concurrency, cfg.duration_in_seconds);
        });
 }

@@ -125,6 +126,7 @@ int main(int argc, char** argv) {
    app.add_options()
        ("partitions", bpo::value<unsigned>()->default_value(10000), "number of partitions")
        ("write", "test write path instead of read path")
+        ("duration", bpo::value<unsigned>()->default_value(5), "test duration in seconds")
        ("query-single-key", "test write path instead of read path")
        ("concurrency", bpo::value<unsigned>()->default_value(100), "workers per core");

@@ -132,6 +134,7 @@ int main(int argc, char** argv) {
        make_env_for_test().then([&app] (auto env) {
            auto cfg = make_lw_shared<test_config>();
            cfg->partitions = app.configuration()["partitions"].as<unsigned>();
+            cfg->duration_in_seconds = app.configuration()["duration"].as<unsigned>();
            cfg->concurrency = app.configuration()["concurrency"].as<unsigned>();
            cfg->mode = app.configuration().count("write") ? test_config::run_mode::write : test_config::run_mode::read;
            cfg->query_single_key = app.configuration().count("query-single-key");
--- a/tests/result_set_assertions.cc
+++ b/tests/result_set_assertions.cc
@@ -49,6 +49,14 @@ row_assertion::matches(const query::result_set_row& row) const {
            }
        }
    }
+    if (_only_that) {
+        for (auto&& e : row.cells()) {
+            auto name = to_bytes(e.first);
+            if (!_expected_values.count(name)) {
+                return false;
+            }
+        }
+    }
    return true;
 }

--- a/tests/result_set_assertions.hh
+++ b/tests/result_set_assertions.hh
@@ -36,11 +36,16 @@

 class row_assertion {
    std::map<bytes, data_value> _expected_values;
+    bool _only_that = false;
 public:
    row_assertion& with_column(bytes name, data_value value) {
        _expected_values.emplace(name, value);
        return *this;
    }
+    row_assertion& and_only_that() {
+        _only_that = true;
+        return *this;
+    }
 private:
    friend class result_set_assertions;
    bool matches(const query::result_set_row& row) const;
--- a/tests/row_cache_alloc_stress.cc
+++ b/tests/row_cache_alloc_stress.cc
@@ -106,13 +106,17 @@ int main(int argc, char** argv) {
                keys.push_back(key);
            }

+            auto reclaimable_memory = [] {
+                return memory::stats().free_memory() + logalloc::shard_tracker().occupancy().free_space();
+            };
+
            std::cout << "memtable occupancy: " << mt->occupancy() << "\n";
            std::cout << "Cache occupancy: " << tracker.region().occupancy() << "\n";
-            std::cout << "Free memory: " << memory::stats().free_memory() << "\n";
+            std::cout << "Reclaimable memory: " << reclaimable_memory() << "\n";

            // We need to have enough Free memory to copy memtable into cache
            // When this assertion fails, increase amount of memory
-            assert(mt->occupancy().used_space() < memory::stats().free_memory());
+            assert(mt->occupancy().used_space() < reclaimable_memory());

            auto checker = [](const partition_key& key) {
                return partition_presence_checker_result::maybe_exists;
@@ -146,13 +150,14 @@ int main(int argc, char** argv) {
                for (auto&& key : keys) {
                    cache.touch(key);
                }
-                std::cout << "Free memory: " << memory::stats().free_memory() << "\n";
+                std::cout << "Reclaimable memory: " << reclaimable_memory() << "\n";
                std::cout << "Cache occupancy: " << tracker.region().occupancy() << "\n";
            };

            std::deque<std::unique_ptr<char[]>> stuffing;
            auto fragment_free_space = [&] {
                stuffing.clear();
+                std::cout << "Reclaimable memory: " << reclaimable_memory() << "\n";
                std::cout << "Free memory: " << memory::stats().free_memory() << "\n";
                std::cout << "Cache occupancy: " << tracker.region().occupancy() << "\n";

@@ -165,6 +170,7 @@ int main(int argc, char** argv) {
                }

                std::cout << "After fragmenting:\n";
+                std::cout << "Reclaimable memory: " << reclaimable_memory() << "\n";
                std::cout << "Free memory: " << memory::stats().free_memory() << "\n";
                std::cout << "Cache occupancy: " << tracker.region().occupancy() << "\n";
            };
--- a/tests/sstable_datafile_test.cc
+++ b/tests/sstable_datafile_test.cc
@@ -1031,7 +1031,7 @@ SEASTAR_TEST_CASE(compaction_manager_test) {
        }).then([cf, cm] {
            // remove cf from compaction manager; this will wait for the
            // ongoing compaction to finish.
-            return cm->remove(&*cf).then([cf, cm] {
+            return cf->stop().then([cf, cm] {
                // expect sstables of cf to be compacted.
                BOOST_REQUIRE(cf->sstables_count() == 1);
                // stop all compaction manager tasks.
--- a/tests/sstable_mutation_test.cc
+++ b/tests/sstable_mutation_test.cc
@@ -474,3 +474,109 @@ SEASTAR_TEST_CASE(broken_ranges_collection) {
        });
    });
 }
+
+// Scylla does not currently support generic range-tombstone - only ranges
+// which are a complete clustering-key prefix are supported because our
+// row_tombstone only works on whole rows. This is good enough because
+// in Cassandra 2 (whose sstables we support) there is no way using CQL to
+// create a generic range, because the DELETE and UPDATE statement's "WHERE"
+// only takes the "=" operator, leading to a deletion of entire rows.
+//
+// However, in one important case the sstable written by Cassandra might look
+// like it has generic range tombstone: consider two overlapping tombstones,
+// one deleting a bigger prefix than the other:
+//
+//     create COLUMNFAMILY tab (pk text, ck1 text, ck2 text, data text, primary key(pk, ck1, ck2));
+//     delete from tab where pk = 'pk' and ck1 = 'aaa';
+//     delete from tab where pk = 'pk' and ck1 = 'aaa' and ck2 = 'bbb';
+//
+// The first deletion covers the second, but nevertheless we cannot drop the
+// smaller one because the two deletions have different timestamps. But while
+// it is not allowed to drop the smaller deletion, it is possible to split the
+// larger range into three ranges where one of them is the smaller range
+// and then we have two range tombstones with identical ranges - and can keep
+// only the newer one. This splitting is what Cassandra does: Cassandra does
+// not want to have overlapping range tombstones, so it converts them (see
+// RangeTombstoneList.java) into non-overlapping range-tombstones, as described
+// above. In the above example, the resulting sstable is (sstable2json format)
+//
+//     {"key": "pk",
+//      "cells": [["aaa:_","aaa:bbb:_",1459334681228103,"t",1459334681],
+//                ["aaa:bbb:_","aaa:bbb:!",1459334681244989,"t",1459334681],
+//                ["aaa:bbb:!","aaa:!",1459334681228103,"t",1459334681]]}
+//               ]
+//
+// Note that the middle tombstone has a different timestamp than the other.
+//
+// In this sstable, the first and third tombstones look like "generic" ranges,
+// not covering an entire prefix, so we cannot represent these three
+// tombstones in our in-memory data structure. Instead, we need to convert the
+// three non-overlapping tombstones to two overlapping whole-prefix tombstones,
+// the two we started with.
+// That is what this test tests - we read an sstable as above and verify that
+// our sstable reading code converted it to two overlapping tombstones.
+
+static schema_ptr tombstone_overlap_schema() { // Schema of table try1.tab: partition key pk, clustering keys ck1 and ck2 (all utf8), no other columns.
+    static thread_local auto s = [] { // initialized by this lambda the first time the function runs on a thread, then reused
+        schema_builder builder(make_lw_shared(schema(generate_legacy_id("try1", "tab"), "try1", "tab",
+        // partition key
+        {{"pk", utf8_type}},
+        // clustering key
+        {{"ck1", utf8_type}, {"ck2", utf8_type}},
+        // regular columns
+        {},
+        // static columns
+        {},
+        // regular column name type
+        utf8_type,
+        // comment
+        ""
+       )));
+       return builder.build(schema_builder::compact_storage::no);
+    }();
+    return s;
+}
+
+
+static future<sstable_ptr> ka_sst(sstring ks, sstring cf, sstring dir, unsigned long generation) { // Loads the `ka`-version, `big`-format sstable identified by the arguments and resolves to it.
+    auto sst = make_lw_shared<sstable>(ks, cf, dir, generation, sstables::sstable::version_types::ka, big);
+    auto fut = sst->load(); // load() is started before sst is moved into the continuation below
+    return std::move(fut).then([sst = std::move(sst)] {
+        return make_ready_future<sstable_ptr>(std::move(sst));
+    });
+}
+
+SEASTAR_TEST_CASE(tombstone_in_tombstone) { // Reads the tombstone_overlap sstable and checks it yields one row tombstone plus one deleted row.
+    return ka_sst("try1", "tab", "tests/sstables/tombstone_overlap", 1).then([] (auto sstp) {
+        auto s = tombstone_overlap_schema();
+        return do_with(sstp->read_rows(s), [sstp, s] (auto& reader) {
+            return repeat([sstp, s, &reader] { // sstp is captured so the sstable stays referenced while the reader is in use
+                return reader.read().then([s] (mutation_opt mut) {
+                    if (!mut) {
+                        return stop_iteration::yes; // reader exhausted
+                    }
+                    BOOST_REQUIRE((bytes_view(mut->key()) == bytes{'\x00','\x02','p','k'}));
+                    // We expect to see two overlapping deletions, as explained
+                    // above. Somewhat counterintuitively, scylla represents
+                    // deleting a small row with all clustering keys set - not
+                    // as a "row tombstone" but rather as a deleted clustering row.
+                    // So we expect to see one row tombstone and one deleted row.
+                    auto& rts = mut->partition().row_tombstones();
+                    BOOST_REQUIRE(rts.size() == 1);
+                    for (auto&& e : rts) { // bind by reference: the entry is only inspected, so copying it is wasted work
+                        BOOST_REQUIRE((bytes_view(e.prefix()) == bytes{'\x00','\x03','a','a','a'}));
+                        BOOST_REQUIRE(e.t().timestamp == 1459334681228103LL);
+                    }
+                    auto& rows = mut->partition().clustered_rows();
+                    BOOST_REQUIRE(rows.size() == 1);
+                    for (auto&& e : rows) { // bind by reference here as well, instead of copying the row entry
+                        BOOST_REQUIRE((bytes_view(e.key()) == bytes{'\x00','\x03','a','a','a', '\x00', '\x03', 'b', 'b', 'b'}));
+                        BOOST_REQUIRE(e.row().deleted_at().timestamp == 1459334681244989LL);
+                    }
+
+                    return stop_iteration::no;
+                });
+            });
+        });
+    });
+}
--- a/tests/sstable_test.cc
+++ b/tests/sstable_test.cc
@@ -170,6 +170,33 @@ SEASTAR_TEST_CASE(big_summary_query_32) {
    return summary_query<32, 0xc4000, 182>("tests/sstables/bigsummary", 76);
 }

+// The following two files are just a copy of uncompressed's 1. But the Summary
+// is removed (and removed from the TOC as well). We should reconstruct it
+// in this case, so the queries should still go through
+SEASTAR_TEST_CASE(missing_summary_query_ok) { // generation 2 of "uncompressed" is the sstable stored without a Summary component
+    return summary_query<0, 0, 5>("tests/sstables/uncompressed", 2);
+}
+
+SEASTAR_TEST_CASE(missing_summary_query_fail) { // same Summary-less sstable; NOTE(review): presumably expects the query for entry 2 to be rejected - confirm against summary_query_fail
+    return summary_query_fail<2, 0, 5>("tests/sstables/uncompressed", 2);
+}
+
+SEASTAR_TEST_CASE(missing_summary_query_negative_fail) { // same Summary-less sstable, queried with a negative first template argument
+    return summary_query_fail<-2, 0, 5>("tests/sstables/uncompressed", 2);
+}
+
+SEASTAR_TEST_CASE(missing_summary_first_last_sane) { // Loads the Summary-less generation 2 and checks the summary held in memory afterwards.
+    return reusable_sst("tests/sstables/uncompressed", 2).then([] (sstable_ptr ptr) {
+        auto& summary = sstables::test(ptr).get_summary();
+        BOOST_REQUIRE(summary.header.size == 1); // exactly one summary entry is expected, consistently across all three fields
+        BOOST_REQUIRE(summary.positions.size() == 1);
+        BOOST_REQUIRE(summary.entries.size() == 1);
+        BOOST_REQUIRE(bytes_view(summary.first_key) == as_bytes("vinna"));
+        BOOST_REQUIRE(bytes_view(summary.last_key) == as_bytes("finna"));
+        return make_ready_future<>();
+    });
+}
+
 static future<sstable_ptr> do_write_sst(sstring load_dir, sstring write_dir, unsigned long generation) {
    auto sst = make_lw_shared<sstable>("ks", "cf", load_dir, generation, la, big);
    return sst->load().then([sst, write_dir, generation] {
@@ -864,16 +891,17 @@ SEASTAR_TEST_CASE(reshuffle) {
            auto cf = make_lw_shared<column_family>(uncompressed_schema(), cfg, column_family::no_commitlog(), *cm);
            cf->start();
            cf->mark_ready_for_writes();
-            return cf->reshuffle_sstables(3).then([cm, cf] (std::vector<sstables::entry_descriptor> reshuffled) {
-                BOOST_REQUIRE(reshuffled.size() == 2);
-                BOOST_REQUIRE(reshuffled[0].generation  == 3);
-                BOOST_REQUIRE(reshuffled[1].generation  == 4);
+            std::set<int64_t> existing_sstables = { 1, 5 };
+            return cf->reshuffle_sstables(existing_sstables, 6).then([cm, cf] (std::vector<sstables::entry_descriptor> reshuffled) {
+                BOOST_REQUIRE(reshuffled.size() == 1);
+                BOOST_REQUIRE(reshuffled[0].generation  == 6);
                return when_all(
                    test_sstable_exists("tests/sstables/generation", 1, true),
                    test_sstable_exists("tests/sstables/generation", 2, false),
-                    test_sstable_exists("tests/sstables/generation", 3, true),
-                    test_sstable_exists("tests/sstables/generation", 4, true),
-                    test_sstable_exists("tests/sstables/generation", 5, false),
+                    test_sstable_exists("tests/sstables/generation", 3, false),
+                    test_sstable_exists("tests/sstables/generation", 4, false),
+                    test_sstable_exists("tests/sstables/generation", 5, true),
+                    test_sstable_exists("tests/sstables/generation", 6, true),
                    test_sstable_exists("tests/sstables/generation", 10, false)
                ).discard_result().then([cm] {
                    return cm->stop();
--- a/tests/sstables/tombstone_overlap/try1-tab-ka-1-CompressionInfo.db
+++ b/tests/sstables/tombstone_overlap/try1-tab-ka-1-CompressionInfo.db
--- a/tests/sstables/tombstone_overlap/try1-tab-ka-1-Data.db
+++ b/tests/sstables/tombstone_overlap/try1-tab-ka-1-Data.db
--- a/tests/sstables/tombstone_overlap/try1-tab-ka-1-Digest.sha1
+++ b/tests/sstables/tombstone_overlap/try1-tab-ka-1-Digest.sha1
@@ -0,0 +1 @@
+4178122188
--- a/tests/sstables/tombstone_overlap/try1-tab-ka-1-Filter.db
+++ b/tests/sstables/tombstone_overlap/try1-tab-ka-1-Filter.db
--- a/tests/sstables/tombstone_overlap/try1-tab-ka-1-Index.db
+++ b/tests/sstables/tombstone_overlap/try1-tab-ka-1-Index.db
--- a/tests/sstables/tombstone_overlap/try1-tab-ka-1-Statistics.db
+++ b/tests/sstables/tombstone_overlap/try1-tab-ka-1-Statistics.db
--- a/tests/sstables/tombstone_overlap/try1-tab-ka-1-Summary.db
+++ b/tests/sstables/tombstone_overlap/try1-tab-ka-1-Summary.db
--- a/tests/sstables/tombstone_overlap/try1-tab-ka-1-TOC.txt
+++ b/tests/sstables/tombstone_overlap/try1-tab-ka-1-TOC.txt
@@ -0,0 +1,8 @@
+TOC.txt
+Filter.db
+CompressionInfo.db
+Index.db
+Digest.sha1
+Summary.db
+Data.db
+Statistics.db
--- a/tests/sstables/uncompressed/la-2-big-CRC.db
+++ b/tests/sstables/uncompressed/la-2-big-CRC.db
--- a/tests/sstables/uncompressed/la-2-big-Data.db
+++ b/tests/sstables/uncompressed/la-2-big-Data.db
--- a/tests/sstables/uncompressed/la-2-big-Digest.sha1
+++ b/tests/sstables/uncompressed/la-2-big-Digest.sha1
@@ -0,0 +1 @@
+748507322
--- a/tests/sstables/uncompressed/la-2-big-Filter.db
+++ b/tests/sstables/uncompressed/la-2-big-Filter.db
--- a/tests/sstables/uncompressed/la-2-big-Index.db
+++ b/tests/sstables/uncompressed/la-2-big-Index.db
--- a/tests/sstables/uncompressed/la-2-big-Statistics.db
+++ b/tests/sstables/uncompressed/la-2-big-Statistics.db
--- a/tests/sstables/uncompressed/la-2-big-TOC.txt
+++ b/tests/sstables/uncompressed/la-2-big-TOC.txt
@@ -0,0 +1,7 @@
+Data.db
+Filter.db
+CRC.db
+Statistics.db
+Digest.sha1
+Index.db
+TOC.txt
--- a/tests/storage_proxy_test.cc
+++ b/tests/storage_proxy_test.cc
@@ -40,8 +40,7 @@ static query::result to_data_query_result(mutation_reader& reader, const query::
        if (!mo) {
            break;
        }
-        auto pb = builder.add_partition(*mo->schema(), mo->key());
-        mo->partition().query(pb, *mo->schema(), now);
+        std::move(*mo).query(builder, slice, now);
    }
    return builder.build();
 }
--- a/Show More
+++ b/Show More