Compare commits


47 Commits

Author SHA1 Message Date
Nadav Har'El
8d8932cb15 sstable: fix use-after-free of temporary ioclass copy
Commit 6a3872b355 fixed some use-after-free
bugs but introduced a new one because of a typo:

Instead of capturing a reference to the long-living io-class object, as
all the code does, one place in the code accidentally captured a *copy*
of this object. This copy had a very temporary life, and when a reference
to that *copy* was passed to sstable reading code which assumed that it
lives at least as long as the read call, a use-after-free resulted.

Fixes #1072

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <1458595629-9314-1-git-send-email-nyh@scylladb.com>
(cherry picked from commit 2eb0627665)
2016-03-22 08:11:00 +02:00
Asias He
891476dfc6 gossip: Handle unknown application_state when printing
In case an unknown application_state is received, we should be able to
handle it when printing.

Message-Id: <98d2307359292e90c8925f38f67a74b69e45bebe.1458553057.git.asias@scylladb.com>
(cherry picked from commit 7acc9816d2)
2016-03-21 11:59:53 +02:00
Pekka Enberg
346c729531 main: Defer API server hooks until commitlog replay
Defer registering services to the API server until commitlog has been
replayed to ensure that nobody is able to trigger sstable operations via
'nodetool' before we are ready for them.
Message-Id: <1458116227-4671-1-git-send-email-penberg@scylladb.com>

(cherry picked from commit 972fc6e014)
2016-03-18 09:20:45 +02:00
Pekka Enberg
2c06609fc1 main: Fix broadcast_address and listen_address validation errors
Fix the validation error message to look like this:

  Scylla version 666.development-20160316.49af399 starting ...
  WARN  2016-03-17 12:24:15,137 [shard 0] config - Option partitioner is not (yet) used.
  WARN  2016-03-17 12:24:15,138 [shard 0] init - NOFILE rlimit too low (recommended setting 200000, minimum setting 10000; you may run out of file descriptors.
  ERROR 2016-03-17 12:24:15,138 [shard 0] init - Bad configuration: invalid 'listen_address': eth0: boost::exception_detail::clone_impl<boost::exception_detail::error_info_injector<boost::system::system_error> > (Invalid argument)
  Exiting on unhandled exception of type 'bad_configuration_error': std::exception

Instead of:

  Exiting on unhandled exception of type 'boost::exception_detail::clone_impl<boost::exception_detail::error_info_injector<boost::system::system_error> >': Invalid argument

Fixes #1051.

Message-Id: <1458210329-4488-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit 69dacf9063)
2016-03-18 09:00:18 +02:00
Takuya ASADA
0f22c3ebe9 dist: follow sysconfig setting when counting number of cpus on scylla_io_setup
When NR_CPU >= 8, we disabled cpu0 for AMI in scylla_sysconfig_setup.
But scylla_io_setup doesn't know that and tries to assign NR_CPU queues, so scylla fails to start because queues > cpus.
With this fix, scylla_io_setup checks the sysconfig settings; if '--smp <n>' is specified in SCYLLA_ARGS, it uses n to limit the queue count.
Also, when the instance type has no pre-configured parameters, we need to pass the --cpuset parameter to iotune. Otherwise iotune will run on a different set of CPUs, which may have different performance characteristics.

Fixes #996, #1043, #1046

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1458221762-10595-2-git-send-email-syuu@scylladb.com>
(cherry picked from commit 4cc589872d)
2016-03-18 08:57:52 +02:00
Takuya ASADA
ee1ce3c6b4 dist: On scylla_sysconfig_setup, don't disable cpu0 on non-AMI environments
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1458221762-10595-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 6f71173827)
2016-03-18 08:57:42 +02:00
Paweł Dziepak
b2f07c0d44 lsa: update _closed_occupancy after freeing all segments
_closed_occupancy will be used when a region is removed from its region
group; make sure that it is accurate.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
(cherry picked from commit 338fd34770)
2016-03-18 08:11:25 +02:00
Glauber Costa
6c6447c131 sstables: do not assume mutation_reader will be kept alive
Our sstables::mutation_reader has a specialization in which start and end
ranges are passed as futures. That is needed because we may have to read the
index file for those.

This works well under the assumption that every mutation_reader that is
created will be used, since whoever uses it will surely keep the state
of the reader alive.

However, that assumption has not been true for a while. We use a reader
interface for reading everything from mutations and sstables to cache entries,
and when we create an sstable mutation_reader, that does not mean we'll use it.
In fact we won't, if the read can be serviced first by a higher level entity.

If that happens to be the case, the reader will be destructed. However, since
it may take more time than that for the start and end futures to resolve, by
the time they are resolved the state of the mutation reader will no longer be
valid.

The proposed fix for that is to only resolve the future inside
mutation_reader's read() function. If that function is called, we can have a
reasonable expectation that the caller object is being kept alive.

A second way to fix this would be to force the mutation reader to be kept alive
by transforming it into a shared pointer and acquiring a reference to itself.
However, because the reader may turn out not to be used, the delayed read
actually has the advantage of not even reading anything from the disk if there
is no need for it.

Also, because sstables can be compacted, we can't guarantee that the sst
object itself, used in the resolution of start and end, stays alive; it
has the same problem. If we delay the calling of those, we also solve
that problem. We assume here that the outer reader is keeping the SSTable
object alive.

I must note that I have not reproduced this problem. What is written above
is the result of the analysis we made in #1036. That being the case, a
thorough review is appreciated.

Fixes #1036

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <a7e4e722f76774d0b1f263d86c973061fb7fe2f2.1458135770.git.glauber@scylladb.com>
(cherry picked from commit 6a3872b355)
2016-03-18 07:56:35 +02:00
Asias He
ade185e518 storage_service: Update pending ranges when keyspace is changed
If a keyspace is created after we calculate the pending ranges during
bootstrap, we will ignore that keyspace in the pending ranges when handling
write requests for it, which will cause data loss if rf = 1.

Fixes #1000

(cherry picked from commit d63281b256)
2016-03-16 14:29:45 +02:00
Asias He
d7001cad04 migration_manager: Make the migration callbacks run inside a seastar thread
At the moment, the callbacks return void, so it is impossible to wait for
them to complete. Make the callbacks run inside a seastar thread, so if
we need to wait for a callback, we can call foo_operation().get() in it.
That is easier than making the callbacks return future<>.

(cherry picked from commit 93015bcc54)
2016-03-16 14:29:43 +02:00
Takuya ASADA
14504bdb25 dist: do not auto-start scylla-server job on Ubuntu package install time
Fixes #1017

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1458122424-22889-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit f1d18e9980)
2016-03-16 13:56:58 +02:00
Asias He
22682636ae main: Defer initialization of streaming
Streaming is used by bootstrap and repair. Streaming uses the storage_proxy
class to apply the frozen_mutation and the db/column_family class to
invalidate the row cache. Defer the initialization to just before repair
and bootstrap init.
Message-Id: <8e99cf443239dd8e17e6b6284dab171f7a12365c.1458034320.git.asias@scylladb.com>

(cherry picked from commit d79dbfd4e8)
2016-03-15 11:59:32 +02:00
Pekka Enberg
c409e3508e main: Defer REPAIR_CHECKSUM_RANGE RPC verb registration after commitlog replay
Register the REPAIR_CHECKSUM_RANGE messaging service verb handler after
we have replayed the commitlog to avoid responding with bogus checksums.
Message-Id: <1458027934-8546-1-git-send-email-penberg@scylladb.com>

(cherry picked from commit eb13f65949)
2016-03-15 11:59:28 +02:00
Gleb Natapov
f137536c1c main: Defer storage proxy RPC verb registration after commitlog replay
Message-Id: <20160315071229.GM6117@scylladb.com>
(cherry picked from commit 5076f4878b)
2016-03-15 09:41:28 +02:00
Gleb Natapov
d738863ed6 messaging: enable keepalive tcp option for inter-node communication
Some network equipment that does TCP session tracking tend to drop TCP
sessions after a period of inactivity. Use keepalive mechanism to
prevent this from happening for our inter-node communication.

Message-Id: <20160314173344.GI31837@scylladb.com>
(cherry picked from commit e228ef1bd9)
2016-03-14 21:07:43 +02:00
Pekka Enberg
7426cf980e Merge seastar upstream
* seastar 88cc232...0739576 (4):
  > rpc: allow configuring keepalive for rpc client
  > net: add keepalive configuration to socket interface
  > iotune: refuse to run if there is not enough space available
  > rpc: make client connection error more clear
2016-03-14 21:07:05 +02:00
Pekka Enberg
8b88789dfb main: Defer migration manager RPC verb registration after commitlog replay
Defer registering migration manager RPC verbs until after the commitlog
has been replayed so that our own schema is fully loaded before other
nodes start querying it or sending schema updates.
Message-Id: <1457971028-7325-1-git-send-email-penberg@scylladb.com>

(cherry picked from commit 1429213b4c)
2016-03-14 21:06:35 +02:00
Glauber Costa
7dbcd5f2ca main: when scanning SSTables, run shard 0 first
Deletion of previous stale, temporary SSTables is done by Shard0. Therefore,
let's run Shard0 first. Technically, we could just have all shards agree on the
deletion and just delete it later, but that is prone to races.

Those races are not supposed to happen during normal operation, but if we have
bugs, they can. Scylla's Github Issue #1014 is an example of a situation where
that can happen, making existing problems worse. So running a single shard
first and making sure that all temporary tables are deleted provides
extra protection against such situations.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit 6c4e31bbdb)
2016-03-14 21:06:27 +02:00
Gleb Natapov
ffbf02deb5 make initialization run in a thread
While looking at the initialization code I felt like my head was going to
explode. Moving initialization into a thread makes things a little bit
better. Only lightly tested.

Message-Id: <20160310163142.GE28529@scylladb.com>
(cherry picked from commit 16135c2084)
2016-03-14 21:06:19 +02:00
Gleb Natapov
65aa036c75 fix developer-mode parameter application on SMP
I am almost sure we want to apply it once on each shard, and not multiple
times on a single shard.

Message-Id: <20160310155804.GB28529@scylladb.com>
(cherry picked from commit 176aa25d35)
2016-03-14 21:06:12 +02:00
Glauber Costa
6ab1b2d453 database: turn sstable generation number into an optional
This patch makes sure that every time we need to create a new generation
number (the very first step in the creation of a new SSTable), the
respective CF is already initialized and populated. Failure to do so can
lead to data being overwritten. Extensive details about why this is
important can be found in Scylla's Github Issue #1014.

Nothing should be writing to SSTables before we have had the chance to
populate the existing SSTables and calculate what the next generation
number should be.

However, if that happens, we want to protect against it in a way that does not
involve overwriting existing tables. This is one of the ways to do it: every
column family starts in an unwriteable state, and when it can finally be written
to, we mark it as writeable.

Note that this *cannot* be a part of add_column_family. That adds a column family
to a db in memory only, and if anybody is about to write to a CF, that was most
likely already called. We need to call this explicitly when we are sure we're ready
to issue disk operations safely.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit a339296385)
2016-03-14 16:04:23 +02:00
Glauber Costa
11679b28f5 database: remove unused parameter
We are no longer using the in_flight_seals gate, but forgot to remove it.
To guarantee that all seal operations will have finished when we're done,
we are using the memtable_flush_queue, which also guarantees order. But
that gate was never removed.

The FIXME code should also be removed, since such interface does exist now.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit 8eb4e69053)
2016-03-14 16:04:15 +02:00
Glauber Costa
ffb5e6f01e column_family: do not open code generation calculation
We already have a function that wraps this, re-use it.  This FIXME is still
relevant, so just move it there. Let's not lose it.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit 94e90d4a17)
2016-03-14 16:04:08 +02:00
Glauber Costa
6aea747275 column_family: remove mutation_count
We use memory usage as a threshold these days, and nowhere is _mutation_count
checked. Get rid of it.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-03-14 16:04:02 +02:00
Asias He
2a840788fd storage_service: Fix pending_range_calculator_service
Since calculate_pending_ranges will modify token_metadata, we need to
replicate to other shards. With this patch, when we call
calculate_pending_ranges, token_metadata will be replicated to other
non-zero shards.

In addition, it is not useful as a standalone class. We can merge it
into the storage_service. Kill one singleton class.

Fixes #1033
Refs #962
Message-Id: <fb5b26311cafa4d315eb9e72d823c5ade2ab4bda.1457943074.git.asias@scylladb.com>

(cherry picked from commit 9f64c36a08)
2016-03-14 14:38:31 +02:00
Pekka Enberg
6b443db4d9 main: Initialize system keyspace earlier
We start services like the gossiper before the system keyspace is
initialized, which means we can start writing too early. Shuffle code so
that the system keyspace is initialized earlier.

Refs #1014
Message-Id: <1457593758-9444-1-git-send-email-penberg@scylladb.com>

(cherry picked from commit 5dd1fda6cf)
2016-03-14 13:50:15 +02:00
Asias He
9919211c25 storage_service: Do not stop messaging_service more than once
If we do
   - Decommission a node
   - Stop a node
we will shut down messaging_service more than once in:
   - storage_service::decommission
   - storage_service::drain_on_shutdown

Fixes #1005
Refs  #1013

This fixes a dtest failure in debug build.

update_cluster_layout_tests.TestUpdateClusterLayout.simple_decommission_node_1_test/

/data/jenkins/workspace/urchin-dtest/label/monster/mode/debug/scylla/seastar/core/future.hh:802:35:
runtime error: member call on null pointer of type 'struct
future_state'
core/future.hh:334:49: runtime error: member access within null
pointer of type 'const struct future_state'
ASAN:SIGSEGV
=================================================================
==4557==ERROR: AddressSanitizer: SEGV on unknown address
0x000000000000 (pc 0x00000065923e bp 0x7fbf6ffac430 sp 0x7fbf6ffac420
T0)
    #0 0x65923d in future_state<>::available() const
/data/jenkins/workspace/urchin-dtest/label/monster/mode/debug/scylla/seastar/core/future.hh:334
    #1 0x41458f1 in future<>::available()
/data/jenkins/workspace/urchin-dtest/label/monster/mode/debug/scylla/seastar/core/future.hh:802
    #2 0x41458f1 in then_wrapped<parallel_for_each(Iterator, Iterator,
Func&&)::<lambda(parallel_for_each_state&)> [with Iterator =
std::__detail::_Node_iterator<std::pair<const net::msg_addr,
net::messaging_service::shard_info>, false, true>; Func =
net::messaging_service::stop()::<lambda(auto:39&)> [with auto:39 =
std::unordered_map<net::msg_addr, net::messaging_service::shard_info,
net::msg_addr::hash>]::<lambda(std::pair<const net::msg_addr,
net::messaging_service::shard_info>&)>]::<lambda(future<>)>, future<>
> /data/jenkins/workspace/urchin-dtest/label/monster/mode/debug/scylla/seastar/core/future.hh:878

(cherry picked from commit 138c5f5834)
2016-03-14 12:20:57 +02:00
Vlad Zolotarov
c4f73f4e12 sstables: properly account removal requests
The same shard may create an sstables::sstable object, more than once,
for an SStable that doesn't belong to it and mark it for deletion
(e.g. in a 'nodetool refresh' flow).

In that case the destructor of sstables::sstable counted deletion requests
from the same shard more than once, since it used a simple counter
incremented on each deletion request, while it should count requests from
the same shard as a single request. This is because the removal logic
waited for all shards to agree on the removal of a specific SStable by
comparing the counter mentioned above to the total number of shards; once
they were equal, the SStable files were actually removed.

This patch fixes this by replacing the counter with a std::unordered_set<unsigned>
that stores the ids of the shards requesting deletion of the sstable object,
and compares the size() of this set to smp::count in order to decide whether
to actually delete the corresponding SStable files.

Fixes #1004

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
Message-Id: <1457886812-32345-1-git-send-email-vladz@cloudius-systems.com>
(cherry picked from commit ce47fcb1ba)
2016-03-14 12:00:10 +02:00
Raphael S. Carvalho
dcd62cc0be sstables: make write_simple() safer by using exclusive flag
We should guarantee that write_simple() will not try to overwrite
an existing file.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <194bd055f1f2dc1bb9766a67225ec38c88e7b005.1457818073.git.raphaelsc@scylladb.com>
(cherry picked from commit 1ff7d32272)
2016-03-14 12:00:03 +02:00
Raphael S. Carvalho
dcd2b85e02 sstables: fix race condition when writing to the same sstable in parallel
When we are about to write a new sstable, we check if the sstable exists
by checking if the respective TOC exists. That check was added to handle a
possible attempt to write a new sstable with a generation that is already
in use.
Gleb was worried that a TOC could appear after the check, and that's indeed
possible if there is an ongoing sstable write that uses the same generation
(running in parallel).
If the TOC appears after the check, we would again overwrite an existing
sstable with a temporary one, and the user wouldn't be able to boot scylla
anymore without manual intervention.

Then Nadav proposed the following solution:
"We could do this by the following variant of Raphael's idea:

   1. create .txt.tmp unconditionally, as before the commit 031bf57c1
(if we can't create it, fail).
   2. Now confirm that .txt does not exist. If it does, delete the .txt.tmp
we just created and fail.
   3. continue as usual
   4. and at the end, as before, rename .txt.tmp to .txt.

The key to solving the race is step 1: Since we created .txt.tmp in step 1
and know this creation succeeded, we know that we cannot be running in
parallel with another writer - because such a writer too would have tried to
create the same file, and kept it existing until the very last step of its
work (step 4)."

This patch implements the solution described above.
Let me also note that the race is theoretical; scylla hasn't been affected
by it so far.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <ef630f5ac1bd0d11632c343d9f77a5f6810d18c1.1457818331.git.raphaelsc@scylladb.com>
(cherry picked from commit 0af786f3ea)
2016-03-14 11:59:55 +02:00
Raphael S. Carvalho
1d1416f841 sstables: bail out if toc exists for generation used by write_components
Currently, if sstable::write_components() is called to write a new sstable
using the same generation as an existing sstable, a temporary TOC will
be unconditionally created. Afterwards, the same sstable::write_components()
call will fail when it reaches sstable::create_data(), for the obvious
reason that a data component already exists for that generation (in this
scenario). After that, the user will not be able to boot scylla anymore
because there is a generation with both a TOC and a temporary TOC. We
cannot simply remove a generation with a TOC and a temporary TOC because
user data would be lost (again, in this scenario). After all, the temporary
TOC was only created because sstable::write_components() was wrongly called
with the generation of an existing sstable.

The solution proposed by this patch is to throw an exception if a TOC file
exists for the generation used.

Some SSTable unit tests were also changed to guarantee that we don't try
to overwrite components of an existing sstable.

Refs #1014.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <caffc4e19cdcf25e4c6b9dd277d115422f8246c4.1457643565.git.raphaelsc@scylladb.com>
(cherry picked from commit 031bf57c19)
2016-03-14 11:59:46 +02:00
Glauber Costa
be552139ce sstables: improve error messages
The standard C++ exceptions thrown when anything goes wrong writing a file
have suboptimal messages: they barely tell us the name of the failing
file.

Use a specialized create function so that we can capture that better.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit f2a8bcabc2)
2016-03-14 11:59:39 +02:00
Avi Kivity
1b45b5d649 Merge seastar upstream
* seastar 906b562...88cc232 (2):
  > reactor: fix work item leak in syscall work queue
  > rpc_test: add missing header
2016-03-14 11:16:22 +02:00
Tomasz Grabiec
7c1268765c log: Fix operator<<(std::ostream&, const std::exception_ptr&)
An attempt to print a std::nested_exception currently results in an
exception leaking outside the printer. Fix by catching all exceptions in
the final catch block.

For a nested exception, the logger will now print just
"std::nested_exception". For nested exceptions specifically we should
log more, but that is a separate problem to solve.
Message-Id: <1457532215-7498-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 838a038cbd)
2016-03-09 16:10:27 +02:00
Pekka Enberg
9ef84d1f01 types: Implement to_string for timestamps and dates
The to_string() function is used for logging purposes, so use boost's
to_iso_extended_string() to format both timestamps and dates.

Fixes #968 (showstopper)
Message-Id: <1457528755-6164-1-git-send-email-penberg@scylladb.com>

(cherry picked from commit ab502bcfa8)
2016-03-09 16:10:18 +02:00
Calle Wilund
4e3b98f281 lists.cc: fix update insert of frozen list
Fixes #967

Frozen lists are just atomic cells. However, the old code inserted the
frozen data directly as an atomic_cell_or_collection, which in turn
meant it lacked the header data of a cell. When it was later
handled by internal serialization (freeze), since the schema said
it was not a (non-frozen) collection, we tried to interpret the frozen
list data as a cell header, so the cell was most likely considered dead.
Message-Id: <1457432538-28836-1-git-send-email-calle@scylladb.com>

(cherry picked from commit 8575f1391f)
2016-03-08 15:36:29 +02:00
Pekka Enberg
124489e8d8 Update scylla-ami submodule
* dist/ami/files/scylla-ami d4a0e18...84bcd0d (1):
  > Add --ami parameter

(cherry picked from commit 81af486b69)
2016-03-08 14:10:53 +02:00
Takuya ASADA
7a2c57d6bd dist: export all entries on /etc/default/scylla-server on Ubuntu
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
(cherry picked from commit 18a27de3c8)
2016-03-08 14:10:46 +02:00
Takuya ASADA
10543bf81e dist: export sysconfig for scylla-io-setup.service
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
(cherry picked from commit 9ee14abf24)
2016-03-08 14:10:22 +02:00
Takuya ASADA
579a220162 Revert "Revert "dist: align ami option with others (-a --> --ami)""
This reverts commit 66c5feb9e9.

Conflicts:
	dist/common/scripts/scylla_sysconfig_setup

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
(cherry picked from commit 3d9dc52f5f)
2016-03-08 14:10:14 +02:00
Takuya ASADA
8c5ffb84ce Revert "Revert "Revert "dist: remove AMI entry from sysconfig, since there is no script referring to it"""
This reverts commit 643beefc8c.

Conflicts:
	dist/common/scripts/scylla_sysconfig_setup
	dist/common/sysconfig/scylla-server

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
(cherry picked from commit c9882bc2c4)
2016-03-08 14:10:05 +02:00
Takuya ASADA
d05cdb0f6e dist: add /etc/scylla.d/io.conf on Ubuntu
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
(cherry picked from commit c888eaac74)
2016-03-08 14:09:58 +02:00
Gleb Natapov
df02fb7a3e fix EACH_QUORUM handling during bootstrapping
Currently, write acknowledgement handling does not take the bootstrapping
node into account for CL=EACH_QUORUM. This patch fixes that.

Fixes #994

Message-Id: <20160307121620.GR2253@scylladb.com>
(cherry picked from commit 626c9d046b)
2016-03-08 13:35:58 +02:00
Gleb Natapov
559a8b41f2 log: add space between log level and date in the output
It was dropped by 6dc51027a3

Message-Id: <20160306125313.GI2253@scylladb.com>
(cherry picked from commit 8dad399256)
2016-03-08 13:31:07 +02:00
Paweł Dziepak
8b1f18ee1a lsa: set _active to nullptr in region destructor
In the region destructor, after the active segment is freed, the pointer
to it is left unchanged. This confuses the remaining parts of the
destructor logic (namely, removal from the region group), which may rely
on the information in region_impl::_active.

In this particular case the problem was that code removing from the
region group called region_impl::occupancy() which was
dereferencing _active if not null.

Fixes #993.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1457341670-18266-1-git-send-email-pdziepak@scylladb.com>
(cherry picked from commit 99b61d3944)
2016-03-08 13:30:37 +02:00
Takuya ASADA
cbbd18a249 dist: show message to use XFS for scylla data directory and also notify about developer mode, when iotune fails
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1457426286-15925-1-git-send-email-syuu@scylladb.com>
2016-03-08 12:21:02 +02:00
Pekka Enberg
4db985e505 release: prepare for 0.19 2016-03-06 13:26:44 +02:00
139 changed files with 1260 additions and 4393 deletions

.gitmodules

@@ -1,6 +1,6 @@
[submodule "seastar"]
path = seastar
url = ../scylla-seastar
url = ../seastar
ignore = dirty
[submodule "swagger-ui"]
path = swagger-ui


@@ -1,6 +1,6 @@
#!/bin/sh
VERSION=1.0.4
VERSION=0.19
if test -f version
then


@@ -588,8 +588,6 @@ void set_storage_service(http_context& ctx, routes& r) {
auto val_str = req->get_query_param("value");
bool value = (val_str == "True") || (val_str == "true") || (val_str == "1");
return service::get_local_storage_service().db().invoke_on_all([value] (database& db) {
db.set_enable_incremental_backups(value);
// Change both KS and CF, so they are in sync
for (auto& pair: db.get_keyspaces()) {
auto& ks = pair.second;


@@ -54,9 +54,9 @@ class atomic_cell_or_collection;
*/
class atomic_cell_type final {
private:
static constexpr int8_t DEAD_FLAGS = 0;
static constexpr int8_t LIVE_FLAG = 0x01;
static constexpr int8_t EXPIRY_FLAG = 0x02; // When present, expiry field is present. Set only for live cells
static constexpr int8_t REVERT_FLAG = 0x04; // transient flag used to efficiently implement ReversiblyMergeable for atomic cells.
static constexpr unsigned flags_size = 1;
static constexpr unsigned timestamp_offset = flags_size;
static constexpr unsigned timestamp_size = 8;
@@ -67,21 +67,14 @@ private:
static constexpr unsigned ttl_offset = expiry_offset + expiry_size;
static constexpr unsigned ttl_size = 4;
private:
static bool is_revert_set(bytes_view cell) {
return cell[0] & REVERT_FLAG;
}
template<typename BytesContainer>
static void set_revert(BytesContainer& cell, bool revert) {
cell[0] = (cell[0] & ~REVERT_FLAG) | (revert * REVERT_FLAG);
}
static bool is_live(const bytes_view& cell) {
return cell[0] & LIVE_FLAG;
return cell[0] != DEAD_FLAGS;
}
static bool is_live_and_has_ttl(const bytes_view& cell) {
return cell[0] & EXPIRY_FLAG;
}
static bool is_dead(const bytes_view& cell) {
return !is_live(cell);
return cell[0] == DEAD_FLAGS;
}
// Can be called on live and dead cells
static api::timestamp_type timestamp(const bytes_view& cell) {
@@ -113,7 +106,7 @@ private:
}
static managed_bytes make_dead(api::timestamp_type timestamp, gc_clock::time_point deletion_time) {
managed_bytes b(managed_bytes::initialized_later(), flags_size + timestamp_size + deletion_time_size);
b[0] = 0;
b[0] = DEAD_FLAGS;
set_field(b, timestamp_offset, timestamp);
set_field(b, deletion_time_offset, deletion_time.time_since_epoch().count());
return b;
@@ -147,11 +140,8 @@ protected:
ByteContainer _data;
protected:
atomic_cell_base(ByteContainer&& data) : _data(std::forward<ByteContainer>(data)) { }
friend class atomic_cell_or_collection;
atomic_cell_base(const ByteContainer& data) : _data(data) { }
public:
bool is_revert_set() const {
return atomic_cell_type::is_revert_set(_data);
}
bool is_live() const {
return atomic_cell_type::is_live(_data);
}
@@ -197,13 +187,10 @@ public:
bytes_view serialize() const {
return _data;
}
void set_revert(bool revert) {
atomic_cell_type::set_revert(_data, revert);
}
};
class atomic_cell_view final : public atomic_cell_base<bytes_view> {
atomic_cell_view(bytes_view data) : atomic_cell_base(std::move(data)) {}
atomic_cell_view(bytes_view data) : atomic_cell_base(data) {}
public:
static atomic_cell_view from_bytes(bytes_view data) { return atomic_cell_view(data); }
@@ -211,11 +198,6 @@ public:
friend std::ostream& operator<<(std::ostream& os, const atomic_cell_view& acv);
};
class atomic_cell_ref final : public atomic_cell_base<managed_bytes&> {
public:
atomic_cell_ref(managed_bytes& buf) : atomic_cell_base(buf) {}
};
class atomic_cell final : public atomic_cell_base<managed_bytes> {
atomic_cell(managed_bytes b) : atomic_cell_base(std::move(b)) {}
public:


@@ -27,18 +27,16 @@
#include "atomic_cell.hh"
#include "hashing.hh"
template<>
struct appending_hash<collection_mutation_view> {
template<typename Hasher>
void operator()(Hasher& h, collection_mutation_view cell) const {
auto m_view = collection_type_impl::deserialize_mutation_form(cell);
::feed_hash(h, m_view.tomb);
for (auto&& key_and_value : m_view.cells) {
::feed_hash(h, key_and_value.first);
::feed_hash(h, key_and_value.second);
}
template<typename Hasher>
void feed_hash(collection_mutation_view cell, Hasher& h, const data_type& type) {
auto&& ctype = static_pointer_cast<const collection_type_impl>(type);
auto m_view = ctype->deserialize_mutation_form(cell);
::feed_hash(h, m_view.tomb);
for (auto&& key_and_value : m_view.cells) {
::feed_hash(h, key_and_value.first);
::feed_hash(h, key_and_value.second);
}
};
}
template<>
struct appending_hash<atomic_cell_view> {
@@ -57,19 +55,3 @@ struct appending_hash<atomic_cell_view> {
}
}
};
template<>
struct appending_hash<atomic_cell> {
template<typename Hasher>
void operator()(Hasher& h, const atomic_cell& cell) const {
feed_hash(h, static_cast<atomic_cell_view>(cell));
}
};
template<>
struct appending_hash<collection_mutation> {
template<typename Hasher>
void operator()(Hasher& h, const collection_mutation& cm) const {
feed_hash(h, static_cast<collection_mutation_view>(cm));
}
};


@@ -27,8 +27,6 @@
// A variant type that can hold either an atomic_cell, or a serialized collection.
// Which type is stored is determined by the schema.
// Has an "empty" state.
// Objects moved-from are left in an empty state.
class atomic_cell_or_collection final {
managed_bytes _data;
private:
@@ -38,7 +36,6 @@ public:
atomic_cell_or_collection(atomic_cell ac) : _data(std::move(ac._data)) {}
static atomic_cell_or_collection from_atomic_cell(atomic_cell data) { return { std::move(data._data) }; }
atomic_cell_view as_atomic_cell() const { return atomic_cell_view::from_bytes(_data); }
atomic_cell_ref as_atomic_cell_ref() { return { _data }; }
atomic_cell_or_collection(collection_mutation cm) : _data(std::move(cm.data)) {}
explicit operator bool() const {
return !_data.empty();


@@ -162,7 +162,6 @@ modes = {
scylla_tests = [
'tests/mutation_test',
'tests/schema_registry_test',
'tests/canonical_mutation_test',
'tests/range_test',
'tests/types_test',
@@ -265,6 +264,7 @@ add_tristate(arg_parser, name = 'xen', dest = 'xen', help = 'Xen support')
args = arg_parser.parse_args()
defines = []
scylla_libs = '-llz4 -lsnappy -lz -lboost_thread -lcryptopp -lrt -lyaml-cpp -lboost_date_time'
extra_cxxflags = {}
@@ -698,7 +698,7 @@ for mode in build_modes:
seastar_deps = 'practically_anything_can_change_so_lets_run_it_every_time_and_restat.'
args.user_cflags += " " + pkg_config("--cflags", "jsoncpp")
libs = "-lyaml-cpp -llz4 -lz -lsnappy " + pkg_config("--libs", "jsoncpp") + ' -lboost_filesystem' + ' -lcrypt' + ' -lboost_date_time'
libs = "-lyaml-cpp -llz4 -lz -lsnappy " + pkg_config("--libs", "jsoncpp") + ' -lboost_filesystem' + ' -lcrypt'
for pkg in pkgs:
args.user_cflags += ' ' + pkg_config('--cflags', pkg)
libs += ' ' + pkg_config('--libs', pkg)
@@ -846,8 +846,7 @@ with open(buildfile, 'w') as f:
for obj in compiles:
src = compiles[obj]
gen_headers = list(ragels.keys())
gen_headers += ['seastar/build/{}/gen/http/request_parser.hh'.format(mode)]
gen_headers += ['seastar/build/{}/gen/http/http_response_parser.hh'.format(mode)]
gen_headers += ['seastar/build/{}/http/request_parser.hh'.format(mode)]
for th in thrifts:
gen_headers += th.headers('$builddir/{}/gen'.format(mode))
for g in antlr3_grammars:
@@ -879,10 +878,10 @@ with open(buildfile, 'w') as f:
for cc in grammar.sources('$builddir/{}/gen'.format(mode)):
obj = cc.replace('.cpp', '.o')
f.write('build {}: cxx.{} {} || {}\n'.format(obj, mode, cc, ' '.join(serializers)))
f.write('build seastar/build/{mode}/libseastar.a seastar/build/{mode}/apps/iotune/iotune seastar/build/{mode}/gen/http/request_parser.hh seastar/build/{mode}/gen/http/http_response_parser.hh: ninja {seastar_deps}\n'
f.write('build seastar/build/{mode}/libseastar.a seastar/build/{mode}/apps/iotune/iotune: ninja {seastar_deps}\n'
.format(**locals()))
f.write(' subdir = seastar\n')
f.write(' target = build/{mode}/libseastar.a build/{mode}/apps/iotune/iotune build/{mode}/gen/http/request_parser.hh build/{mode}/gen/http/http_response_parser.hh\n'.format(**locals()))
f.write(' target = build/{mode}/libseastar.a build/{mode}/apps/iotune/iotune\n'.format(**locals()))
f.write(textwrap.dedent('''\
build build/{mode}/iotune: copy seastar/build/{mode}/apps/iotune/iotune
''').format(**locals()))
@@ -896,6 +895,10 @@ with open(buildfile, 'w') as f:
command = find -name '*.[chS]' -o -name "*.cc" -o -name "*.hh" | cscope -bq -i-
description = CSCOPE
build cscope: cscope
rule request_parser_hh
command = {ninja} -C seastar build/release/gen/http/request_parser.hh build/debug/gen/http/request_parser.hh
description = GEN seastar/http/request_parser.hh
build seastar/build/release/http/request_parser.hh seastar/build/debug/http/request_parser.hh: request_parser_hh
rule clean
command = rm -rf build
description = CLEAN


@@ -423,9 +423,10 @@ void query_processor::migration_subscriber::on_update_keyspace(const sstring& ks
void query_processor::migration_subscriber::on_update_column_family(const sstring& ks_name, const sstring& cf_name, bool columns_changed)
{
// #1255: Ignoring columns_changed deliberately.
log.info("Column definitions for {}.{} changed, invalidating related prepared statements", ks_name, cf_name);
remove_invalid_prepared_statements(ks_name, cf_name);
if (columns_changed) {
log.info("Column definitions for {}.{} changed, invalidating related prepared statements", ks_name, cf_name);
remove_invalid_prepared_statements(ks_name, cf_name);
}
}
void query_processor::migration_subscriber::on_update_user_type(const sstring& ks_name, const sstring& type_name)


@@ -52,11 +52,6 @@ selectable::writetime_or_ttl::new_selector_factory(database& db, schema_ptr s, s
return writetime_or_ttl_selector::new_factory(def->name_as_text(), add_and_get_index(*def, defs), _is_writetime);
}
sstring
selectable::writetime_or_ttl::to_string() const {
return sprint("%s(%s)", _is_writetime ? "writetime" : "ttl", _id->to_string());
}
shared_ptr<selectable>
selectable::writetime_or_ttl::raw::prepare(schema_ptr s) {
return make_shared<writetime_or_ttl>(_id->prepare_column_identifier(s), _is_writetime);
@@ -83,11 +78,6 @@ selectable::with_function::new_selector_factory(database& db, schema_ptr s, std:
return abstract_function_selector::new_factory(std::move(fun), std::move(factories));
}
sstring
selectable::with_function::to_string() const {
return sprint("%s(%s)", _function_name.name, join(", ", _args));
}
shared_ptr<selectable>
selectable::with_function::raw::prepare(schema_ptr s) {
std::vector<shared_ptr<selectable>> prepared_args;
@@ -111,7 +101,7 @@ selectable::with_field_selection::new_selector_factory(database& db, schema_ptr
if (!ut) {
throw exceptions::invalid_request_exception(
sprint("Invalid field selection: %s of type %s is not a user type",
_selected->to_string(), factory->new_instance()->get_type()->as_cql3_type()));
"FIXME: selectable" /* FIXME: _selected */, ut->as_cql3_type()));
}
for (size_t i = 0; i < ut->size(); ++i) {
if (ut->field_name(i) != _field->bytes_) {
@@ -120,12 +110,7 @@ selectable::with_field_selection::new_selector_factory(database& db, schema_ptr
return field_selector::new_factory(std::move(ut), i, std::move(factory));
}
throw exceptions::invalid_request_exception(sprint("%s of type %s has no field %s",
_selected->to_string(), ut->as_cql3_type(), _field));
}
sstring
selectable::with_field_selection::to_string() const {
return sprint("%s.%s", _selected->to_string(), _field->to_string());
"FIXME: selectable" /* FIXME: _selected */, ut->as_cql3_type(), _field));
}
shared_ptr<selectable>
@@ -141,10 +126,6 @@ selectable::with_field_selection::raw::processes_selection() const {
return true;
}
std::ostream & operator<<(std::ostream &os, const selectable& s) {
return os << s.to_string();
}
}
}


@@ -55,7 +55,6 @@ class selectable {
public:
virtual ~selectable() {}
virtual ::shared_ptr<selector::factory> new_selector_factory(database& db, schema_ptr schema, std::vector<const column_definition*>& defs) = 0;
virtual sstring to_string() const = 0;
protected:
static size_t add_and_get_index(const column_definition& def, std::vector<const column_definition*>& defs) {
auto i = std::find(defs.begin(), defs.end(), &def);
@@ -85,8 +84,6 @@ public:
class with_field_selection;
};
std::ostream & operator<<(std::ostream &os, const selectable& s);
class selectable::with_function : public selectable {
functions::function_name _function_name;
std::vector<shared_ptr<selectable>> _args;
@@ -95,7 +92,17 @@ public:
: _function_name(std::move(fname)), _args(std::move(args)) {
}
virtual sstring to_string() const override;
#if 0
@Override
public String toString()
{
return new StrBuilder().append(functionName)
.append("(")
.appendWithSeparators(args, ", ")
.append(")")
.toString();
}
#endif
virtual shared_ptr<selector::factory> new_selector_factory(database& db, schema_ptr s, std::vector<const column_definition*>& defs) override;
class raw : public selectable::raw {


@@ -59,7 +59,13 @@ public:
: _selected(std::move(selected)), _field(std::move(field)) {
}
virtual sstring to_string() const override;
#if 0
@Override
public String toString()
{
return String.format("%s.%s", selected, field);
}
#endif
virtual shared_ptr<selector::factory> new_selector_factory(database& db, schema_ptr s, std::vector<const column_definition*>& defs) override;


@@ -58,7 +58,13 @@ public:
: _id(std::move(id)), _is_writetime(is_writetime) {
}
virtual sstring to_string() const override;
#if 0
@Override
public String toString()
{
return (isWritetime ? "writetime" : "ttl") + "(" + id + ")";
}
#endif
virtual shared_ptr<selector::factory> new_selector_factory(database& db, schema_ptr s, std::vector<const column_definition*>& defs) override;


@@ -169,21 +169,26 @@ public:
}
private:
future<std::vector<mutation>> get_mutations(distributed<service::storage_proxy>& storage, const query_options& options, bool local, api::timestamp_type now) {
// Do not process in parallel because operations like list append/prepend depend on execution order.
return do_with(std::vector<mutation>(), [this, &storage, &options, now, local] (auto&& result) {
return do_for_each(boost::make_counting_iterator<size_t>(0),
boost::make_counting_iterator<size_t>(_statements.size()),
[this, &storage, &options, now, local, &result] (size_t i) {
auto&& statement = _statements[i];
auto&& statement_options = options.for_statement(i);
auto timestamp = _attrs->get_timestamp(now, statement_options);
return statement->get_mutations(storage, statement_options, local, timestamp).then([&result] (auto&& more) {
std::move(more.begin(), more.end(), std::back_inserter(result));
});
}).then([&result] {
return std::move(result);
});
});
struct collector {
std::vector<mutation> _result;
std::vector<mutation> get() && { return std::move(_result); }
void operator()(std::vector<mutation> more) {
std::move(more.begin(), more.end(), std::back_inserter(_result));
}
};
auto get_mutations_for_statement = [this, &storage, &options, now, local] (size_t i) {
auto&& statement = _statements[i];
auto&& statement_options = options.for_statement(i);
auto timestamp = _attrs->get_timestamp(now, statement_options);
return statement->get_mutations(storage, statement_options, local, timestamp);
};
// FIXME: origin tries hard to merge mutations to same keyspace, for
// some reason.
return map_reduce(
boost::make_counting_iterator<size_t>(0),
boost::make_counting_iterator<size_t>(_statements.size()),
get_mutations_for_statement,
collector());
}
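The rewritten `get_mutations` above replaces the explicit `do_for_each` accumulation with `map_reduce` driven by a `collector` functor. A synchronous stand-in showing the same shape — plain C++ with no Seastar futures, and `mutation` as a placeholder type:

```cpp
#include <cassert>
#include <iterator>
#include <utility>
#include <vector>

// Mimics the shape of seastar::map_reduce(first, last, mapper, reducer):
// apply the mapper to each element, feed each result into the reducer,
// then extract the final value. (The real map_reduce works on futures.)
template <typename Iter, typename Mapper, typename Reducer>
auto map_reduce_sync(Iter first, Iter last, Mapper mapper, Reducer reducer) {
    for (; first != last; ++first) {
        reducer(mapper(*first));
    }
    return std::move(reducer).get();
}

using mutation = int;  // placeholder for the real mutation type

// Same shape as the collector in the diff: splice the mutations
// produced per statement into a single result vector.
struct collector {
    std::vector<mutation> _result;
    std::vector<mutation> get() && { return std::move(_result); }
    void operator()(std::vector<mutation> more) {
        std::move(more.begin(), more.end(), std::back_inserter(_result));
    }
};

std::vector<mutation> gather(size_t n) {
    // Stand-in for statement->get_mutations(...): two mutations per index.
    auto per_statement = [](size_t i) {
        return std::vector<mutation>{int(i), int(i * 10)};
    };
    std::vector<size_t> idx(n);
    for (size_t i = 0; i < n; ++i) idx[i] = i;
    return map_reduce_sync(idx.begin(), idx.end(), per_statement, collector{});
}
```

The rvalue-qualified `get() &&` lets the final result be moved out of the reducer without a copy once all statements have contributed.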
public:


@@ -45,9 +45,7 @@
#include <boost/algorithm/cxx11/all_of.hpp>
#include <boost/function_output_iterator.hpp>
#include <boost/range/algorithm/heap_algorithm.hpp>
#include <boost/range/algorithm/remove_if.hpp>
#include <boost/range/algorithm/find.hpp>
#include <boost/range/adaptor/map.hpp>
#include "frozen_mutation.hh"
#include "mutation_partition_applier.hh"
#include "core/do_with.hh"
@@ -84,31 +82,17 @@ public:
}
};
lw_shared_ptr<memtable_list>
column_family::make_memtable_list() {
auto seal = [this] { return seal_active_memtable(); };
auto get_schema = [this] { return schema(); };
return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.max_memtable_size, _config.dirty_memory_region_group);
}
lw_shared_ptr<memtable_list>
column_family::make_streaming_memtable_list() {
auto seal = [this] { return seal_active_streaming_memtable_delayed(); };
auto get_schema = [this] { return schema(); };
return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.max_streaming_memtable_size, _config.streaming_dirty_memory_region_group);
}
column_family::column_family(schema_ptr schema, config config, db::commitlog& cl, compaction_manager& compaction_manager)
: _schema(std::move(schema))
, _config(std::move(config))
, _memtables(make_memtable_list())
, _streaming_memtables(_config.enable_disk_writes ? make_streaming_memtable_list() : make_memtable_list())
, _memtables(make_lw_shared(memtable_list{}))
, _sstables(make_lw_shared<sstable_list>())
, _cache(_schema, sstables_as_mutation_source(), sstables_as_key_source(), global_cache_tracker())
, _commitlog(&cl)
, _compaction_manager(compaction_manager)
, _flush_queue(std::make_unique<memtable_flush_queue>())
{
add_memtable();
if (!_config.enable_disk_writes) {
dblog.warn("Writes disabled, column family not durable.");
}
@@ -117,14 +101,14 @@ column_family::column_family(schema_ptr schema, config config, db::commitlog& cl
column_family::column_family(schema_ptr schema, config config, no_commitlog cl, compaction_manager& compaction_manager)
: _schema(std::move(schema))
, _config(std::move(config))
, _memtables(make_memtable_list())
, _streaming_memtables(_config.enable_disk_writes ? make_streaming_memtable_list() : make_memtable_list())
, _memtables(make_lw_shared(memtable_list{}))
, _sstables(make_lw_shared<sstable_list>())
, _cache(_schema, sstables_as_mutation_source(), sstables_as_key_source(), global_cache_tracker())
, _commitlog(nullptr)
, _compaction_manager(compaction_manager)
, _flush_queue(std::make_unique<memtable_flush_queue>())
{
add_memtable();
if (!_config.enable_disk_writes) {
dblog.warn("Writes disabled, column family not durable.");
}
@@ -156,10 +140,7 @@ column_family::~column_family() {
logalloc::occupancy_stats column_family::occupancy() const {
logalloc::occupancy_stats res;
for (auto m : *_memtables) {
res += m->region().occupancy();
}
for (auto m : *_streaming_memtables) {
for (auto m : *_memtables.get()) {
res += m->region().occupancy();
}
return res;
@@ -502,9 +483,8 @@ future<sstables::entry_descriptor> column_family::probe_file(sstring sstdir, sst
}
}
auto sst = std::make_unique<sstables::sstable>(_schema->ks_name(), _schema->cf_name(), sstdir, comps.generation, comps.version, comps.format);
auto fut = sst->get_sstable_key_range(*_schema);
return std::move(fut).then([this, sst = std::move(sst), sstdir = std::move(sstdir), comps] (range<partition_key> r) mutable {
auto fut = sstable::get_sstable_key_range(*_schema, _schema->ks_name(), _schema->cf_name(), sstdir, comps.generation, comps.version, comps.format);
return std::move(fut).then([this, sstdir = std::move(sstdir), comps] (range<partition_key> r) {
// Checks whether or not sstable belongs to current shard.
if (!belongs_to_current_shard(*_schema, std::move(r))) {
dblog.debug("sstable {} not relevant for this shard, ignoring",
@@ -514,6 +494,7 @@ future<sstables::entry_descriptor> column_family::probe_file(sstring sstdir, sst
return make_ready_future<>();
}
auto sst = std::make_unique<sstables::sstable>(_schema->ks_name(), _schema->cf_name(), sstdir, comps.generation, comps.version, comps.format);
auto fut = sst->load();
return std::move(fut).then([this, sst = std::move(sst)] () mutable {
add_sstable(std::move(*sst));
@@ -552,6 +533,12 @@ void column_family::add_sstable(lw_shared_ptr<sstables::sstable> sstable) {
_sstables->emplace(generation, std::move(sstable));
}
void column_family::add_memtable() {
// allow in-progress reads to continue using old list
_memtables = make_lw_shared(memtable_list(*_memtables));
_memtables->emplace_back(make_lw_shared<memtable>(_schema, _config.dirty_memory_region_group));
}
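`add_memtable` above publishes a fresh copy of the memtable list instead of mutating it in place, so in-progress reads holding the old `lw_shared_ptr` keep a stable view. A sketch of that copy-on-write idiom using `std::shared_ptr` — all names here are illustrative, with a string standing in for a memtable:

```cpp
#include <cassert>
#include <memory>
#include <string>
#include <utility>
#include <vector>

using memtable = std::string;
using memtable_list = std::vector<std::shared_ptr<memtable>>;

struct column_family_sketch {
    std::shared_ptr<memtable_list> _memtables =
        std::make_shared<memtable_list>();

    // Readers grab a snapshot; it stays valid even after the list is replaced.
    std::shared_ptr<const memtable_list> snapshot() const { return _memtables; }

    void add_memtable() {
        // Copy the old list, extend the copy, then publish it. Readers that
        // already took a snapshot continue to use the old list untouched.
        auto next = std::make_shared<memtable_list>(*_memtables);
        next->push_back(std::make_shared<memtable>("mt"));
        _memtables = std::move(next);
    }
};

// Demonstrates: a snapshot taken before add_memtable() is unaffected by it.
std::pair<size_t, size_t> snapshot_vs_current() {
    column_family_sketch cf;
    auto snap = cf.snapshot();
    cf.add_memtable();
    return {snap->size(), cf.snapshot()->size()};
}
```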
future<>
column_family::update_cache(memtable& m, lw_shared_ptr<sstable_list> old_sstables) {
if (_config.enable_cache) {
@@ -563,97 +550,6 @@ column_family::update_cache(memtable& m, lw_shared_ptr<sstable_list> old_sstable
}
}
// FIXME: because we are coalescing, it could be that mutations belonging to the same
// range end up in two different tables. Technically, we should wait for both. However,
// the only way we have to make this happen now is to wait on all previous writes. This
is certainly overkill, so we won't do it. We can fix this longer term by looking
// at the PREPARE messages, and then noting what is the minimum future we should be
// waiting for.
future<>
column_family::seal_active_streaming_memtable_delayed() {
auto old = _streaming_memtables->back();
if (old->empty()) {
return make_ready_future<>();
}
if (_streaming_memtables->should_flush()) {
return seal_active_streaming_memtable();
}
if (!_delayed_streaming_flush.armed()) {
// We don't want to wait for too long, because the incoming mutations will not be available
// until we flush them to SSTables. On top of that, if the sender ran out of messages, it won't
// send more until we respond to some - which depends on these futures resolving. Sure enough,
// the real fix for that second one is to have better communication between sender and receiver,
// but that's not realistic ATM. If we did have better negotiation here, we would not need a timer
// at all.
_delayed_streaming_flush.arm(2s);
}
return with_gate(_streaming_flush_gate, [this, old] {
return _waiting_streaming_flushes.get_shared_future();
});
}
future<>
column_family::seal_active_streaming_memtable() {
auto old = _streaming_memtables->back();
if (old->empty()) {
return make_ready_future<>();
}
_streaming_memtables->add_memtable();
_streaming_memtables->erase(old);
return with_gate(_streaming_flush_gate, [this, old] {
_delayed_streaming_flush.cancel();
auto current_waiters = std::exchange(_waiting_streaming_flushes, shared_promise<>());
auto f = current_waiters.get_shared_future(); // for this seal
with_lock(_sstables_lock.for_read(), [this, old] {
auto newtab = make_lw_shared<sstables::sstable>(_schema->ks_name(), _schema->cf_name(),
_config.datadir, calculate_generation_for_new_table(),
sstables::sstable::version_types::ka,
sstables::sstable::format_types::big);
newtab->set_unshared();
auto&& priority = service::get_local_streaming_write_priority();
// This is somewhat similar to the main memtable flush, but with important differences.
//
// The first difference is that we don't keep aggregate collectd statistics about this one.
// If we ever need to, we'll keep separate statistics for it, but we don't want to pollute the
// main stats about memtables with streaming memtables.
//
// Second, we will not bother touching the cache after this flush. The current streaming code
// will invalidate the ranges it touches, so we won't do it twice. Even when that changes, the
// cache management code in here will have to differ from the main memtable's one. Please see
// the comment at flush_streaming_mutations() for details.
//
// Lastly, we don't have any commitlog RP to update, and we don't need to manipulate the
// memtable list, since this memtable was not available for reading up until this point.
return newtab->write_components(*old, incremental_backups_enabled(), priority).then([this, newtab, old] {
return newtab->open_data();
}).then([this, old, newtab] () {
add_sstable(newtab);
trigger_compaction();
}).handle_exception([] (auto ep) {
dblog.error("failed to write streamed sstable: {}", ep);
return make_exception_future<>(ep);
});
// We will also not have any retry logic. If we fail here, we'll fail the streaming and let
// the upper layers know. They can then apply any logic they want here.
}).then_wrapped([this, current_waiters = std::move(current_waiters)] (future <> f) mutable {
if (f.failed()) {
current_waiters.set_exception(f.get_exception());
} else {
current_waiters.set_value();
}
});
return f;
});
}
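`seal_active_streaming_memtable` above coalesces all concurrent waiters on one `shared_promise`, and `std::exchange` installs a fresh promise so later waiters attach to the *next* flush cycle. A sketch of that hand-off using a plain shared state object in place of Seastar's `shared_promise` (illustrative names; a `bool` stands in for the resolved/unresolved state):

```cpp
#include <cassert>
#include <memory>
#include <utility>

// Waiters for one in-flight flush share a single state object.
struct shared_flush_state {
    std::shared_ptr<bool> done = std::make_shared<bool>(false);
};

struct streaming_flusher {
    shared_flush_state waiting;

    // A caller that must wait gets a handle on the current cycle's state.
    std::shared_ptr<const bool> wait() { return waiting.done; }

    // Begin a flush: take ownership of the current waiters and reset for
    // the next cycle (mirrors std::exchange(_waiting_streaming_flushes, ...)).
    shared_flush_state begin_cycle() {
        return std::exchange(waiting, shared_flush_state{});
    }
};

bool waiter_sees_completion() {
    streaming_flusher f;
    auto handle = f.wait();        // waiter joins the current cycle
    auto cycle = f.begin_cycle();  // flush starts and owns those waiters
    *cycle.done = true;            // flush finished: resolve them
    return *handle;                // the early waiter observes completion
}
```

Because the flush takes over the old state before resolving it, a success or failure is delivered to exactly the waiters of that cycle and no others.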
future<>
column_family::seal_active_memtable() {
auto old = _memtables->back();
@@ -667,7 +563,7 @@ column_family::seal_active_memtable() {
dblog.debug("Memtable is empty");
return make_ready_future<>();
}
_memtables->add_memtable();
add_memtable();
assert(_highest_flushed_rp < old->replay_position()
|| (_highest_flushed_rp == db::replay_position() && old->replay_position() == db::replay_position())
@@ -741,7 +637,7 @@ column_family::try_flush_memtable_to_sstable(lw_shared_ptr<memtable> old) {
dblog.error("failed to move memtable to cache: {}", std::current_exception());
}
_memtables->erase(old);
_memtables->erase(boost::range::find(*_memtables, old));
dblog.debug("Memtable replaced");
return make_ready_future<stop_iteration>(stop_iteration::yes);
@@ -764,39 +660,28 @@ column_family::start() {
future<>
column_family::stop() {
seal_active_memtable();
seal_active_streaming_memtable();
return _compaction_manager.remove(this).then([this] {
// Nest, instead of using when_all, so we don't lose any exceptions.
return _flush_queue->close().then([this] {
return _streaming_flush_gate.close();
});
}).then([this] {
return _sstable_deletion_gate.close();
return _flush_queue->close();
});
}
future<std::vector<sstables::entry_descriptor>>
column_family::reshuffle_sstables(std::set<int64_t> all_generations, int64_t start) {
column_family::reshuffle_sstables(int64_t start) {
struct work {
int64_t current_gen;
std::set<int64_t> all_generations; // Stores generation of all live sstables in the system.
sstable_list sstables;
std::unordered_map<int64_t, sstables::entry_descriptor> descriptors;
std::vector<sstables::entry_descriptor> reshuffled;
work(int64_t start, std::set<int64_t> gens)
: current_gen(start ? start : 1)
, all_generations(gens) {}
work(int64_t start) : current_gen(start ? start : 1) {}
};
return do_with(work(start, std::move(all_generations)), [this] (work& work) {
return do_with(work(start), [this] (work& work) {
return lister::scan_dir(_config.datadir, { directory_entry_type::regular }, [this, &work] (directory_entry de) {
auto comps = sstables::entry_descriptor::make_descriptor(de.name);
if (comps.component != sstables::sstable::component_type::TOC) {
return make_ready_future<>();
}
// Skip generations that were already loaded by Scylla at a previous stage.
if (work.all_generations.count(comps.generation) != 0) {
} else if (comps.generation < work.current_gen) {
return make_ready_future<>();
}
auto sst = make_lw_shared<sstables::sstable>(_schema->ks_name(), _schema->cf_name(),
@@ -834,21 +719,6 @@ column_family::reshuffle_sstables(std::set<int64_t> all_generations, int64_t sta
});
}
void column_family::rebuild_statistics() {
// zeroing live_disk_space_used and live_sstable_count because the
// sstable list was re-created
_stats.live_disk_space_used = 0;
_stats.live_sstable_count = 0;
for (auto&& tab : boost::range::join(_sstables_compacted_but_not_deleted,
// this might seem dangerous, but "move" here just avoids constness,
// making the two ranges compatible when compiling with boost 1.55.
// No one is actually moving anything...
std::move(*_sstables) | boost::adaptors::map_values)) {
update_stats_for_new_sstable(tab->data_size());
}
}
void
column_family::rebuild_sstable_list(const std::vector<sstables::shared_sstable>& new_sstables,
const std::vector<sstables::shared_sstable>& sstables_to_remove) {
@@ -857,53 +727,37 @@ column_family::rebuild_sstable_list(const std::vector<sstables::shared_sstable>&
// later), and we add the new tables generated by the compaction.
// We create a new list rather than modifying it in-place, so that
// on-going reads can continue to use the old list.
//
// We only remove old sstables after they are successfully deleted,
// to avoid a new compaction from ignoring data in the old sstables
// if the deletion fails (note deletion of shared sstables can take
// unbounded time, because all shards must agree on the deletion).
auto current_sstables = _sstables;
auto new_sstable_list = make_lw_shared<sstable_list>();
auto new_compacted_but_not_deleted = _sstables_compacted_but_not_deleted;
// zeroing live_disk_space_used and live_sstable_count because the
// sstable list is re-created below.
_stats.live_disk_space_used = 0;
_stats.live_sstable_count = 0;
std::unordered_set<sstables::shared_sstable> s(
sstables_to_remove.begin(), sstables_to_remove.end());
// First, add the new sstables.
// this might seem dangerous, but "move" here just avoids constness,
// making the two ranges compatible when compiling with boost 1.55.
// No one is actually moving anything...
for (auto&& tab : boost::range::join(new_sstables, std::move(*current_sstables) | boost::adaptors::map_values)) {
for (const auto& oldtab : *current_sstables) {
// Checks if oldtab is a sstable not being compacted.
if (!s.count(tab)) {
new_sstable_list->emplace(tab->generation(), tab);
} else {
new_compacted_but_not_deleted.push_back(tab);
if (!s.count(oldtab.second)) {
update_stats_for_new_sstable(oldtab.second->data_size());
new_sstable_list->emplace(oldtab.first, oldtab.second);
}
}
for (const auto& newtab : new_sstables) {
// FIXME: rename the new sstable(s). Verify a rename doesn't cause
// problems for the sstable object.
update_stats_for_new_sstable(newtab->data_size());
new_sstable_list->emplace(newtab->generation(), newtab);
}
for (const auto& oldtab : sstables_to_remove) {
oldtab->mark_for_deletion();
}
_sstables = std::move(new_sstable_list);
_sstables_compacted_but_not_deleted = std::move(new_compacted_but_not_deleted);
rebuild_statistics();
// Second, delete the old sstables. This is done in the background, so we can
// consider this compaction completed.
seastar::with_gate(_sstable_deletion_gate, [this, sstables_to_remove] {
return sstables::delete_atomically(sstables_to_remove).then([this, sstables_to_remove] {
auto current_sstables = _sstables;
auto new_sstable_list = make_lw_shared<sstable_list>();
std::unordered_set<sstables::shared_sstable> s(
sstables_to_remove.begin(), sstables_to_remove.end());
auto e = boost::range::remove_if(_sstables_compacted_but_not_deleted, [&] (sstables::shared_sstable sst) -> bool {
return s.count(sst);
});
_sstables_compacted_but_not_deleted.erase(e, _sstables_compacted_but_not_deleted.end());
rebuild_statistics();
});
});
}
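The new `rebuild_sstable_list` flow above retires compaction inputs in two phases: they first move to `_sstables_compacted_but_not_deleted`, and only drop out for good once `delete_atomically()` succeeds, so tombstone GC keeps seeing them in the meantime. A sketch of that bookkeeping with bare generation numbers — no I/O, names illustrative:

```cpp
#include <algorithm>
#include <cassert>
#include <set>
#include <vector>

struct sstable_set_sketch {
    std::set<int64_t> live;                   // sstables serving queries
    std::vector<int64_t> compacted_undeleted; // compacted, deletion pending

    // Phase 1 (synchronous): publish the new set; compaction inputs move
    // to the compacted-but-not-deleted list instead of vanishing.
    void finish_compaction(const std::vector<int64_t>& outputs,
                           const std::vector<int64_t>& inputs) {
        for (auto gen : outputs) live.insert(gen);
        for (auto gen : inputs) {
            live.erase(gen);
            compacted_undeleted.push_back(gen);
        }
    }

    // Phase 2 (after the background deletion resolves): drop them for good.
    void deletion_done(const std::vector<int64_t>& inputs) {
        for (auto gen : inputs) {
            compacted_undeleted.erase(
                std::remove(compacted_undeleted.begin(),
                            compacted_undeleted.end(), gen),
                compacted_undeleted.end());
        }
    }

    // What compaction must consider for tombstone GC: live plus undeleted.
    std::set<int64_t> including_compacted_undeleted() const {
        auto ret = live;
        ret.insert(compacted_undeleted.begin(), compacted_undeleted.end());
        return ret;
    }
};

size_t gc_visible_after_compaction() {
    sstable_set_sketch s;
    s.live = {1, 2};
    s.finish_compaction({3}, {1, 2});               // 1,2 compacted into 3
    return s.including_compacted_undeleted().size(); // {1, 2, 3}
}

size_t gc_visible_after_deletion() {
    sstable_set_sketch s;
    s.live = {1, 2};
    s.finish_compaction({3}, {1, 2});
    s.deletion_done({1, 2});                         // deletion succeeded
    return s.including_compacted_undeleted().size(); // {3}
}
```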
future<>
@@ -927,7 +781,7 @@ column_family::compact_sstables(sstables::compaction_descriptor descriptor, bool
};
return sstables::compact_sstables(*sstables_to_compact, *this, create_sstable, descriptor.max_sstable_bytes, descriptor.level,
cleanup).then([this, sstables_to_compact] (auto new_sstables) {
return this->rebuild_sstable_list(new_sstables, *sstables_to_compact);
this->rebuild_sstable_list(new_sstables, *sstables_to_compact);
});
});
}
@@ -1058,24 +912,6 @@ lw_shared_ptr<sstable_list> column_family::get_sstables() {
return _sstables;
}
// Gets the list of all sstables in the column family, including ones that are
// not used for active queries because they have already been compacted, but are
// waiting for delete_atomically() to return.
//
// As long as we haven't deleted them, compaction needs to ensure it doesn't
// garbage-collect a tombstone that covers data in an sstable that may not be
// successfully deleted.
lw_shared_ptr<sstable_list> column_family::get_sstables_including_compacted_undeleted() {
if (_sstables_compacted_but_not_deleted.empty()) {
return _sstables;
}
auto ret = make_lw_shared(*_sstables);
for (auto&& s : _sstables_compacted_but_not_deleted) {
ret->insert(std::make_pair(s->generation(), s));
}
return ret;
}
inline bool column_family::manifest_json_filter(const sstring& fname) {
using namespace boost::filesystem;
@@ -1191,24 +1027,13 @@ database::database() : database(db::config())
{}
database::database(const db::config& cfg)
: _streaming_dirty_memory_region_group(&_dirty_memory_region_group)
, _cfg(std::make_unique<db::config>(cfg))
, _memtable_total_space([this] {
auto memtable_total_space = size_t(_cfg->memtable_total_space_in_mb()) << 20;
if (!memtable_total_space) {
return memory::stats().total_memory() / 2;
}
return memtable_total_space;
}())
, _streaming_memtable_total_space(_memtable_total_space / 4)
: _cfg(std::make_unique<db::config>(cfg))
, _version(empty_version)
, _enable_incremental_backups(cfg.incremental_backups())
, _memtables_throttler(_memtable_total_space, _dirty_memory_region_group)
, _streaming_throttler(_streaming_memtable_total_space,
_streaming_dirty_memory_region_group,
&_memtables_throttler
)
{
_memtable_total_space = size_t(_cfg->memtable_total_space_in_mb()) << 20;
if (!_memtable_total_space) {
_memtable_total_space = memory::stats().total_memory() / 2;
}
// Start compaction manager with two tasks for handling compaction jobs.
_compaction_manager.start(2);
setup_collectd();
@@ -1356,13 +1181,12 @@ future<> database::parse_system_tables(distributed<service::storage_proxy>& prox
}).then([&proxy, this] {
return do_parse_system_tables(proxy, db::schema_tables::COLUMNFAMILIES, [this, &proxy] (schema_result_value_type &v) {
return create_tables_from_tables_partition(proxy, v.second).then([this] (std::map<sstring, schema_ptr> tables) {
return parallel_for_each(tables.begin(), tables.end(), [this] (auto& t) {
for (auto& t: tables) {
auto s = t.second;
auto& ks = this->find_keyspace(s->ks_name());
auto cfg = ks.make_column_family_config(*s);
this->add_column_family(s, std::move(cfg));
return ks.make_directory_for_column_family(s->cf_name(), s->id()).then([s] {});
});
this->add_column_family(std::move(s), std::move(cfg));
}
});
});
});
@@ -1557,10 +1381,6 @@ const column_family& database::find_column_family(const utils::UUID& uuid) const
}
}
bool database::column_family_exists(const utils::UUID& uuid) const {
return _column_families.count(uuid);
}
void
keyspace::create_replication_strategy(const std::map<sstring, sstring>& options) {
using namespace locator;
@@ -1597,9 +1417,7 @@ keyspace::make_column_family_config(const schema& s) const {
cfg.enable_commitlog = _config.enable_commitlog;
cfg.enable_cache = _config.enable_cache;
cfg.max_memtable_size = _config.max_memtable_size;
cfg.max_streaming_memtable_size = _config.max_streaming_memtable_size;
cfg.dirty_memory_region_group = _config.dirty_memory_region_group;
cfg.streaming_dirty_memory_region_group = _config.streaming_dirty_memory_region_group;
cfg.cf_stats = _config.cf_stats;
cfg.enable_incremental_backups = _config.enable_incremental_backups;
@@ -1744,11 +1562,10 @@ compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
struct query_state {
explicit query_state(schema_ptr s,
const query::read_command& cmd,
query::result_request request,
const std::vector<query::partition_range>& ranges)
: schema(std::move(s))
, cmd(cmd)
, builder(cmd.slice, request)
, builder(cmd.slice)
, limit(cmd.row_limit)
, current_partition_range(ranges.begin())
, range_end(ranges.end()){
@@ -1767,20 +1584,29 @@ struct query_state {
};
future<lw_shared_ptr<query::result>>
column_family::query(schema_ptr s, const query::read_command& cmd, query::result_request request, const std::vector<query::partition_range>& partition_ranges) {
column_family::query(schema_ptr s, const query::read_command& cmd, const std::vector<query::partition_range>& partition_ranges) {
utils::latency_counter lc;
_stats.reads.set_latency(lc);
auto qs_ptr = std::make_unique<query_state>(std::move(s), cmd, request, partition_ranges);
auto qs_ptr = std::make_unique<query_state>(std::move(s), cmd, partition_ranges);
auto& qs = *qs_ptr;
{
return do_until(std::bind(&query_state::done, &qs), [this, &qs] {
auto&& range = *qs.current_partition_range++;
auto add_partition = [&qs] (uint32_t live_rows, mutation&& m) {
auto pb = qs.builder.add_partition(*qs.schema, m.key());
m.partition().query_compacted(pb, *qs.schema, live_rows);
};
return do_with(querying_reader(qs.schema, as_mutation_source(), range, qs.cmd.slice, qs.limit, qs.cmd.timestamp, add_partition),
[] (auto&& rd) { return rd.read(); });
qs.reader = make_reader(qs.schema, range, service::get_local_sstable_query_read_priority());
qs.range_empty = false;
return do_until([&qs] { return !qs.limit || qs.range_empty; }, [&qs] {
return qs.reader().then([&qs](mutation_opt mo) {
if (mo) {
auto p_builder = qs.builder.add_partition(*mo->schema(), mo->key());
auto is_distinct = qs.cmd.slice.options.contains(query::partition_slice::option::distinct);
auto limit = !is_distinct ? qs.limit : 1;
auto rows_added = mo->partition().query(p_builder, *qs.schema, qs.cmd.timestamp, limit);
qs.limit -= rows_added;
} else {
qs.range_empty = true;
}
});
});
}).then([qs_ptr = std::move(qs_ptr), &qs] {
return make_ready_future<lw_shared_ptr<query::result>>(
make_lw_shared<query::result>(qs.builder.build()));
@@ -1801,9 +1627,9 @@ column_family::as_mutation_source() const {
}
future<lw_shared_ptr<query::result>>
database::query(schema_ptr s, const query::read_command& cmd, query::result_request request, const std::vector<query::partition_range>& ranges) {
database::query(schema_ptr s, const query::read_command& cmd, const std::vector<query::partition_range>& ranges) {
column_family& cf = find_column_family(cmd.cf_id);
return cf.query(std::move(s), cmd, request, ranges);
return cf.query(std::move(s), cmd, ranges);
}
future<reconcilable_result>
@@ -1878,8 +1704,8 @@ void
column_family::apply(const mutation& m, const db::replay_position& rp) {
utils::latency_counter lc;
_stats.writes.set_latency(lc);
_memtables->active_memtable().apply(m, rp);
_memtables->seal_on_overflow();
active_memtable().apply(m, rp);
seal_on_overflow();
_stats.writes.mark(lc);
if (lc.is_start()) {
_stats.estimated_write.add(lc.latency(), _stats.writes.count);
@@ -1891,17 +1717,21 @@ column_family::apply(const frozen_mutation& m, const schema_ptr& m_schema, const
utils::latency_counter lc;
_stats.writes.set_latency(lc);
check_valid_rp(rp);
_memtables->active_memtable().apply(m, m_schema, rp);
_memtables->seal_on_overflow();
active_memtable().apply(m, m_schema, rp);
seal_on_overflow();
_stats.writes.mark(lc);
if (lc.is_start()) {
_stats.estimated_write.add(lc.latency(), _stats.writes.count);
}
}
void column_family::apply_streaming_mutation(schema_ptr m_schema, const frozen_mutation& m) {
_streaming_memtables->active_memtable().apply(m, m_schema);
_streaming_memtables->seal_on_overflow();
void
column_family::seal_on_overflow() {
if (active_memtable().occupancy().total_space() >= _config.max_memtable_size) {
// FIXME: if sparse, do some in-memory compaction first
// FIXME: maybe merge with other in-memory memtables
seal_active_memtable();
}
}
void
@@ -1950,8 +1780,9 @@ future<> database::do_apply(schema_ptr s, const frozen_mutation& m) {
return apply_in_memory(m, s, db::replay_position());
}
future<> throttle_state::throttle() {
if (!should_throttle() && _throttled_requests.empty()) {
future<> database::throttle() {
if (_dirty_memory_region_group.memory_used() < _memtable_total_space
&& _throttled_requests.empty()) {
// All is well, go ahead
return make_ready_future<>();
}
@@ -1963,13 +1794,13 @@ future<> throttle_state::throttle() {
return _throttled_requests.back().get_future();
}
void throttle_state::unthrottle() {
void database::unthrottle() {
// Release one request per free 1MB we have
// FIXME: improve this
if (should_throttle()) {
if (_dirty_memory_region_group.memory_used() >= _memtable_total_space) {
return;
}
size_t avail = std::max((_max_space - _region_group.memory_used()) >> 20, size_t(1));
size_t avail = (_memtable_total_space - _dirty_memory_region_group.memory_used()) >> 20;
avail = std::min(_throttled_requests.size(), avail);
for (size_t i = 0; i < avail; ++i) {
_throttled_requests.front().set_value();
@@ -1984,39 +1815,11 @@ future<> database::apply(schema_ptr s, const frozen_mutation& m) {
if (dblog.is_enabled(logging::log_level::trace)) {
dblog.trace("apply {}", m.pretty_printer(s));
}
return _memtables_throttler.throttle().then([this, &m, s = std::move(s)] {
return throttle().then([this, &m, s = std::move(s)] {
return do_apply(std::move(s), m);
});
}
future<> database::apply_streaming_mutation(schema_ptr s, const frozen_mutation& m) {
if (!s->is_synced()) {
throw std::runtime_error(sprint("attempted to mutate using not synced schema of %s.%s, version=%s",
s->ks_name(), s->cf_name(), s->version()));
}
// TODO (maybe): This will use the same memory region group as memtables, so when
// one of them throttles, both will.
//
// It would be possible to provide further QoS for CQL originated memtables
// by keeping the streaming memtables into a different region group, with its own
// separate limit.
//
// Because, however, there are many other limits in play that may kick in,
// I am not convinced that this will ever be a problem.
//
// If we do find ourselves in the situation that we are throttling incoming
// writes due to high level of streaming writes, and we are sure that this
// is the best solution, we can just change the memtable creation method so
// that each kind of memtable creates from a different region group - and then
// update the throttle conditions accordingly.
return _streaming_throttler.throttle().then([this, &m, s = std::move(s)] {
auto uuid = m.column_family_id();
auto& cf = find_column_family(uuid);
cf.apply_streaming_mutation(s, std::move(m));
});
}
keyspace::config
database::make_keyspace_config(const keyspace_metadata& ksm) {
// FIXME support multiple directories
@@ -2028,10 +1831,6 @@ database::make_keyspace_config(const keyspace_metadata& ksm) {
cfg.enable_commitlog = ksm.durable_writes() && _cfg->enable_commitlog() && !_cfg->enable_in_memory_data_store();
cfg.enable_cache = _cfg->enable_cache();
cfg.max_memtable_size = _memtable_total_space * _cfg->memtable_cleanup_threshold();
// We should guarantee that at least two memtables are available; otherwise, after a flush, adding another memtable would
// easily take us into throttling until the first one is flushed.
cfg.max_streaming_memtable_size = std::min(cfg.max_memtable_size, _streaming_memtable_total_space / 2);
} else {
cfg.datadir = "";
cfg.enable_disk_writes = false;
@@ -2039,13 +1838,10 @@ database::make_keyspace_config(const keyspace_metadata& ksm) {
cfg.enable_commitlog = false;
cfg.enable_cache = false;
cfg.max_memtable_size = std::numeric_limits<size_t>::max();
// All writes should go to the main memtable list if we're not durable
cfg.max_streaming_memtable_size = 0;
}
cfg.dirty_memory_region_group = &_dirty_memory_region_group;
cfg.streaming_dirty_memory_region_group = &_streaming_dirty_memory_region_group;
cfg.cf_stats = &_cf_stats;
cfg.enable_incremental_backups = _enable_incremental_backups;
cfg.enable_incremental_backups = _cfg->incremental_backups();
return cfg;
}
@@ -2496,36 +2292,10 @@ future<> column_family::flush(const db::replay_position& pos) {
return seal_active_memtable();
}
// FIXME: We can do much better than this in terms of cache management. Right
// now, we only have to flush the touched ranges because of the possibility of
// streaming containing token ownership changes.
//
// Right now we can't differentiate between that and a normal repair process,
// so we always flush. When we can differentiate those streams, we should not
// be indiscriminately touching the cache during repair. We will just have to
// invalidate the entries that are relevant to things we already have in the cache.
future<> column_family::flush_streaming_mutations(std::vector<query::partition_range> ranges) {
// This will effectively take the gate twice for this call. The proper way to fix that would
// be to change seal_active_streaming_memtable_delayed to take a range parameter. However, we
// need this code to go away as soon as we can (see FIXME above). So the double gate is a better
// temporary countermeasure.
return with_gate(_streaming_flush_gate, [this, ranges = std::move(ranges)] {
return seal_active_streaming_memtable_delayed().finally([this, ranges = std::move(ranges)] {
if (_config.enable_cache) {
for (auto& range : ranges) {
_cache.invalidate(range);
}
}
});
});
}
void column_family::clear() {
_cache.clear();
_memtables->clear();
_memtables->add_memtable();
_streaming_memtables->clear();
_streaming_memtables->add_memtable();
add_memtable();
}
// NOTE: does not need to be futurized, but might eventually, depending on
@@ -2539,26 +2309,21 @@ future<db::replay_position> column_family::discard_sstables(db_clock::time_point
auto gc_trunc = to_gc_clock(truncated_at);
auto pruned = make_lw_shared<sstable_list>();
std::vector<sstables::shared_sstable> remove;
for (auto&p : *_sstables) {
if (p.second->max_data_age() <= gc_trunc) {
rp = std::max(p.second->get_stats_metadata().position, rp);
remove.emplace_back(p.second);
p.second->mark_for_deletion();
continue;
}
pruned->emplace(p.first, p.second);
}
_sstables = std::move(pruned);
dblog.debug("cleaning out row cache");
_cache.clear();
return parallel_for_each(remove, [](sstables::shared_sstable s) {
return sstables::delete_atomically({s});
}).then([rp] {
return make_ready_future<db::replay_position>(rp);
}).finally([remove] {}); // keep the objects alive until here.
return make_ready_future<db::replay_position>(rp);
});
}
@@ -2604,10 +2369,6 @@ void column_family::set_schema(schema_ptr s) {
m->set_schema(s);
}
for (auto& m : *_streaming_memtables) {
m->set_schema(s);
}
_cache.set_schema(s);
_schema = std::move(s);
}



@@ -41,7 +41,6 @@
#include <set>
#include <iostream>
#include <boost/functional/hash.hpp>
#include <boost/range/algorithm/find.hpp>
#include <experimental/optional>
#include <string.h>
#include "types.hh"
@@ -71,7 +70,6 @@
#include "sstables/compaction.hh"
#include "key_reader.hh"
#include <seastar/core/rwlock.hh>
#include <seastar/core/shared_future.hh>
class frozen_mutation;
class reconcilable_result;
@@ -98,132 +96,9 @@ void make(database& db, bool durable, bool volatile_testing_only);
}
}
class throttle_state {
size_t _max_space;
logalloc::region_group& _region_group;
throttle_state* _parent;
circular_buffer<promise<>> _throttled_requests;
timer<> _throttling_timer{[this] { unthrottle(); }};
void unthrottle();
bool should_throttle() const {
if (_region_group.memory_used() > _max_space) {
return true;
}
if (_parent) {
return _parent->should_throttle();
}
return false;
}
public:
throttle_state(size_t max_space, logalloc::region_group& region, throttle_state* parent = nullptr)
: _max_space(max_space)
, _region_group(region)
, _parent(parent)
{}
future<> throttle();
};
class replay_position_reordered_exception : public std::exception {};
// We could just add all memtables, regardless of types, to a single list, and
// then filter them out when we read them. Here's why I have chosen not to do
// it:
//
// First, some of the methods in which a memtable is involved (like seal)
// assume a commitlog, and take great care to update the replay
// position, flush the log, etc. We want to bypass those, and that has to
// be done either by sprinkling the seal code with conditionals, or having a
// separate method for each seal.
//
// Also, if we ever want to put some of the memtables in a separate allocator
// region group to provide for extra QoS, having the classes properly wrapped
// will make that trivial: just pass a version of new_memtable() that puts it
// in a different region, while the list approach would require a lot of
// conditionals as well.
//
// If we are going to have different methods, better have different instances
// of a common class.
class memtable_list {
using shared_memtable = lw_shared_ptr<memtable>;
std::vector<shared_memtable> _memtables;
std::function<future<> ()> _seal_fn;
std::function<schema_ptr()> _current_schema;
size_t _max_memtable_size;
logalloc::region_group* _dirty_memory_region_group;
public:
memtable_list(std::function<future<> ()> seal_fn, std::function<schema_ptr()> cs, size_t max_memtable_size, logalloc::region_group* region_group)
: _memtables({})
, _seal_fn(seal_fn)
, _current_schema(cs)
, _max_memtable_size(max_memtable_size)
, _dirty_memory_region_group(region_group) {
add_memtable();
}
shared_memtable back() {
return _memtables.back();
}
// The caller has to make sure the element exist before calling this.
void erase(const shared_memtable& element) {
_memtables.erase(boost::range::find(_memtables, element));
}
void clear() {
_memtables.clear();
}
size_t size() const {
return _memtables.size();
}
future<> seal_active_memtable() {
return _seal_fn();
}
auto begin() noexcept {
return _memtables.begin();
}
auto begin() const noexcept {
return _memtables.begin();
}
auto end() noexcept {
return _memtables.end();
}
auto end() const noexcept {
return _memtables.end();
}
memtable& active_memtable() {
return *_memtables.back();
}
void add_memtable() {
_memtables.emplace_back(new_memtable());
}
bool should_flush() {
return active_memtable().occupancy().total_space() >= _max_memtable_size;
}
void seal_on_overflow() {
if (should_flush()) {
// FIXME: if sparse, do some in-memory compaction first
// FIXME: maybe merge with other in-memory memtables
_seal_fn();
}
}
private:
lw_shared_ptr<memtable> new_memtable() {
return make_lw_shared<memtable>(_current_schema(), _dirty_memory_region_group);
}
};
using memtable_list = std::vector<lw_shared_ptr<memtable>>;
using sstable_list = sstables::sstable_list;
// The CF has a "stats" structure. But we don't want all fields here,
@@ -246,9 +121,7 @@ public:
bool enable_commitlog = true;
bool enable_incremental_backups = false;
size_t max_memtable_size = 5'000'000;
size_t max_streaming_memtable_size = 5'000'000;
logalloc::region_group* dirty_memory_region_group = nullptr;
logalloc::region_group* streaming_dirty_memory_region_group = nullptr;
::cf_stats* cf_stats = nullptr;
};
struct no_commitlog {};
@@ -280,37 +153,8 @@ private:
config _config;
stats _stats;
lw_shared_ptr<memtable_list> _memtables;
// In older incarnations, we simply committed the mutations to memtables.
// However, doing that makes it harder for us to provide QoS within the
// disk subsystem. Keeping them in separate memtables allows us to properly
// classify those streams into their own I/O class
//
// We could write those directly to disk, but we still want the mutations
// coming through the wire to go to a memtable staging area. This has two
// major advantages:
//
// first, it will allow us to properly order the partitions. They are
// hopefully sent in order but we can't really guarantee that without
// sacrificing sender-side parallelism.
//
// second, we will be able to coalesce writes from multiple plan_id's and
// even multiple senders, as well as automatically tapping into the dirty
// memory throttling mechanism, guaranteeing we will not overload the
// server.
lw_shared_ptr<memtable_list> _streaming_memtables;
lw_shared_ptr<memtable_list> make_memtable_list();
lw_shared_ptr<memtable_list> make_streaming_memtable_list();
// generation -> sstable. Ordered by key so we can easily get the most recent.
lw_shared_ptr<sstable_list> _sstables;
// sstables that have been compacted (so don't look up in query) but
// have not been deleted yet, so must not GC any tombstones in other sstables
// that may delete data in these sstables:
std::vector<sstables::shared_sstable> _sstables_compacted_but_not_deleted;
// Control background fibers waiting for sstables to be deleted
seastar::gate _sstable_deletion_gate;
// There are situations in which we need to stop writing sstables. Flushers will take
// the read lock, and the ones that wish to stop that process will take the write lock.
rwlock _sstables_lock;
@@ -327,20 +171,11 @@ private:
int _compaction_disabled = 0;
class memtable_flush_queue;
std::unique_ptr<memtable_flush_queue> _flush_queue;
// Because streaming mutations bypass the commitlog, there is
// no need for the complications of the flush queue. Besides, it
// is easier to just use a common gate than it is to modify the flush_queue
// to work both with and without a replay position.
//
// Last but not least, we seldom need to guarantee any ordering here: as long
// as all data is waited for, we're good.
seastar::gate _streaming_flush_gate;
private:
void update_stats_for_new_sstable(uint64_t disk_space_used_by_sstable);
void add_sstable(sstables::sstable&& sstable);
void add_sstable(lw_shared_ptr<sstables::sstable> sstable);
lw_shared_ptr<memtable> new_memtable();
lw_shared_ptr<memtable> new_streaming_memtable();
void add_memtable();
future<stop_iteration> try_flush_memtable_to_sstable(lw_shared_ptr<memtable> memt);
future<> update_cache(memtable&, lw_shared_ptr<sstable_list> old_sstables);
struct merge_comparator;
@@ -363,7 +198,6 @@ private:
// Rebuild existing _sstables with new_sstables added to it and sstables_to_remove removed from it.
void rebuild_sstable_list(const std::vector<sstables::shared_sstable>& new_sstables,
const std::vector<sstables::shared_sstable>& sstables_to_remove);
void rebuild_statistics();
private:
// Creates a mutation reader which covers sstables.
// Caller needs to ensure that column_family remains live (FIXME: relax this).
@@ -417,7 +251,7 @@ public:
// FIXME: in case a query is satisfied from a single memtable, avoid a copy
using const_mutation_partition_ptr = std::unique_ptr<const mutation_partition>;
using const_row_ptr = std::unique_ptr<const row>;
memtable& active_memtable() { return _memtables->active_memtable(); }
memtable& active_memtable() { return *_memtables->back(); }
const row_cache& get_row_cache() const {
return _cache;
}
@@ -442,11 +276,10 @@ public:
// The mutation is always upgraded to current schema.
void apply(const frozen_mutation& m, const schema_ptr& m_schema, const db::replay_position& = db::replay_position());
void apply(const mutation& m, const db::replay_position& = db::replay_position());
void apply_streaming_mutation(schema_ptr, const frozen_mutation&);
// Returns at most "cmd.limit" rows
future<lw_shared_ptr<query::result>> query(schema_ptr,
const query::read_command& cmd, query::result_request request,
const query::read_command& cmd,
const std::vector<query::partition_range>& ranges);
future<> populate(sstring datadir);
@@ -455,7 +288,6 @@ public:
future<> stop();
future<> flush();
future<> flush(const db::replay_position&);
future<> flush_streaming_mutations(std::vector<query::partition_range> ranges = std::vector<query::partition_range>{});
void clear(); // discards memtable(s) without flushing them to disk.
future<db::replay_position> discard_sstables(db_clock::time_point);
@@ -466,19 +298,14 @@ public:
future<int64_t> disable_sstable_write() {
_sstable_writes_disabled_at = std::chrono::steady_clock::now();
return _sstables_lock.write_lock().then([this] {
if (_sstables->empty()) {
return make_ready_future<int64_t>(0);
}
return make_ready_future<int64_t>((*_sstables->rbegin()).first);
return make_ready_future<int64_t>((*_sstables->end()).first);
});
}
// SSTable writes are now allowed again, and generation is updated to new_generation if != -1
// SSTable writes are now allowed again, and generation is updated to new_generation
// returns the amount of microseconds elapsed since we disabled writes.
std::chrono::steady_clock::duration enable_sstable_write(int64_t new_generation) {
if (new_generation != -1) {
update_sstables_known_generation(new_generation);
}
update_sstables_known_generation(new_generation);
_sstables_lock.write_unlock();
return std::chrono::steady_clock::now() - _sstable_writes_disabled_at;
}
@@ -492,11 +319,9 @@ public:
// very dangerous to do that with live SSTables. This is meant to be used with SSTables
// that are not yet managed by the system.
//
// Parameter all_generations stores the generation of all SSTables in the system, so it
// will be easy to determine which SSTable is new.
// An example usage would be to query all shards for the highest SSTable generation
// known to them, and then pass that + 1 as "start".
future<std::vector<sstables::entry_descriptor>> reshuffle_sstables(std::set<int64_t> all_generations, int64_t start);
future<std::vector<sstables::entry_descriptor>> reshuffle_sstables(int64_t start);
// FIXME: this is just an example, should be changed to something more
// general. compact_all_sstables() starts a compaction of all sstables.
@@ -530,7 +355,6 @@ public:
}
lw_shared_ptr<sstable_list> get_sstables();
lw_shared_ptr<sstable_list> get_sstables_including_compacted_undeleted();
size_t sstables_count();
int64_t get_unleveled_sstables() const;
@@ -582,31 +406,6 @@ private:
// synchronously flush data to disk.
future<> seal_active_memtable();
// I am assuming here that the repair process will potentially send ranges containing
// few mutations, definitely not enough to fill a memtable. It wants to know whether or
// not each of those ranges individually succeeded or failed, so we need a future for
// each.
//
// One of the ways to fix that, is changing the repair itself to send more mutations at
// a single batch. But relying on that is a bad idea for two reasons:
//
// First, the goals of the SSTable writer and the repair sender are at odds. The SSTable
// writer wants to write as few SSTables as possible, while the repair sender wants to
// break down the range in pieces as small as it can and checksum them individually, so
// it doesn't have to send a lot of mutations for no reason.
//
// Second, even if the repair process wants to process larger ranges at once, some ranges
// themselves may be small. So while most ranges would be large, we would still have
// potentially some fairly small SSTables lying around.
//
// The best course of action in this case is to coalesce the incoming streams write-side.
// repair can now choose whatever strategy - small or big ranges - it wants, rest assured
// that the incoming memtables will be coalesced together.
shared_promise<> _waiting_streaming_flushes;
timer<> _delayed_streaming_flush{[this] { seal_active_streaming_memtable(); }};
future<> seal_active_streaming_memtable();
future<> seal_active_streaming_memtable_delayed();
// filter manifest.json files out
static bool manifest_json_filter(const sstring& fname);
@@ -616,6 +415,7 @@ private:
template <typename Func>
future<bool> for_all_partitions(schema_ptr, Func&& func) const;
future<sstables::entry_descriptor> probe_file(sstring sstdir, sstring fname);
void seal_on_overflow();
void check_valid_rp(const db::replay_position&) const;
public:
// Iterate over all partitions. Protocol is the same as std::all_of(),
@@ -718,9 +518,7 @@ public:
bool enable_cache = true;
bool enable_incremental_backups = false;
size_t max_memtable_size = 5'000'000;
size_t max_streaming_memtable_size = 5'000'000;
logalloc::region_group* dirty_memory_region_group = nullptr;
logalloc::region_group* streaming_dirty_memory_region_group = nullptr;
::cf_stats* cf_stats = nullptr;
};
private:
@@ -782,19 +580,18 @@ public:
class database {
::cf_stats _cf_stats;
logalloc::region_group _dirty_memory_region_group;
logalloc::region_group _streaming_dirty_memory_region_group;
std::unordered_map<sstring, keyspace> _keyspaces;
std::unordered_map<utils::UUID, lw_shared_ptr<column_family>> _column_families;
std::unordered_map<std::pair<sstring, sstring>, utils::UUID, utils::tuple_hash> _ks_cf_to_uuid;
std::unique_ptr<db::commitlog> _commitlog;
std::unique_ptr<db::config> _cfg;
size_t _memtable_total_space = 500 << 20;
size_t _streaming_memtable_total_space = 500 << 20;
utils::UUID _version;
// compaction_manager object is referenced by all column families of a database.
compaction_manager _compaction_manager;
std::vector<scollectd::registration> _collectd;
bool _enable_incremental_backups = false;
timer<> _throttling_timer{[this] { unthrottle(); }};
circular_buffer<promise<>> _throttled_requests;
future<> init_commitlog();
future<> apply_in_memory(const frozen_mutation& m, const schema_ptr& m_schema, const db::replay_position&);
@@ -808,16 +605,12 @@ private:
void create_in_memory_keyspace(const lw_shared_ptr<keyspace_metadata>& ksm);
friend void db::system_keyspace::make(database& db, bool durable, bool volatile_testing_only);
void setup_collectd();
throttle_state _memtables_throttler;
throttle_state _streaming_throttler;
future<> throttle();
future<> do_apply(schema_ptr, const frozen_mutation&);
void unthrottle();
public:
static utils::UUID empty_version;
void set_enable_incremental_backups(bool val) { _enable_incremental_backups = val; }
future<> parse_system_tables(distributed<service::storage_proxy>&);
database();
database(const db::config&);
@@ -868,7 +661,6 @@ public:
const column_family& find_column_family(const utils::UUID&) const throw (no_such_column_family);
column_family& find_column_family(const schema_ptr&) throw (no_such_column_family);
const column_family& find_column_family(const schema_ptr&) const throw (no_such_column_family);
bool column_family_exists(const utils::UUID& uuid) const;
schema_ptr find_schema(const sstring& ks_name, const sstring& cf_name) const throw (no_such_column_family);
schema_ptr find_schema(const utils::UUID&) const throw (no_such_column_family);
bool has_schema(const sstring& ks_name, const sstring& cf_name) const;
@@ -877,10 +669,9 @@ public:
unsigned shard_of(const dht::token& t);
unsigned shard_of(const mutation& m);
unsigned shard_of(const frozen_mutation& m);
future<lw_shared_ptr<query::result>> query(schema_ptr, const query::read_command& cmd, query::result_request request, const std::vector<query::partition_range>& ranges);
future<lw_shared_ptr<query::result>> query(schema_ptr, const query::read_command& cmd, const std::vector<query::partition_range>& ranges);
future<reconcilable_result> query_mutations(schema_ptr, const query::read_command& cmd, const query::partition_range& range);
future<> apply(schema_ptr, const frozen_mutation&);
future<> apply_streaming_mutation(schema_ptr, const frozen_mutation&);
keyspace::config make_keyspace_config(const keyspace_metadata& ksm);
const sstring& get_snitch_name() const;
future<> clear_snapshot(sstring tag, std::vector<sstring> keyspace_names);


@@ -1043,9 +1043,7 @@ void db::commitlog::segment_manager::flush_segments(bool force) {
future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::allocate_segment(bool active) {
descriptor d(next_id());
file_open_options opt;
opt.extent_allocation_size_hint = max_size;
return open_file_dma(cfg.commit_log_location + "/" + d.filename(), open_flags::wo | open_flags::create, opt).then([this, d, active](file f) {
return open_file_dma(cfg.commit_log_location + "/" + d.filename(), open_flags::wo | open_flags::create).then([this, d, active](file f) {
// xfs doesn't like files extended beyond eof, so enlarge the file
return f.truncate(max_size).then([this, d, active, f] () mutable {
auto s = make_lw_shared<segment>(this->shared_from_this(), d, std::move(f), active);


@@ -487,7 +487,7 @@ public:
val(cas_contention_timeout_in_ms, uint32_t, 5000, Unused, \
"The time that the coordinator continues to retry a CAS (compare and set) operation that contends with other proposals for the same row." \
) \
val(truncate_request_timeout_in_ms, uint32_t, 10000, Used, \
val(truncate_request_timeout_in_ms, uint32_t, 10000, Unused, \
"The time that the coordinator waits for truncates (remove all data from a table) to complete. The long default value allows for a snapshot to be taken before removing the data. If auto_snapshot is disabled (not recommended), you can reduce this time." \
) \
val(write_request_timeout_in_ms, uint32_t, 2000, Used, \
@@ -556,7 +556,7 @@ public:
val(start_rpc, bool, false, Used, \
"Starts the Thrift RPC server" \
) \
val(rpc_keepalive, bool, true, Used, \
val(rpc_keepalive, bool, true, Unused, \
"Enable or disable keepalive on client connections (RPC or native)." \
) \
val(rpc_max_threads, uint32_t, 0, Invalid, \


@@ -241,7 +241,7 @@ is_sufficient_live_nodes(consistency_level cl,
if (rs.get_type() == replication_strategy_type::network_topology) {
for (auto& entry : count_per_dc_endpoints(ks, live_endpoints)) {
if (entry.second.live < local_quorum_for(ks, entry.first)) {
if (entry.second < local_quorum_for(ks, entry.first)) {
return false;
}
}


@@ -88,16 +88,10 @@ filter_for_query(consistency_level cl,
std::vector<gms::inet_address> filter_for_query(consistency_level cl, keyspace& ks, std::vector<gms::inet_address>& live_endpoints);
struct dc_node_count {
size_t live = 0;
size_t pending = 0;
};
template <typename Range, typename PendingRange = std::array<gms::inet_address, 0>>
inline std::unordered_map<sstring, dc_node_count> count_per_dc_endpoints(
template <typename Range>
inline std::unordered_map<sstring, size_t> count_per_dc_endpoints(
keyspace& ks,
Range& live_endpoints,
const PendingRange& pending_endpoints = std::array<gms::inet_address, 0>()) {
Range& live_endpoints) {
using namespace locator;
auto& rs = ks.get_replication_strategy();
@@ -106,9 +100,9 @@ inline std::unordered_map<sstring, dc_node_count> count_per_dc_endpoints(
network_topology_strategy* nrs =
static_cast<network_topology_strategy*>(&rs);
std::unordered_map<sstring, dc_node_count> dc_endpoints;
std::unordered_map<sstring, size_t> dc_endpoints;
for (auto& dc : nrs->get_datacenters()) {
dc_endpoints.emplace(dc, dc_node_count());
dc_endpoints.emplace(dc, 0);
}
//
@@ -117,11 +111,7 @@ inline std::unordered_map<sstring, dc_node_count> count_per_dc_endpoints(
// nrs->get_datacenters().
//
for (auto& endpoint : live_endpoints) {
++(dc_endpoints[snitch_ptr->get_datacenter(endpoint)].live);
}
for (auto& endpoint : pending_endpoints) {
++(dc_endpoints[snitch_ptr->get_datacenter(endpoint)].pending);
++(dc_endpoints[snitch_ptr->get_datacenter(endpoint)]);
}
return dc_endpoints;
@@ -132,23 +122,21 @@ is_sufficient_live_nodes(consistency_level cl,
keyspace& ks,
const std::vector<gms::inet_address>& live_endpoints);
template<typename Range, typename PendingRange>
template<typename Range>
inline bool assure_sufficient_live_nodes_each_quorum(
consistency_level cl,
keyspace& ks,
Range& live_endpoints,
const PendingRange& pending_endpoints) {
Range& live_endpoints) {
using namespace locator;
auto& rs = ks.get_replication_strategy();
if (rs.get_type() == replication_strategy_type::network_topology) {
for (auto& entry : count_per_dc_endpoints(ks, live_endpoints, pending_endpoints)) {
for (auto& entry : count_per_dc_endpoints(ks, live_endpoints)) {
auto dc_block_for = local_quorum_for(ks, entry.first);
auto dc_live = entry.second.live;
auto dc_pending = entry.second.pending;
auto dc_live = entry.second;
if (dc_live < dc_block_for + dc_pending) {
if (dc_live < dc_block_for) {
throw exceptions::unavailable_exception(cl, dc_block_for, dc_live);
}
}
@@ -159,12 +147,11 @@ inline bool assure_sufficient_live_nodes_each_quorum(
return false;
}
template<typename Range, typename PendingRange = std::array<gms::inet_address, 0>>
template<typename Range>
inline void assure_sufficient_live_nodes(
consistency_level cl,
keyspace& ks,
Range& live_endpoints,
const PendingRange& pending_endpoints = std::array<gms::inet_address, 0>()) {
Range& live_endpoints) {
size_t need = block_for(ks, cl);
switch (cl) {
@@ -172,13 +159,13 @@ inline void assure_sufficient_live_nodes(
// local hint is acceptable, and local node is always live
break;
case consistency_level::LOCAL_ONE:
if (count_local_endpoints(live_endpoints) < count_local_endpoints(pending_endpoints) + 1) {
if (count_local_endpoints(live_endpoints) == 0) {
throw exceptions::unavailable_exception(cl, 1, 0);
}
break;
case consistency_level::LOCAL_QUORUM: {
size_t local_live = count_local_endpoints(live_endpoints);
if (local_live < need + count_local_endpoints(pending_endpoints)) {
if (local_live < need) {
#if 0
if (logger.isDebugEnabled())
{
@@ -197,15 +184,14 @@ inline void assure_sufficient_live_nodes(
break;
}
case consistency_level::EACH_QUORUM:
if (assure_sufficient_live_nodes_each_quorum(cl, ks, live_endpoints, pending_endpoints)) {
if (assure_sufficient_live_nodes_each_quorum(cl, ks, live_endpoints)) {
break;
}
// Fallthrough on purpose for SimpleStrategy
default:
size_t live = live_endpoints.size();
size_t pending = pending_endpoints.size();
if (live < need + pending) {
cl_logger.debug("Live nodes {} do not satisfy ConsistencyLevel ({} required, {} pending)", live, need, pending);
if (live < need) {
cl_logger.debug("Live nodes {} do not satisfy ConsistencyLevel ({} required)", live, need);
throw exceptions::unavailable_exception(cl, need, live);
}
break;


@@ -663,7 +663,7 @@ future<std::set<sstring>> merge_keyspaces(distributed<service::storage_proxy>& p
});
}
static future<> update_column_family(database& db, schema_ptr new_schema) {
static void update_column_family(database& db, schema_ptr new_schema) {
column_family& cfm = db.find_column_family(new_schema->id());
bool columns_changed = !cfm.schema()->equal_columns(*new_schema);
@@ -672,7 +672,7 @@ static future<> update_column_family(database& db, schema_ptr new_schema) {
s->registry_entry()->mark_synced();
cfm.set_schema(std::move(s));
return service::get_local_migration_manager().notify_update_column_family(cfm.schema(), columns_changed);
service::get_local_migration_manager().notify_update_column_family(cfm.schema(), columns_changed);
}
// see the comments for merge_keyspaces()
@@ -713,15 +713,15 @@ static void merge_tables(distributed<service::storage_proxy>& proxy,
auto& cf = db.find_column_family(s);
cf.mark_ready_for_writes();
ks.make_directory_for_column_family(s->cf_name(), s->id()).get();
service::get_local_migration_manager().notify_create_column_family(s).get();
service::get_local_migration_manager().notify_create_column_family(s);
}
for (auto&& gs : altered) {
update_column_family(db, gs.get()).get();
update_column_family(db, gs.get());
}
parallel_for_each(dropped.begin(), dropped.end(), [&db, &tsf](auto&& gs) {
schema_ptr s = gs.get();
return db.drop_column_family(s->ks_name(), s->cf_name(), [&tsf] { return tsf.value(); }).then([s] {
return service::get_local_migration_manager().notify_drop_column_family(s);
service::get_local_migration_manager().notify_drop_column_family(s);
});
}).get();
});

dist/ami/build_ami.sh vendored

@@ -29,74 +29,28 @@ while [ $# -gt 0 ]; do
esac
done
. /etc/os-release
case "$ID" in
"centos")
AMI=ami-f3102499
REGION=us-east-1
SSH_USERNAME=centos
;;
"ubuntu")
AMI=ami-ff427095
REGION=us-east-1
SSH_USERNAME=ubuntu
;;
*)
echo "build_ami.sh does not support this distribution."
exit 1
;;
esac
if [ $LOCALRPM -eq 1 ]; then
if [ "$ID" = "centos" ]; then
rm -rf build/*
sudo yum -y install git
if [ ! -f dist/ami/files/scylla-server.x86_64.rpm ]; then
dist/redhat/build_rpm.sh
cp build/rpmbuild/RPMS/x86_64/scylla-server-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-server.x86_64.rpm
fi
if [ ! -f dist/ami/files/scylla-jmx.noarch.rpm ]; then
cd build
git clone --depth 1 https://github.com/scylladb/scylla-jmx.git
cd scylla-jmx
sh -x -e dist/redhat/build_rpm.sh $*
cd ../..
cp build/scylla-jmx/build/rpmbuild/RPMS/noarch/scylla-jmx-`cat build/scylla-jmx/build/SCYLLA-VERSION-FILE`-`cat build/scylla-jmx/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-jmx.noarch.rpm
fi
if [ ! -f dist/ami/files/scylla-tools.noarch.rpm ]; then
cd build
git clone --depth 1 https://github.com/scylladb/scylla-tools-java.git
cd scylla-tools-java
sh -x -e dist/redhat/build_rpm.sh
cd ../..
cp build/scylla-tools-java/build/rpmbuild/RPMS/noarch/scylla-tools-`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-tools.noarch.rpm
fi
else
sudo apt-get install -y git
if [ ! -f dist/ami/files/scylla-server_amd64.deb ]; then
if [ ! -f ../scylla-server_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-ubuntu1_amd64.deb ]; then
echo "Build .deb before running build_ami.sh"
exit 1
fi
cp ../scylla-server_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-ubuntu1_amd64.deb dist/ami/files/scylla-server_amd64.deb
fi
if [ ! -f dist/ami/files/scylla-jmx_all.deb ]; then
cd build
git clone --depth 1 https://github.com/scylladb/scylla-jmx.git
cd scylla-jmx
sh -x -e dist/ubuntu/build_deb.sh $*
cd ../..
cp build/scylla-jmx_`cat build/scylla-jmx/build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/scylla-jmx/build/SCYLLA-RELEASE-FILE`-ubuntu1_all.deb dist/ami/files/scylla-jmx_all.deb
fi
if [ ! -f dist/ami/files/scylla-tools_all.deb ]; then
cd build
git clone --depth 1 https://github.com/scylladb/scylla-tools-java.git
cd scylla-tools-java
sh -x -e dist/ubuntu/build_deb.sh $*
cd ../..
cp build/scylla-tools_`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`-ubuntu1_all.deb dist/ami/files/scylla-tools_all.deb
fi
rm -rf build/*
sudo yum -y install git
if [ ! -f dist/ami/files/scylla-server.x86_64.rpm ]; then
dist/redhat/build_rpm.sh
cp build/rpmbuild/RPMS/x86_64/scylla-server-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-server.x86_64.rpm
fi
if [ ! -f dist/ami/files/scylla-jmx.noarch.rpm ]; then
cd build
git clone --depth 1 https://github.com/scylladb/scylla-jmx.git
cd scylla-jmx
sh -x -e dist/redhat/build_rpm.sh $*
cd ../..
cp build/scylla-jmx/build/rpmbuild/RPMS/noarch/scylla-jmx-`cat build/scylla-jmx/build/SCYLLA-VERSION-FILE`-`cat build/scylla-jmx/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-jmx.noarch.rpm
fi
if [ ! -f dist/ami/files/scylla-tools.noarch.rpm ]; then
cd build
git clone --depth 1 https://github.com/scylladb/scylla-tools-java.git
cd scylla-tools-java
sh -x -e dist/redhat/build_rpm.sh
cd ../..
cp build/scylla-tools-java/build/rpmbuild/RPMS/noarch/scylla-tools-`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-tools.noarch.rpm
fi
fi
@@ -115,4 +69,4 @@ if [ ! -d packer ]; then
cd -
fi
packer/packer build -var-file=variables.json -var install_args="$INSTALL_ARGS" -var region="$REGION" -var source_ami="$AMI" -var ssh_username="$SSH_USERNAME" scylla.json
packer/packer build -var-file=variables.json -var install_args="$INSTALL_ARGS" scylla.json


@@ -30,21 +30,7 @@ echo 'More documentation available at: '
echo ' http://www.scylladb.com/doc/'
echo
. /etc/os-release
if [ "$ID" = "ubuntu" ]; then
if [ "`initctl status ssh|grep "running, process"`" != "" ]; then
STARTED=1
else
STARTED=0
fi
else
if [ "`systemctl is-active scylla-server`" = "active" ]; then
STARTED=1
else
STARTED=0
fi
fi
if [ $STARTED -eq 1 ]; then
if [ "`systemctl is-active scylla-server`" = "active" ]; then
tput setaf 4
tput bold
echo " ScyllaDB is active."
@@ -56,13 +42,6 @@ else
echo " ScyllaDB is not started!"
tput sgr0
echo "Please wait for startup. To see status of ScyllaDB, run "
if [ "$ID" = "ubuntu" ]; then
echo " 'initctl status scylla-server'"
echo "and"
echo " 'cat /var/log/upstart/scylla-server.log'"
echo
else
echo " 'systemctl status scylla-server'"
echo
fi
echo " 'systemctl status scylla-server'"
echo
fi

dist/ami/scylla.json vendored

@@ -8,10 +8,10 @@
"security_group_id": "{{user `security_group_id`}}",
"region": "{{user `region`}}",
"associate_public_ip_address": "{{user `associate_public_ip_address`}}",
"source_ami": "{{user `source_ami`}}",
"source_ami": "ami-f3102499",
"user_data_file": "user_data.txt",
"instance_type": "{{user `instance_type`}}",
"ssh_username": "{{user `ssh_username`}}",
"ssh_username": "centos",
"ssh_timeout": "5m",
"ami_name": "{{user `ami_prefix`}}scylla_{{isotime | clean_ami_name}}",
"enhanced_networking": true,
@@ -62,17 +62,17 @@
{
"type": "file",
"source": "files/",
"destination": "/home/{{user `ssh_username`}}/"
"destination": "/home/centos/"
},
{
"type": "file",
"source": "../../scripts/scylla_install_pkg",
"destination": "/home/{{user `ssh_username`}}/scylla_install_pkg"
"destination": "/home/centos/scylla_install_pkg"
},
{
"type": "shell",
"inline": [
"sudo /home/{{user `ssh_username`}}/scylla-ami/scylla_install_ami {{ user `install_args` }}"
"sudo /home/centos/scylla-ami/scylla_install_ami {{ user `install_args` }}"
]
}
],
@@ -85,8 +85,6 @@
"associate_public_ip_address": "",
"instance_type": "",
"install_args": "",
"ami_prefix": "",
"source_ami": "",
"ssh_username": ""
"ami_prefix": ""
}
}


@@ -1,12 +1,5 @@
LoadPlugin network
LoadPlugin unixsock
# dummy write_graphite to silent noisy warning
LoadPlugin network
<Plugin "network">
Server "127.0.0.1 65534"
</Plugin>
<Plugin network>
Listen "127.0.0.1" "25826"
</Plugin>


@@ -2,25 +2,6 @@
#
# Copyright (C) 2015 ScyllaDB
print_usage() {
echo "scylla_bootparam_setup --ami"
echo " --ami setup AMI instance"
exit 1
}
AMI_OPT=0
while [ $# -gt 0 ]; do
case "$1" in
"--ami")
AMI_OPT=1
shift 1
;;
*)
print_usage
;;
esac
done
. /etc/os-release
if [ ! -f /etc/default/grub ]; then
@@ -33,11 +14,7 @@ if [ "`grep hugepagesz /etc/default/grub`" != "" ] || [ "`grep hugepages /etc/de
sed -e "s#hugepages=[0-9]* ##" /etc/default/grub > /tmp/grub
mv /tmp/grub /etc/default/grub
fi
if [ $AMI_OPT -eq 1 ]; then
sed -e "s#^GRUB_CMDLINE_LINUX=\"#GRUB_CMDLINE_LINUX=\"clocksource=tsc tsc=reliable hugepagesz=2M hugepages=$NR_HUGEPAGES #" /etc/default/grub > /tmp/grub
else
sed -e "s#^GRUB_CMDLINE_LINUX=\"#GRUB_CMDLINE_LINUX=\"hugepagesz=2M hugepages=$NR_HUGEPAGES #" /etc/default/grub > /tmp/grub
fi
sed -e "s#^GRUB_CMDLINE_LINUX=\"#GRUB_CMDLINE_LINUX=\"hugepagesz=2M hugepages=$NR_HUGEPAGES #" /etc/default/grub > /tmp/grub
mv /tmp/grub /etc/default/grub
if [ "$ID" = "ubuntu" ]; then
grub-mkconfig -o /boot/grub/grub.cfg


@@ -1,31 +0,0 @@
#!/bin/sh -e
#
# Copyright (C) 2015 ScyllaDB
print_usage() {
echo "scylla_developer_mode_setup --developer-mode=[0|1]"
echo " --developer-mode enable/disable developer mode"
exit 1
}
DEV_MODE=
while [ $# -gt 0 ]; do
case "$1" in
"--developer-mode")
DEV_MODE=$2
shift 2
;;
*)
print_usage
;;
esac
done
if [ "$DEV_MODE" = "" ]; then
print_usage
fi
if [ "$DEV_MODE" != "0" ] && [ "$DEV_MODE" != "1" ]; then
print_usage
fi
echo "DEV_MODE=--developer-mode=$DEV_MODE" > /etc/scylla.d/dev-mode.conf


@@ -1,49 +1,34 @@
#!/bin/sh
print_usage() {
echo "scylla_io_setup --ami"
echo " --ami setup AMI instance"
exit 1
is_ami() {
if [ "`dmidecode --string system-version | grep \.amazon`" != "" ] && \
[ "`curl http://169.254.169.254/latest/meta-data/ami-id | grep ami-`" != "" ]; then
echo 1
else
echo 0
fi
}
AMI_OPT=0
while [ $# -gt 0 ]; do
case "$1" in
"--ami")
AMI_OPT=1
shift 1
;;
*)
print_usage
;;
is_supported_instance_type() {
TYPE=`curl http://169.254.169.254/latest/meta-data/instance-type|cut -d . -f 1`
case $TYPE in
"m3"|"c3"|"i2") echo 1;;
*) echo 0;;
esac
done
}
is_developer_mode() {
cat /etc/scylla.d/dev-mode.conf|egrep -c "\-\-developer-mode(\s+|=)(1|true)"
echo $SCYLLA_ARGS|egrep -c "\-\-developer-mode(\s+|=)1"
}
output_to_user()
{
echo "$1"
logger -p user.err "$1"
}
. /etc/os-release
if [ "$NAME" = "Ubuntu" ]; then
. /etc/default/scylla-server
else
. /etc/sysconfig/scylla-server
fi
if [ `is_developer_mode` -eq 0 ]; then
SMP=`echo $SCYLLA_ARGS|grep smp|sed -e "s/^.*smp\(\s\+\|=\)\([0-9]*\).*$/\2/"`
CPUSET=`echo $SCYLLA_ARGS|grep cpuset|sed -e "s/^.*\(--cpuset\(\s\+\|=\)[0-9\-]*\).*$/\1/"`
if [ $AMI_OPT -eq 1 ]; then
if [ ! -f /etc/scylla/io_configured ] && [ `is_developer_mode` -eq 0 ]; then
if [ `is_ami` -eq 1 ]; then
SMP=`echo $SCYLLA_ARGS|sed -e "s/^.*smp\(\s\+\|=\)\([0-9]*\).*$/\2/"`
CPUSET=`echo $SCYLLA_ARGS|sed -e "s/^.*\(--cpuset\(\s\+\|=\)[0-9\-]*\).*$/\1/"`
fi
if [ `is_ami` -eq 1 ] && [ `is_supported_instance_type` -eq 1 ]; then
NR_CPU=`cat /proc/cpuinfo |grep processor|wc -l`
NR_DISKS=`lsblk --list --nodeps --noheadings | grep -v xvda | grep xvd | wc -l`
TYPE=`curl http://169.254.169.254/latest/meta-data/instance-type|cut -d . -f 1`
NR_DISKS=`curl http://169.254.169.254/latest/meta-data/block-device-mapping/|grep ephemeral|wc -l`
if [ "$SMP" != "" ]; then
NR_CPU=$SMP
@@ -61,20 +46,17 @@ if [ `is_developer_mode` -eq 0 ]; then
NR_IO_QUEUES=$(($NR_REQS / 4))
fi
NR_IO_QUEUES=$((NR_IO_QUEUES>NR_SHARDS?NR_SHARDS:NR_IO_QUEUES))
NR_REQS=$(($(($NR_REQS / $NR_IO_QUEUES)) * $NR_IO_QUEUES))
if [ "$TYPE" = "i2" ]; then
NR_REQS=$(($NR_REQS * 2))
fi
echo "SEASTAR_IO=\"--num-io-queues $NR_IO_QUEUES --max-io-requests $NR_REQS\"" > /etc/scylla.d/io.conf
echo "SCYLLA_IO=\"--num-io-queues $NR_IO_QUEUES --max-io-requests $NR_REQS\"" > /etc/scylla.d/io.conf
else
iotune --evaluation-directory /var/lib/scylla --format envfile --options-file /etc/scylla.d/io.conf $CPUSET
if [ $? -ne 0 ]; then
output_to_user "/var/lib/scylla did not pass validation tests, it may not be on XFS and/or has limited disk space."
output_to_user "This is a non-supported setup, and performance is expected to be very bad."
output_to_user "For better performance, placing your data on XFS-formatted directories is required."
output_to_user " To override this error, see the developer_mode configuration option."
logger -p user.err "/var/lib/scylla did not pass validation tests, it may not be on XFS and/or has limited disk space."
logger -p user.err "This is a non-supported setup, and performance is expected to be very bad."
logger -p user.err "For better performance, placing your data on XFS-formatted directories is required."
logger -p user.err " To override this error, see the developer_mode configuration option."
fi
fi
touch /etc/scylla/io_configured
fi


@@ -49,7 +49,7 @@ fi
. /etc/os-release
if [ "$NAME" = "Ubuntu" ]; then
env DEBIAN_FRONTEND=noninteractive apt-get -y install mdadm xfsprogs
apt-get -y install mdadm xfsprogs
else
yum -y install mdadm xfsprogs
fi


@@ -8,12 +8,11 @@ if [ "`id -u`" -ne 0 ]; then
fi
print_usage() {
echo "scylla_setup --disks /dev/hda,/dev/hdb... --nic eth0 --ntp-domain centos --ami --developer-mode --no-enable-service --no-selinux-setup --no-bootparam-setup --no-ntp-setup --no-raid-setup --no-coredump-setup --no-sysconfig-setup"
echo "scylla_setup --disks /dev/hda,/dev/hdb... --nic eth0 --ntp-domain centos --ami --no-enable-service --no-selinux-setup --no-bootparam-setup --no-ntp-setup --no-raid-setup --no-coredump-setup --no-sysconfig-setup"
echo " --disks specify disks for RAID"
echo " --nic specify NIC"
echo " --ntp-domain specify NTP domain"
echo " --ami setup AMI instance"
echo " --developer-mode enable developer mode"
echo " --no-enable-service skip enabling service"
echo " --no-selinux-setup skip selinux setup"
echo " --no-bootparam-setup skip bootparam setup"
@@ -21,7 +20,6 @@ print_usage() {
echo " --no-raid-setup skip raid setup"
echo " --no-coredump-setup skip coredump setup"
echo " --no-sysconfig-setup skip sysconfig setup"
echo " --no-io-setup skip IO configuration setup"
exit 1
}
@@ -42,7 +40,6 @@ interactive_ask_service() {
}
AMI=0
DEV_MODE=0
ENABLE_SERVICE=1
SELINUX_SETUP=1
BOOTPARAM_SETUP=1
@@ -50,7 +47,6 @@ NTP_SETUP=1
RAID_SETUP=1
COREDUMP_SETUP=1
SYSCONFIG_SETUP=1
IO_SETUP=1
if [ $# -ne 0 ]; then
INTERACTIVE=0
@@ -76,10 +72,6 @@ while [ $# -gt 0 ]; do
AMI=1
shift 1
;;
"--developer-mode")
DEV_MODE=1
shift 1
;;
"--no-enable-service")
ENABLE_SERVICE=0
shift 1
@@ -108,10 +100,6 @@ while [ $# -gt 0 ]; do
SYSCONFIG_SETUP=0
shift 1
;;
"--no-io-setup")
IO_SETUP=0
shift 1
;;
"-h" | "--help")
print_usage
shift 1
@@ -134,9 +122,9 @@ if [ $INTERACTIVE -eq 1 ]; then
fi
if [ $ENABLE_SERVICE -eq 1 ]; then
if [ "$ID" = "fedora" ] || [ "$ID" = "centos" ]; then
systemctl enable scylla-io-setup.service
systemctl enable scylla-server.service
systemctl enable scylla-jmx.service
systemctl enable collectd.service
fi
fi
@@ -174,21 +162,21 @@ if [ $INTERACTIVE -eq 1 ]; then
if [ $RAID_SETUP -eq 1 ]; then
echo "Please select disks from following list: "
while true; do
lsblk -d -i -n -r|awk '{print $1}'|sed -e ':loop;N;$!b loop;s/\n/ /g'
lsblk -d -i -n -p -r|awk '{print $1}'|sed -e ':loop;N;$!b loop;s/\n/ /g'
echo "type 'done' to finish selection. selected: $DISKS"
echo -n "> "
read dsk
if [ "$dsk" = "done" ]; then
break
fi
if [ -e /dev/$dsk ]; then
if [ -e $dsk ]; then
if [ "$DISKS" = "" ]; then
DISKS=/dev/$dsk
DISKS=$dsk
else
DISKS="$DISKS,/dev/$dsk"
DISKS="$DISKS,$dsk"
fi
else
echo "/dev/$dsk not found"
echo "$dsk not found"
fi
done
fi
@@ -224,18 +212,6 @@ if [ $INTERACTIVE -eq 1 ]; then
done
fi
fi
if [ $INTERACTIVE -eq 1 ]; then
interactive_ask_service "Do you want to setup IO configuration?" &&:
IO_SETUP=$?
fi
if [ $IO_SETUP -eq 1 ]; then
/usr/lib/scylla/scylla_io_setup
fi
if [ $SYSCONFIG_SETUP -eq 1 ]; then
/usr/lib/scylla/scylla_sysconfig_setup --nic $NIC
fi
if [ $DEV_MODE -eq 1 ]; then
/usr/lib/scylla/scylla_dev_mode_setup --developer-mode 1
fi


@@ -1,4 +0,0 @@
# DO NOT EDIT
# This file should be automatically configured by scylla_dev_mode_setup
#
# DEV_MODE=--developer-mode=0


@@ -1,4 +1,4 @@
# DO NOT EDIT
# This file should be automatically configured by scylla_io_setup
# This file should be automatically configured by scylla-io-setup.service
#
# SEASTAR_IO="--max-io-requests=1 --num-io-queues=1"


@@ -1 +1 @@
scylla ALL=(ALL) NOPASSWD:SETENV: /usr/lib/scylla/scylla_prepare,/usr/lib/scylla/scylla_stop,/usr/lib/scylla/scylla_io_setup,/usr/lib/scylla/scylla-ami/scylla_ami_setup
scylla ALL=(ALL) NOPASSWD:SETENV: /usr/lib/scylla/scylla_prepare,/usr/lib/scylla/scylla_stop,/usr/lib/scylla/scylla_io_setup


@@ -4,7 +4,6 @@ MAINTAINER Avi Kivity <avi@cloudius-systems.com>
RUN yum -y install epel-release
ADD scylla.repo /etc/yum.repos.d/
RUN yum -y clean expire-cache
RUN yum -y update
RUN yum -y remove boost-thread boost-system
RUN yum -y install scylla-server hostname


@@ -113,9 +113,11 @@ if [ -f /etc/systemd/coredump.conf ];then
/usr/lib/scylla/scylla_coredump_setup
fi
%systemd_post scylla-server.service
%systemd_post scylla-io-setup.service
%preun
%systemd_preun scylla-server.service
%systemd_preun scylla-io-setup.service
%postun
%systemd_postun
@@ -149,6 +151,7 @@ rm -rf $RPM_BUILD_ROOT
%{_docdir}/scylla/ORIGIN
%{_docdir}/scylla/licenses/
%{_unitdir}/scylla-server.service
%{_unitdir}/scylla-io-setup.service
%{_bindir}/scylla
%{_bindir}/iotune
%{_bindir}/scyllatop
@@ -162,7 +165,6 @@ rm -rf $RPM_BUILD_ROOT
%{_prefix}/lib/scylla/scylla_ntp_setup
%{_prefix}/lib/scylla/scylla_selinux_setup
%{_prefix}/lib/scylla/scylla_io_setup
%{_prefix}/lib/scylla/scylla_dev_mode_setup
%{_prefix}/lib/scylla/posix_net_conf.sh
%{_prefix}/lib/scylla/dpdk_nic_bind.py
%{_prefix}/lib/scylla/dpdk_nic_bind.pyc


@@ -0,0 +1,10 @@
[Unit]
Description=Scylla IO Setup
After=network.target
[Service]
Type=oneshot
EnvironmentFile=/etc/sysconfig/scylla-server
ExecStart=/usr/lib/scylla/scylla_io_setup
RemainAfterExit=yes
TimeoutStartSec=1800


@@ -1,5 +1,7 @@
[Unit]
Description=Scylla Server
After=scylla-io-setup.service
Requires=scylla-io-setup.service
[Service]
Type=notify
@@ -12,7 +14,7 @@ Environment="HOME=/var/lib/scylla"
EnvironmentFile=/etc/sysconfig/scylla-server
EnvironmentFile=/etc/scylla.d/*.conf
ExecStartPre=/usr/bin/sudo -E /usr/lib/scylla/scylla_prepare
ExecStart=/usr/bin/scylla $SCYLLA_ARGS $SEASTAR_IO $DEV_MODE
ExecStart=/usr/bin/scylla $SCYLLA_ARGS $SCYLLA_IO
ExecStopPost=/usr/bin/sudo -E /usr/lib/scylla/scylla_stop
TimeoutStartSec=900
KillMode=process


@@ -32,7 +32,7 @@ if [ `grep -c $RELEASE dist/ubuntu/supported_release` -lt 1 ]; then
fi
VERSION=$(./SCYLLA-VERSION-GEN)
SCYLLA_VERSION=$(cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/')
SCYLLA_VERSION=$(cat build/SCYLLA-VERSION-FILE)
SCYLLA_RELEASE=$(cat build/SCYLLA-RELEASE-FILE)
echo $VERSION > version
./scripts/git-archive-all --extra version --force-submodules --prefix scylla-server ../scylla-server_$SCYLLA_VERSION-$SCYLLA_RELEASE.orig.tar.gz


@@ -37,10 +37,8 @@ eval "`grep -v -e "^\s*#" -e "^$" /etc/scylla.d/*.conf|sed -e 's/^/export /'`"
do_start()
{
if [ "$AMI" = "yes" ]; then
/usr/lib/scylla/scylla-ami/scylla_ami_setup
fi
/usr/lib/scylla/scylla_prepare
/usr/lib/scylla/scylla_io_setup
# Return
# 0 if daemon has been started
# 1 if daemon was already running


@@ -26,30 +26,19 @@ env HOME=/var/lib/scylla
pre-start script
eval "`grep -v -e "^\s*#" -e "^$" /etc/default/scylla-server|sed -e 's/^/export /'`"
. /etc/scylla.d/dev-mode.conf
. /etc/scylla.d/io.conf
export DEV_MODE
export SEASTAR_IO
if [ "$AMI" = "yes" ]; then
sudo /usr/lib/scylla/scylla-ami/scylla_ami_setup
fi
eval "`grep -v -e "^\s*#" -e "^$" /etc/scylla.d/*.conf|sed -e 's/^/export /'`"
sudo /usr/lib/scylla/scylla_prepare
sudo /usr/lib/scylla/scylla_io_setup
end script
script
eval "`grep -v -e "^\s*#" -e "^$" /etc/default/scylla-server|sed -e 's/^/export /'`"
. /etc/scylla.d/dev-mode.conf
. /etc/scylla.d/io.conf
export DEV_MODE
export SEASTAR_IO
exec /usr/bin/scylla $SCYLLA_ARGS $SEASTAR_IO $DEV_MODE
eval "`grep -v -e "^\s*#" -e "^$" /etc/scylla.d/*.conf|sed -e 's/^/export /'`"
exec /usr/bin/scylla $SCYLLA_ARGS $SCYLLA_IO
end script
post-stop script
eval "`grep -v -e "^\s*#" -e "^$" /etc/default/scylla-server|sed -e 's/^/export /'`"
. /etc/scylla.d/dev-mode.conf
. /etc/scylla.d/io.conf
export DEV_MODE
export SEASTAR_IO
eval "`grep -v -e "^\s*#" -e "^$" /etc/scylla.d/*.conf|sed -e 's/^/export /'`"
sudo /usr/lib/scylla/scylla_stop
end script


@@ -35,7 +35,7 @@ override_dh_auto_install:
cp $(CURDIR)/dist/common/collectd.d/scylla.conf $(COLLECTD)
mkdir -p $(SCYLLAD) && \
cp $(CURDIR)/dist/common/scylla.d/*.conf $(SCYLLAD)
cp $(CURDIR)/dist/common/scylla.d/io.conf $(SCYLLAD)
mkdir -p $(CONF) && \
cp $(CURDIR)/conf/scylla.yaml $(CONF)


@@ -160,8 +160,6 @@ void gossiper::do_sort(std::vector<gossip_digest>& g_digest_list) {
}
}
// Depends on
// - no external dependency
future<> gossiper::handle_syn_msg(msg_addr from, gossip_digest_syn syn_msg) {
logger.trace("cluster_name:peer={},local={},partitioner_name:peer={},local={}",
syn_msg.cluster_id(), get_cluster_name(), syn_msg.partioner(), get_partitioner_name());
@@ -188,12 +186,6 @@ future<> gossiper::handle_syn_msg(msg_addr from, gossip_digest_syn syn_msg) {
return this->ms().send_gossip_digest_ack(from, std::move(ack_msg));
}
// Depends on
// - failure_detector
// - on_change callbacks, e.g., storage_service -> access db system_table
// - on_restart callbacks
// - on_join callbacks
// - on_alive
future<> gossiper::handle_ack_msg(msg_addr id, gossip_digest_ack ack_msg) {
this->set_last_processed_message_at();
if (!this->is_enabled() && !this->is_in_shadow_round()) {
@@ -233,39 +225,6 @@ future<> gossiper::handle_ack_msg(msg_addr id, gossip_digest_ack ack_msg) {
});
}
// Depends on
// - failure_detector
// - on_change callbacks, e.g., storage_service -> access db system_table
// - on_restart callbacks
// - on_join callbacks
// - on_alive callbacks
future<> gossiper::handle_ack2_msg(gossip_digest_ack2 msg) {
set_last_processed_message_at();
if (!is_enabled()) {
return make_ready_future<>();
}
auto& remote_ep_state_map = msg.get_endpoint_state_map();
/* Notify the Failure Detector */
notify_failure_detector(remote_ep_state_map);
return apply_state_locally(remote_ep_state_map);
}
future<> gossiper::handle_echo_msg() {
set_last_processed_message_at();
return make_ready_future<>();
}
future<> gossiper::handle_shutdown_msg(inet_address from) {
set_last_processed_message_at();
if (!is_enabled()) {
logger.debug("Ignoring shutdown message from {} because gossip is disabled", from);
return make_ready_future<>();
}
return seastar::async([this, from] {
this->mark_as_shutdown(from);
});
}
void gossiper::init_messaging_service_handler() {
if (_ms_registered) {
return;
@@ -293,7 +252,12 @@ void gossiper::init_messaging_service_handler() {
});
ms().register_gossip_digest_ack2([] (gossip_digest_ack2 msg) {
smp::submit_to(0, [msg = std::move(msg)] () mutable {
return gms::get_local_gossiper().handle_ack2_msg(std::move(msg));
auto& gossiper = gms::get_local_gossiper();
gossiper.set_last_processed_message_at();
auto& remote_ep_state_map = msg.get_endpoint_state_map();
/* Notify the Failure Detector */
gossiper.notify_failure_detector(remote_ep_state_map);
return gossiper.apply_state_locally(remote_ep_state_map);
}).handle_exception([] (auto ep) {
logger.warn("Fail to handle GOSSIP_DIGEST_ACK2: {}", ep);
});
@@ -301,12 +265,22 @@ void gossiper::init_messaging_service_handler() {
});
ms().register_gossip_echo([] {
return smp::submit_to(0, [] {
return gms::get_local_gossiper().handle_echo_msg();
auto& gossiper = gms::get_local_gossiper();
gossiper.set_last_processed_message_at();
return make_ready_future<>();
});
});
ms().register_gossip_shutdown([] (inet_address from) {
smp::submit_to(0, [from] {
return gms::get_local_gossiper().handle_shutdown_msg(from);
auto& gossiper = gms::get_local_gossiper();
gossiper.set_last_processed_message_at();
if (!gossiper.is_enabled()) {
logger.debug("Ignoring shutdown message from {} because gossip is disabled", from);
return make_ready_future<>();
}
return seastar::async([from] {
gms::get_local_gossiper().mark_as_shutdown(from);
});
}).handle_exception([] (auto ep) {
logger.warn("Fail to handle GOSSIP_SHUTDOWN: {}", ep);
});
@@ -514,135 +488,130 @@ void gossiper::do_status_check() {
}
}
// Depends on:
// - failure_detector
// - on_remove callbacks, e.g, storage_service -> access token_metadata
void gossiper::run() {
timer_callback_lock().then([this, g = this->shared_from_this()] {
seastar::async([this, g] {
logger.trace("=== Gossip round START");
_callback_running = seastar::async([this, g = this->shared_from_this()] {
logger.trace("=== Gossip round START");
//wait on messaging service to start listening
// MessagingService.instance().waitUntilListening();
//wait on messaging service to start listening
// MessagingService.instance().waitUntilListening();
/* Update the local heartbeat counter. */
auto br_addr = get_broadcast_address();
heart_beat_state& hbs = endpoint_state_map[br_addr].get_heart_beat_state();
hbs.update_heart_beat();
/* Update the local heartbeat counter. */
auto br_addr = get_broadcast_address();
heart_beat_state& hbs = endpoint_state_map[br_addr].get_heart_beat_state();
hbs.update_heart_beat();
//
// We don't care about heart_beat change on other CPUs - so ingnore this
// specific change.
//
shadow_endpoint_state_map[br_addr].set_heart_beat_state(hbs);
//
// We don't care about heart_beat change on other CPUs - so ingnore this
// specific change.
//
_shadow_endpoint_state_map[br_addr].set_heart_beat_state(hbs);
logger.trace("My heartbeat is now {}", endpoint_state_map[br_addr].get_heart_beat_state().get_heart_beat_version());
std::vector<gossip_digest> g_digests;
this->make_random_gossip_digest(g_digests);
logger.trace("My heartbeat is now {}", endpoint_state_map[br_addr].get_heart_beat_state().get_heart_beat_version());
std::vector<gossip_digest> g_digests;
this->make_random_gossip_digest(g_digests);
if (g_digests.size() > 0) {
gossip_digest_syn message(get_cluster_name(), get_partitioner_name(), g_digests);
if (g_digests.size() > 0) {
gossip_digest_syn message(get_cluster_name(), get_partitioner_name(), g_digests);
_gossiped_to_seed = false;
_gossiped_to_seed = false;
/* Gossip to some random live member */
do_gossip_to_live_member(message).handle_exception([] (auto ep) {
logger.trace("Faill to do_gossip_to_live_member: {}", ep);
/* Gossip to some random live member */
do_gossip_to_live_member(message).handle_exception([] (auto ep) {
logger.trace("Faill to do_gossip_to_live_member: {}", ep);
});
/* Gossip to some unreachable member with some probability to check if he is back up */
do_gossip_to_unreachable_member(message).handle_exception([] (auto ep) {
logger.trace("Faill to do_gossip_to_unreachable_member: {}", ep);
});
/* Gossip to a seed if we did not do so above, or we have seen less nodes
than there are seeds. This prevents partitions where each group of nodes
is only gossiping to a subset of the seeds.
The most straightforward check would be to check that all the seeds have been
verified either as live or unreachable. To avoid that computation each round,
we reason that:
either all the live nodes are seeds, in which case non-seeds that come online
will introduce themselves to a member of the ring by definition,
or there is at least one non-seed node in the list, in which case eventually
someone will gossip to it, and then do a gossip to a random seed from the
gossipedToSeed check.
See CASSANDRA-150 for more exposition. */
logger.trace("gossiped_to_seed={}, _live_endpoints.size={}, _seeds.size={}",
_gossiped_to_seed, _live_endpoints.size(), _seeds.size());
if (!_gossiped_to_seed || _live_endpoints.size() < _seeds.size()) {
do_gossip_to_seed(message).handle_exception([] (auto ep) {
logger.trace("Faill to do_gossip_to_seed: {}", ep);
});
/* Gossip to some unreachable member with some probability to check if he is back up */
do_gossip_to_unreachable_member(message).handle_exception([] (auto ep) {
logger.trace("Faill to do_gossip_to_unreachable_member: {}", ep);
});
/* Gossip to a seed if we did not do so above, or we have seen less nodes
than there are seeds. This prevents partitions where each group of nodes
is only gossiping to a subset of the seeds.
The most straightforward check would be to check that all the seeds have been
verified either as live or unreachable. To avoid that computation each round,
we reason that:
either all the live nodes are seeds, in which case non-seeds that come online
will introduce themselves to a member of the ring by definition,
or there is at least one non-seed node in the list, in which case eventually
someone will gossip to it, and then do a gossip to a random seed from the
gossipedToSeed check.
See CASSANDRA-150 for more exposition. */
logger.trace("gossiped_to_seed={}, _live_endpoints.size={}, _seeds.size={}",
_gossiped_to_seed, _live_endpoints.size(), _seeds.size());
if (!_gossiped_to_seed || _live_endpoints.size() < _seeds.size()) {
do_gossip_to_seed(message).handle_exception([] (auto ep) {
logger.trace("Faill to do_gossip_to_seed: {}", ep);
});
}
do_status_check();
}
//
// Gossiper task runs only on CPU0:
//
// - If endpoint_state_map or _live_endpoints have changed - duplicate
// them across all other shards.
// - Reschedule the gossiper only after execution on all nodes is done.
//
bool endpoint_map_changed = (shadow_endpoint_state_map != endpoint_state_map);
bool live_endpoint_changed = (_live_endpoints != _shadow_live_endpoints);
bool unreachable_endpoint_changed = (_unreachable_endpoints != _shadow_unreachable_endpoints);
do_status_check();
}
if (endpoint_map_changed || live_endpoint_changed || unreachable_endpoint_changed) {
if (endpoint_map_changed) {
shadow_endpoint_state_map = endpoint_state_map;
}
//
// Gossiper task runs only on CPU0:
//
// - If endpoint_state_map or _live_endpoints have changed - duplicate
// them across all other shards.
// - Reschedule the gossiper only after execution on all nodes is done.
//
bool endpoint_map_changed = (_shadow_endpoint_state_map != endpoint_state_map);
bool live_endpoint_changed = (_live_endpoints != _shadow_live_endpoints);
bool unreachable_endpoint_changed = (_unreachable_endpoints != _shadow_unreachable_endpoints);
if (live_endpoint_changed) {
_shadow_live_endpoints = _live_endpoints;
}
if (endpoint_map_changed || live_endpoint_changed || unreachable_endpoint_changed) {
if (endpoint_map_changed) {
_shadow_endpoint_state_map = endpoint_state_map;
}
if (unreachable_endpoint_changed) {
_shadow_unreachable_endpoints = _unreachable_endpoints;
}
if (live_endpoint_changed) {
_shadow_live_endpoints = _live_endpoints;
}
_the_gossiper.invoke_on_all([this, endpoint_map_changed,
live_endpoint_changed, unreachable_endpoint_changed] (gossiper& local_gossiper) {
// Don't copy gossiper(CPU0) maps into themselves!
if (engine().cpu_id() != 0) {
if (endpoint_map_changed) {
local_gossiper.endpoint_state_map = shadow_endpoint_state_map;
}
if (unreachable_endpoint_changed) {
_shadow_unreachable_endpoints = _unreachable_endpoints;
}
if (live_endpoint_changed) {
local_gossiper._live_endpoints = _shadow_live_endpoints;
}
if (unreachable_endpoint_changed) {
local_gossiper._unreachable_endpoints = _shadow_unreachable_endpoints;
}
_the_gossiper.invoke_on_all([this, endpoint_map_changed,
live_endpoint_changed, unreachable_endpoint_changed] (gossiper& local_gossiper) {
// Don't copy gossiper(CPU0) maps into themselves!
if (engine().cpu_id() != 0) {
if (endpoint_map_changed) {
local_gossiper.endpoint_state_map = _shadow_endpoint_state_map;
}
}).get();
}
}).then_wrapped([this] (auto&& f) {
try {
f.get();
_nr_run++;
logger.trace("=== Gossip round OK");
} catch (...) {
logger.trace("=== Gossip round FAIL");
}
if (logger.is_enabled(logging::log_level::trace)) {
for (auto& x : endpoint_state_map) {
logger.trace("ep={}, eps={}", x.first, x.second);
if (live_endpoint_changed) {
local_gossiper._live_endpoints = _shadow_live_endpoints;
}
if (unreachable_endpoint_changed) {
local_gossiper._unreachable_endpoints = _shadow_unreachable_endpoints;
}
}
}).get();
}
}).then_wrapped([this] (auto&& f) {
try {
f.get();
_nr_run++;
logger.trace("=== Gossip round OK");
} catch (...) {
logger.trace("=== Gossip round FAIL");
}
if (logger.is_enabled(logging::log_level::trace)) {
for (auto& x : endpoint_state_map) {
logger.trace("ep={}, eps={}", x.first, x.second);
}
if (_enabled) {
_scheduled_gossip_task.arm(INTERVAL);
}
this->timer_callback_unlock();
});
}
if (_enabled) {
_scheduled_gossip_task.arm(INTERVAL);
}
return make_ready_future<>();
});
}
@@ -709,10 +678,6 @@ int64_t gossiper::get_endpoint_downtime(inet_address ep) {
}
}
// Depends on
// - on_dead callbacks
// It is called from failure_detector
//
// Runs inside seastar::async context
void gossiper::convict(inet_address endpoint, double phi) {
auto it = endpoint_state_map.find(endpoint);
@@ -1455,9 +1420,6 @@ void gossiper::add_saved_endpoint(inet_address ep) {
logger.trace("Adding saved endpoint {} {}", ep, ep_state.get_heart_beat_state().get_generation());
}
// Depends on:
// - before_change callbacks
// - on_change callbacks
future<> gossiper::add_local_application_state(application_state state, versioned_value value) {
return get_gossiper().invoke_on(0, [state, value = std::move(value)] (auto& gossiper) mutable {
return seastar::async([&gossiper, g = gossiper.shared_from_this(), state, value = std::move(value)] () mutable {
@@ -1485,16 +1447,9 @@ future<> gossiper::add_local_application_state(application_state state, versione
}
future<> gossiper::do_stop_gossiping() {
if (!is_enabled()) {
logger.info("gossip is already stopped");
return make_ready_future<>();
}
return seastar::async([this, g = this->shared_from_this()] {
_enabled = false;
auto my_ep_state = get_endpoint_state_for_endpoint(get_broadcast_address());
if (my_ep_state) {
logger.info("My status = {}", get_gossip_status(*my_ep_state));
}
if (my_ep_state && !is_silent_shutdown_state(*my_ep_state)) {
logger.info("Announcing shutdown");
add_local_application_state(application_state::STATUS, storage_service_value_factory().shutdown(true)).get();
@@ -1517,13 +1472,7 @@ future<> gossiper::do_stop_gossiping() {
logger.warn("No local state or state is in silent shutdown, not announcing shutdown");
}
_scheduled_gossip_task.cancel();
timer_callback_lock().get();
//
// Release the timer semaphore since storage_proxy may be waiting for
// it.
// Gossiper timer is promised to be neither running nor scheduled.
//
timer_callback_unlock();
_callback_running.get();
get_gossiper().invoke_on_all([] (gossiper& g) {
if (engine().cpu_id() == 0) {
get_local_failure_detector().unregister_failure_detection_event_listener(&g);

View File

@@ -90,9 +90,6 @@ private:
void uninit_messaging_service_handler();
future<> handle_syn_msg(msg_addr from, gossip_digest_syn syn_msg);
future<> handle_ack_msg(msg_addr from, gossip_digest_ack ack_msg);
future<> handle_ack2_msg(gossip_digest_ack2 msg);
future<> handle_echo_msg();
future<> handle_shutdown_msg(inet_address from);
static constexpr uint32_t _default_cpuid = 0;
msg_addr get_msg_addr(inet_address to) {
return msg_addr{to, _default_cpuid};
@@ -102,10 +99,8 @@ private:
bool _enabled = false;
std::set<inet_address> _seeds_from_config;
sstring _cluster_name;
semaphore _callback_running{1};
future<> _callback_running = make_ready_future<>();
public:
future<> timer_callback_lock() { return _callback_running.wait(); }
void timer_callback_unlock() { _callback_running.signal(); }
sstring get_cluster_name();
sstring get_partitioner_name();
inet_address get_broadcast_address() {
@@ -119,7 +114,6 @@ public:
public:
/* map where key is the endpoint and value is the state associated with the endpoint */
std::unordered_map<inet_address, endpoint_state> endpoint_state_map;
std::unordered_map<inet_address, endpoint_state> shadow_endpoint_state_map;
const std::vector<sstring> DEAD_STATES = {
versioned_value::REMOVING_TOKEN,
@@ -203,6 +197,7 @@ private:
clk::time_point _last_processed_message_at = now();
std::unordered_map<inet_address, endpoint_state> _shadow_endpoint_state_map;
std::map<inet_address, clk::time_point> _shadow_unreachable_endpoints;
std::set<inet_address> _shadow_live_endpoints;

View File

@@ -55,7 +55,7 @@ public:
auto&& col = _s.static_column_at(id);
feed_hash(_h, col.name());
feed_hash(_h, col.type->name());
feed_hash(_h, cell);
feed_hash(cell, _h, col.type);
}
virtual void accept_row_tombstone(clustering_key_prefix_view prefix, tombstone t) {
@@ -80,6 +80,6 @@ public:
auto&& col = _s.regular_column_at(id);
feed_hash(_h, col.name());
feed_hash(_h, col.type->name());
feed_hash(_h, cell);
feed_hash(cell, _h, col.type);
}
};

View File

@@ -20,8 +20,7 @@
*/
namespace gms {
enum class application_state:int {
STATUS = 0,
LOAD,
SCHEMA,
DC,
@@ -30,7 +29,6 @@ enum class application_state:int {
REMOVAL_COORDINATOR,
INTERNAL_IP,
RPC_ADDRESS,
X_11_PADDING,
SEVERITY,
NET_VERSION,
HOST_ID,

View File

@@ -20,14 +20,10 @@
*/
namespace query {
class result final {
bytes_ostream buf();
};
class result_digest final {
std::array<uint8_t, 16> get();
};
class result {
bytes_ostream buf();
std::experimental::optional<query::result_digest> digest();
};
}

View File

@@ -27,7 +27,6 @@
#include "compound_compat.hh"
#include "utils/managed_bytes.hh"
#include "hashing.hh"
#include "database_fwd.hh"
//
// This header defines type system for primary key holders.
@@ -51,6 +50,13 @@
// is not stored. Therefore accessors need to be provided with a pointer to
// schema, from which information about structure is extracted.
class partition_key;
class partition_key_view;
class clustering_key_prefix;
class clustering_key_prefix_view;
using clustering_key = clustering_key_prefix;
using clustering_key_view = clustering_key_prefix_view;
// Abstracts a view to serialized compound.
template <typename TopLevelView>
class compound_view_wrapper {

View File

@@ -366,10 +366,6 @@ int main(int ac, char** av) {
// Note: changed from using a move here, because we want the config object intact.
db.start(std::ref(*cfg)).get();
engine().at_exit([&db] {
// A shared sstable must be compacted by all shards before it can be deleted.
// Since we're stopping, that's not going to happen. Cancel those pending
// deletions to let anyone waiting on them to continue.
sstables::cancel_atomic_deletions();
// #293 - do not stop anything - not even db (for real)
//return db.stop();
// call stop on each db instance, but leave the shareded<database> pointers alive.

View File

@@ -40,10 +40,4 @@ public:
hash.Final(reinterpret_cast<unsigned char*>(digest.begin()));
return digest;
}
std::array<uint8_t, CryptoPP::Weak::MD5::DIGESTSIZE> finalize_array() {
std::array<uint8_t, CryptoPP::Weak::MD5::DIGESTSIZE> array;
hash.Final(reinterpret_cast<unsigned char*>(array.data()));
return array;
}
};

View File

@@ -28,7 +28,6 @@
#include "gms/gossip_digest_syn.hh"
#include "gms/gossip_digest_ack.hh"
#include "gms/gossip_digest_ack2.hh"
#include "gms/gossiper.hh"
#include "query-request.hh"
#include "query-result.hh"
#include "rpc/rpc.hh"
@@ -360,7 +359,6 @@ void messaging_service::cache_preferred_ip(gms::inet_address ep, gms::inet_addre
}
shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::get_rpc_client(messaging_verb verb, msg_addr id) {
assert(!_stopping);
auto idx = get_rpc_client_idx(verb);
auto it = _clients[idx].find(id);
@@ -410,13 +408,6 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
}
void messaging_service::remove_rpc_client_one(clients_map& clients, msg_addr id, bool dead_only) {
if (_stopping) {
// if the messaging service is in the process of being stopped, there is
// no need to stop and remove connections here since they are being
// stopped already and we'll just interfere
return;
}
auto it = clients.find(id);
if (it != clients.end() && (!dead_only || it->second.rpc_client->error())) {
auto client = std::move(it->second.rpc_client);
@@ -450,12 +441,8 @@ std::unique_ptr<messaging_service::rpc_protocol_wrapper>& messaging_service::rpc
// Send a message for verb
template <typename MsgIn, typename... MsgOut>
auto send_message(messaging_service* ms, messaging_verb verb, msg_addr id, MsgOut&&... msg) {
auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
if (ms->is_stopping()) {
using futurator = futurize<std::result_of_t<decltype(rpc_handler)(rpc_protocol::client&, MsgOut...)>>;
return futurator::make_exception_future(rpc::closed_error());
}
auto rpc_client_ptr = ms->get_rpc_client(verb, id);
auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
auto& rpc_client = *rpc_client_ptr;
return rpc_handler(rpc_client, std::forward<MsgOut>(msg)...).then_wrapped([ms = ms->shared_from_this(), id, verb, rpc_client_ptr = std::move(rpc_client_ptr)] (auto&& f) {
try {
@@ -479,12 +466,8 @@ auto send_message(messaging_service* ms, messaging_verb verb, msg_addr id, MsgOu
// TODO: Remove duplicated code in send_message
template <typename MsgIn, typename Timeout, typename... MsgOut>
auto send_message_timeout(messaging_service* ms, messaging_verb verb, msg_addr id, Timeout timeout, MsgOut&&... msg) {
auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
if (ms->is_stopping()) {
using futurator = futurize<std::result_of_t<decltype(rpc_handler)(rpc_protocol::client&, MsgOut...)>>;
return futurator::make_exception_future(rpc::closed_error());
}
auto rpc_client_ptr = ms->get_rpc_client(verb, id);
auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
auto& rpc_client = *rpc_client_ptr;
return rpc_handler(rpc_client, timeout, std::forward<MsgOut>(msg)...).then_wrapped([ms = ms->shared_from_this(), id, verb, rpc_client_ptr = std::move(rpc_client_ptr)] (auto&& f) {
try {
@@ -514,33 +497,18 @@ auto send_message_timeout_and_retry(messaging_service* ms, messaging_verb verb,
return repeat_until_value([ms, verb, id, timeout, wait, nr_retry, &retry, &messages...] {
return send_message_timeout<MsgIn>(ms, verb, id, timeout, messages...).then_wrapped(
[ms, verb, id, timeout, wait, nr_retry, &retry] (auto&& f) mutable {
auto vb = int(verb);
try {
MsgInTuple ret = f.get();
if (retry != nr_retry) {
logger.info("Retry verb={} to {}, retry={}: OK", vb, id, retry);
logger.info("Retry verb={} to {}, retry={}: OK", int(verb), id, retry);
}
return make_ready_future<stdx::optional<MsgInTuple>>(std::move(ret));
} catch (rpc::timeout_error) {
logger.info("Retry verb={} to {}, retry={}: timeout in {} seconds", vb, id, retry, timeout.count());
logger.info("Retry verb={} to {}, retry={}: timeout in {} seconds", int(verb), id, retry, timeout.count());
throw;
} catch (rpc::closed_error) {
logger.info("Retry verb={} to {}, retry={}: {}", vb, id, retry, std::current_exception());
// Stop retrying if retry reaches 0 or message service is shutdown
// or the remote node is removed from gossip (on_remove())
retry--;
if (retry == 0) {
logger.debug("Retry verb={} to {}, retry={}: stop retrying: retry == 0", vb, id, retry);
throw;
}
if (ms->is_stopping()) {
logger.debug("Retry verb={} to {}, retry={}: stop retrying: messaging_service is stopped",
vb, id, retry);
throw;
}
if (!gms::get_local_gossiper().is_known_endpoint(id.addr)) {
logger.debug("Retry verb={} to {}, retry={}: stop retrying: node is removed from the cluster",
vb, id, retry);
logger.info("Retry verb={} to {}, retry={}: {}", int(verb), id, retry, std::current_exception());
if (--retry == 0 || ms->is_stopping()) {
throw;
}
return sleep(wait).then([] {
@@ -550,7 +518,7 @@ auto send_message_timeout_and_retry(messaging_service* ms, messaging_verb verb,
throw;
}
});
}).then([ms = ms->shared_from_this()] (MsgInTuple result) {
}).then([] (MsgInTuple result) {
return futurize<MsgIn>::from_tuple(std::move(result));
});
});
@@ -732,8 +700,8 @@ void messaging_service::register_read_data(std::function<future<foreign_ptr<lw_s
void messaging_service::unregister_read_data() {
_rpc->unregister_handler(net::messaging_verb::READ_DATA);
}
future<query::result> messaging_service::send_read_data(msg_addr id, clock_type::time_point timeout, const query::read_command& cmd, const query::partition_range& pr) {
return send_message_timeout<query::result>(this, messaging_verb::READ_DATA, std::move(id), timeout, cmd, pr);
future<query::result> messaging_service::send_read_data(msg_addr id, const query::read_command& cmd, const query::partition_range& pr) {
return send_message<query::result>(this, messaging_verb::READ_DATA, std::move(id), cmd, pr);
}
void messaging_service::register_get_schema_version(std::function<future<frozen_schema>(unsigned, table_schema_version)>&& func) {
@@ -762,8 +730,8 @@ void messaging_service::register_read_mutation_data(std::function<future<foreign
void messaging_service::unregister_read_mutation_data() {
_rpc->unregister_handler(net::messaging_verb::READ_MUTATION_DATA);
}
future<reconcilable_result> messaging_service::send_read_mutation_data(msg_addr id, clock_type::time_point timeout, const query::read_command& cmd, const query::partition_range& pr) {
return send_message_timeout<reconcilable_result>(this, messaging_verb::READ_MUTATION_DATA, std::move(id), timeout, cmd, pr);
future<reconcilable_result> messaging_service::send_read_mutation_data(msg_addr id, const query::read_command& cmd, const query::partition_range& pr) {
return send_message<reconcilable_result>(this, messaging_verb::READ_MUTATION_DATA, std::move(id), cmd, pr);
}
void messaging_service::register_read_digest(std::function<future<query::result_digest> (const rpc::client_info&, query::read_command cmd, query::partition_range pr)>&& func) {
@@ -772,8 +740,8 @@ void messaging_service::register_read_digest(std::function<future<query::result_
void messaging_service::unregister_read_digest() {
_rpc->unregister_handler(net::messaging_verb::READ_DIGEST);
}
future<query::result_digest> messaging_service::send_read_digest(msg_addr id, clock_type::time_point timeout, const query::read_command& cmd, const query::partition_range& pr) {
return send_message_timeout<query::result_digest>(this, net::messaging_verb::READ_DIGEST, std::move(id), timeout, cmd, pr);
future<query::result_digest> messaging_service::send_read_digest(msg_addr id, const query::read_command& cmd, const query::partition_range& pr) {
return send_message<query::result_digest>(this, net::messaging_verb::READ_DIGEST, std::move(id), cmd, pr);
}
// Wrapper for TRUNCATE

View File

@@ -277,7 +277,7 @@ public:
// Note: WTH is future<foreign_ptr<lw_shared_ptr<query::result>>>
void register_read_data(std::function<future<foreign_ptr<lw_shared_ptr<query::result>>> (const rpc::client_info&, query::read_command cmd, query::partition_range pr)>&& func);
void unregister_read_data();
future<query::result> send_read_data(msg_addr id, clock_type::time_point timeout, const query::read_command& cmd, const query::partition_range& pr);
future<query::result> send_read_data(msg_addr id, const query::read_command& cmd, const query::partition_range& pr);
// Wrapper for GET_SCHEMA_VERSION
void register_get_schema_version(std::function<future<frozen_schema>(unsigned, table_schema_version)>&& func);
@@ -292,12 +292,12 @@ public:
// Wrapper for READ_MUTATION_DATA
void register_read_mutation_data(std::function<future<foreign_ptr<lw_shared_ptr<reconcilable_result>>> (const rpc::client_info&, query::read_command cmd, query::partition_range pr)>&& func);
void unregister_read_mutation_data();
future<reconcilable_result> send_read_mutation_data(msg_addr id, clock_type::time_point timeout, const query::read_command& cmd, const query::partition_range& pr);
future<reconcilable_result> send_read_mutation_data(msg_addr id, const query::read_command& cmd, const query::partition_range& pr);
// Wrapper for READ_DIGEST
void register_read_digest(std::function<future<query::result_digest> (const rpc::client_info&, query::read_command cmd, query::partition_range pr)>&& func);
void unregister_read_digest();
future<query::result_digest> send_read_digest(msg_addr id, clock_type::time_point timeout, const query::read_command& cmd, const query::partition_range& pr);
future<query::result_digest> send_read_digest(msg_addr id, const query::read_command& cmd, const query::partition_range& pr);
// Wrapper for TRUNCATE
void register_truncate(std::function<future<>(sstring, sstring)>&& func);
@@ -329,4 +329,5 @@ inline messaging_service& get_local_messaging_service() {
return _the_messaging_service.local();
}
future<> init_messaging_service(sstring listen_address, db::seed_provider_type seed_provider);
} // namespace net

View File

@@ -126,37 +126,14 @@ bool mutation::operator!=(const mutation& m) const {
return !(*this == m);
}
void
mutation::query(query::result::builder& builder,
const query::partition_slice& slice,
gc_clock::time_point now,
uint32_t row_limit) &&
{
query::result
mutation::query(const query::partition_slice& slice, gc_clock::time_point now, uint32_t row_limit) const {
query::result::builder builder(slice);
auto pb = builder.add_partition(*schema(), key());
auto is_reversed = slice.options.contains<query::partition_slice::option::reversed>();
mutation_partition& p = partition();
p.compact_for_query(*schema(), now, slice.row_ranges(*schema(), key()), is_reversed, row_limit);
p.query_compacted(pb, *schema(), row_limit);
}
query::result
mutation::query(const query::partition_slice& slice,
query::result_request request,
gc_clock::time_point now, uint32_t row_limit) &&
{
query::result::builder builder(slice, request);
std::move(*this).query(builder, slice, now, row_limit);
partition().query(pb, *schema(), now, row_limit);
return builder.build();
}
query::result
mutation::query(const query::partition_slice& slice,
query::result_request request,
gc_clock::time_point now, uint32_t row_limit) const&
{
return mutation(*this).query(slice, request, now, row_limit);
}
size_t
mutation::live_row_count(gc_clock::time_point query_time) const {
return partition().live_row_count(*schema(), query_time);
@@ -207,7 +184,3 @@ void mutation::apply(mutation&& m) {
void mutation::apply(const mutation& m) {
partition().apply(*schema(), m.partition(), *m.schema());
}
mutation& mutation::operator=(const mutation& m) {
return *this = mutation(m);
}

View File

@@ -60,9 +60,9 @@ public:
mutation(const mutation& m)
: _ptr(std::make_unique<data>(schema_ptr(m.schema()), dht::decorated_key(m.decorated_key()), m.partition()))
{ }
mutation(mutation&&) = default;
mutation& operator=(mutation&& x) = default;
mutation& operator=(const mutation& m);
void set_static_cell(const column_definition& def, atomic_cell_or_collection&& value);
void set_static_cell(const bytes& name, const data_value& value, api::timestamp_type timestamp, ttl_opt ttl = {});
@@ -104,23 +104,7 @@ public:
bool operator!=(const mutation&) const;
public:
// The supplied partition_slice must be governed by this mutation's schema
query::result query(const query::partition_slice&,
query::result_request request = query::result_request::only_result,
gc_clock::time_point now = gc_clock::now(),
uint32_t row_limit = query::max_rows) &&;
// The supplied partition_slice must be governed by this mutation's schema
// FIXME: Slower than the r-value version
query::result query(const query::partition_slice&,
query::result_request request = query::result_request::only_result,
gc_clock::time_point now = gc_clock::now(),
uint32_t row_limit = query::max_rows) const&;
// The supplied partition_slice must be governed by this mutation's schema
void query(query::result::builder& builder,
const query::partition_slice& slice,
gc_clock::time_point now = gc_clock::now(),
uint32_t row_limit = query::max_rows) &&;
query::result query(const query::partition_slice&, gc_clock::time_point now = gc_clock::now(), uint32_t row_limit = query::max_rows) const;
// See mutation_partition::live_row_count()
size_t live_row_count(gc_clock::time_point query_time = gc_clock::time_point::min()) const;

View File

@@ -20,14 +20,11 @@
*/
#include <boost/range/adaptor/reversed.hpp>
#include <seastar/util/defer.hh>
#include "mutation_partition.hh"
#include "mutation_partition_applier.hh"
#include "converting_mutation_partition_applier.hh"
#include "partition_builder.hh"
#include "query-result-writer.hh"
#include "atomic_cell_hash.hh"
#include "reversibly_mergeable.hh"
template<bool reversed>
struct reversal_traits;
@@ -59,11 +56,6 @@ struct reversal_traits<false> {
{
return r;
}
template <typename Container>
static typename Container::iterator maybe_reverse(Container&, typename Container::iterator r) {
return r;
}
};
template<>
@@ -96,116 +88,8 @@ struct reversal_traits<true> {
using reverse_iterator = typename Container::reverse_iterator;
return boost::make_iterator_range(reverse_iterator(r.end()), reverse_iterator(r.begin()));
}
template <typename Container>
static typename Container::reverse_iterator maybe_reverse(Container&, typename Container::iterator r) {
return typename Container::reverse_iterator(r);
}
};
//
// apply_reversibly_intrusive_set() and revert_intrusive_set() implement ReversiblyMergeable
// for a boost::intrusive_set<> container of ReversiblyMergeable entries.
//
// See reversibly_mergeable.hh
//
// Requirements:
// - entry has distinct key and value states
// - entries are ordered only by key in the container
// - entry can have an empty value
// - presence of an entry with an empty value doesn't affect equality of the containers
// - E::empty() returns true iff the value is empty
// - E(e.key()) creates an entry with empty value but the same key as that of e.
//
// Implementation of ReversiblyMergeable for the entry's value is provided via Apply and Revert functors.
//
// ReversiblyMergeable is constructed assuming the following properties of the 'apply' operation
// on containers:
//
// apply([{k1, v1}], [{k1, v2}]) = [{k1, apply(v1, v2)}]
// apply([{k1, v1}], [{k2, v2}]) = [{k1, v1}, {k2, v2}]
//
// revert for apply_reversibly_intrusive_set()
template<typename Container, typename Revert = default_reverter<typename Container::value_type>>
void revert_intrusive_set_range(Container& dst, Container& src,
typename Container::iterator start,
typename Container::iterator end,
Revert&& revert = Revert()) noexcept
{
using value_type = typename Container::value_type;
auto deleter = current_deleter<value_type>();
while (start != end) {
auto& e = *start;
// lower_bound() can allocate if linearization is required but it should have
// been already performed by the lower_bound() invocation in apply_reversibly_intrusive_set() and
// stored in the linearization context.
auto i = dst.find(e);
assert(i != dst.end());
value_type& dst_e = *i;
if (e.empty()) {
dst.erase(i);
start = src.erase_and_dispose(start, deleter);
start = src.insert_before(start, dst_e);
} else {
revert(dst_e, e);
}
++start;
}
}
template<typename Container, typename Revert = default_reverter<typename Container::value_type>>
void revert_intrusive_set(Container& dst, Container& src, Revert&& revert = Revert()) noexcept {
revert_intrusive_set_range(dst, src, src.begin(), src.end(), std::forward<Revert>(revert));
}
// Applies src onto dst. See comment above revert_intrusive_set_range() for more details.
//
// Returns an object which upon going out of scope, unless cancel() is called on it,
// reverts the application by calling revert_intrusive_set(). The references to containers
// must be stable as long as the returned object is live.
template<typename Container,
typename Apply = default_reversible_applier<typename Container::value_type>,
typename Revert = default_reverter<typename Container::value_type>>
auto apply_reversibly_intrusive_set(Container& dst, Container& src, Apply&& apply = Apply(), Revert&& revert = Revert()) {
using value_type = typename Container::value_type;
auto src_i = src.begin();
try {
while (src_i != src.end()) {
value_type& src_e = *src_i;
// neutral entries will be given special meaning for the purpose of revert, so
// get rid of empty rows from the input as if they were not there. This doesn't change
// the value of src.
if (src_e.empty()) {
src_i = src.erase_and_dispose(src_i, current_deleter<value_type>());
continue;
}
auto i = dst.lower_bound(src_e);
if (i == dst.end() || dst.key_comp()(src_e, *i)) {
// Construct neutral entry which will represent missing dst entry for revert.
value_type* empty_e = current_allocator().construct<value_type>(src_e.key());
[&] () noexcept {
src_i = src.erase(src_i);
src_i = src.insert_before(src_i, *empty_e);
dst.insert_before(i, src_e);
}();
} else {
apply(*i, src_e);
}
++src_i;
}
return defer([&dst, &src, revert] { revert_intrusive_set(dst, src, revert); });
} catch (...) {
revert_intrusive_set_range(dst, src, src.begin(), src_i, revert);
throw;
}
}
mutation_partition::mutation_partition(const mutation_partition& x)
: _tombstone(x._tombstone)
, _static_row(x._static_row)
@@ -249,12 +133,29 @@ mutation_partition::apply(const schema& s, const mutation_partition& p, const sc
if (s.version() != p_schema.version()) {
auto p2 = p;
p2.upgrade(p_schema, s);
apply(s, std::move(p2));
apply(s, std::move(p2), s);
return;
}
mutation_partition tmp(p);
apply(s, std::move(tmp));
_tombstone.apply(p._tombstone);
for (auto&& e : p._row_tombstones) {
apply_row_tombstone(s, e.prefix(), e.t());
}
_static_row.merge(s, column_kind::static_column, p._static_row);
for (auto&& entry : p._rows) {
auto i = _rows.find(entry);
if (i == _rows.end()) {
auto e = current_allocator().construct<rows_entry>(entry);
_rows.insert(i, *e);
} else {
i->row().apply(entry.row().deleted_at());
i->row().apply(entry.row().marker());
i->row().cells().merge(s, column_kind::regular_column, entry.row().cells());
}
}
}
void
@@ -265,42 +166,42 @@ mutation_partition::apply(const schema& s, mutation_partition&& p, const schema&
return;
}
apply(s, std::move(p));
}
_tombstone.apply(p._tombstone);
void
mutation_partition::apply(const schema& s, mutation_partition&& p) {
auto revert_row_tombstones = apply_reversibly_intrusive_set(_row_tombstones, p._row_tombstones);
_static_row.apply_reversibly(s, column_kind::static_column, p._static_row);
auto revert_static_row = defer([&] {
_static_row.revert(s, column_kind::static_column, p._static_row);
p._row_tombstones.clear_and_dispose([this, &s] (row_tombstones_entry* e) {
apply_row_tombstone(s, e);
});
auto revert_rows = apply_reversibly_intrusive_set(_rows, p._rows,
[&s] (rows_entry& dst, rows_entry& src) { dst.apply_reversibly(s, src); },
[&s] (rows_entry& dst, rows_entry& src) noexcept { dst.revert(s, src); });
_static_row.merge(s, column_kind::static_column, std::move(p._static_row));
_tombstone.apply(p._tombstone); // noexcept
revert_rows.cancel();
revert_row_tombstones.cancel();
revert_static_row.cancel();
auto p_i = p._rows.begin();
auto p_end = p._rows.end();
while (p_i != p_end) {
rows_entry& entry = *p_i;
auto i = _rows.find(entry);
if (i == _rows.end()) {
p_i = p._rows.erase(p_i);
_rows.insert(i, entry);
} else {
i->row().apply(entry.row().deleted_at());
i->row().apply(entry.row().marker());
i->row().cells().merge(s, column_kind::regular_column, std::move(entry.row().cells()));
p_i = p._rows.erase_and_dispose(p_i, current_deleter<rows_entry>());
}
}
}
void
mutation_partition::apply(const schema& s, mutation_partition_view p, const schema& p_schema) {
if (p_schema.version() == s.version()) {
mutation_partition p2(*this, copy_comparators_only{});
partition_builder b(s, p2);
p.accept(s, b);
apply(s, std::move(p2));
mutation_partition_applier applier(s, *this);
p.accept(s, applier);
} else {
mutation_partition p2(*this, copy_comparators_only{});
partition_builder b(p_schema, p2);
p.accept(p_schema, b);
p2.upgrade(p_schema, s);
apply(s, std::move(p2));
apply(s, std::move(p2), s);
}
}
@@ -448,25 +349,16 @@ mutation_partition::clustered_row(const schema& s, const clustering_key_view& ke
return i->row();
}
mutation_partition::rows_type::const_iterator
mutation_partition::lower_bound(const schema& schema, const query::range<clustering_key_prefix>& r) const {
auto cmp = rows_entry::key_comparator(clustering_key_prefix::prefix_equality_less_compare(schema));
return r.start() ? (r.start()->is_inclusive()
? _rows.lower_bound(r.start()->value(), cmp)
: _rows.upper_bound(r.start()->value(), cmp)) : _rows.cbegin();
}
mutation_partition::rows_type::const_iterator
mutation_partition::upper_bound(const schema& schema, const query::range<clustering_key_prefix>& r) const {
auto cmp = rows_entry::key_comparator(clustering_key_prefix::prefix_equality_less_compare(schema));
return r.end() ? (r.end()->is_inclusive()
? _rows.upper_bound(r.end()->value(), cmp)
: _rows.lower_bound(r.end()->value(), cmp)) : _rows.cend();
}
boost::iterator_range<mutation_partition::rows_type::const_iterator>
mutation_partition::range(const schema& schema, const query::range<clustering_key_prefix>& r) const {
return boost::make_iterator_range(lower_bound(schema, r), upper_bound(schema, r));
auto cmp = rows_entry::key_comparator(clustering_key_prefix::prefix_equality_less_compare(schema));
auto i1 = r.start() ? (r.start()->is_inclusive()
? _rows.lower_bound(r.start()->value(), cmp)
: _rows.upper_bound(r.start()->value(), cmp)) : _rows.cbegin();
auto i2 = r.end() ? (r.end()->is_inclusive()
? _rows.upper_bound(r.end()->value(), cmp)
: _rows.lower_bound(r.end()->value(), cmp)) : _rows.cend();
return boost::make_iterator_range(i1, i2);
}
template <typename Container>
@@ -478,27 +370,11 @@ unconst(Container& c, boost::iterator_range<typename Container::const_iterator>
);
}
template <typename Container>
typename Container::iterator
unconst(Container& c, typename Container::const_iterator i) {
return c.erase(i, i);
}
boost::iterator_range<mutation_partition::rows_type::iterator>
mutation_partition::range(const schema& schema, const query::range<clustering_key_prefix>& r) {
return unconst(_rows, static_cast<const mutation_partition*>(this)->range(schema, r));
}
mutation_partition::rows_type::iterator
mutation_partition::lower_bound(const schema& schema, const query::range<clustering_key_prefix>& r) {
return unconst(_rows, static_cast<const mutation_partition*>(this)->lower_bound(schema, r));
}
mutation_partition::rows_type::iterator
mutation_partition::upper_bound(const schema& schema, const query::range<clustering_key_prefix>& r) {
return unconst(_rows, static_cast<const mutation_partition*>(this)->upper_bound(schema, r));
}
template<typename Func>
void mutation_partition::for_each_row(const schema& schema, const query::range<clustering_key_prefix>& row_range, bool reversed, Func&& func) const
{
@@ -551,33 +427,14 @@ void write_cell(RowWriter& w, const query::partition_slice& slice, const data_ty
.end_qr_cell();
}
static void hash_row_slice(md5_hasher& hasher,
const schema& s,
column_kind kind,
const row& cells,
const std::vector<column_id>& columns)
{
for (auto id : columns) {
const atomic_cell_or_collection* cell = cells.find_cell(id);
if (!cell) {
continue;
}
feed_hash(hasher, id);
auto&& def = s.column_at(kind, id);
if (def.is_atomic()) {
feed_hash(hasher, cell->as_atomic_cell());
} else {
feed_hash(hasher, cell->as_collection_mutation());
}
}
}
template<typename RowWriter>
static void get_compacted_row_slice(const schema& s,
static void get_row_slice(const schema& s,
const query::partition_slice& slice,
column_kind kind,
const row& cells,
const std::vector<column_id>& columns,
tombstone tomb,
gc_clock::time_point now,
RowWriter& writer)
{
for (auto id : columns) {
@@ -588,7 +445,7 @@ static void get_compacted_row_slice(const schema& s,
auto&& def = s.column_at(kind, id);
if (def.is_atomic()) {
auto c = cell->as_atomic_cell();
if (!c.is_live()) {
if (!c.is_live(tomb, now)) {
writer.add().skip();
} else {
write_cell(writer, slice, cell->as_atomic_cell());
@@ -596,18 +453,21 @@ static void get_compacted_row_slice(const schema& s,
} else {
auto&& mut = cell->as_collection_mutation();
auto&& ctype = static_pointer_cast<const collection_type_impl>(def.type);
if (!ctype->is_any_live(mut)) {
auto m_view = ctype->deserialize_mutation_form(mut);
m_view.tomb.apply(tomb);
// FIXME: Instead of this, write optimistically and retract if empty
auto m_ser = ctype->serialize_mutation_form_only_live(m_view, now);
if (ctype->is_empty(m_ser)) {
writer.add().skip();
} else {
write_cell(writer, slice, def.type, mut);
write_cell(writer, slice, def.type, m_ser);
}
}
}
}
}
bool has_any_live_data(const schema& s, column_kind kind, const row& cells, tombstone tomb = tombstone(),
gc_clock::time_point now = gc_clock::time_point::min()) {
bool has_any_live_data(const schema& s, column_kind kind, const row& cells, tombstone tomb, gc_clock::time_point now) {
bool any_live = false;
cells.for_each_cell_until([&] (column_id id, const atomic_cell_or_collection& cell_or_collection) {
const column_definition& def = s.column_at(kind, id);
@@ -630,32 +490,24 @@ bool has_any_live_data(const schema& s, column_kind kind, const row& cells, tomb
return any_live;
}
static bool has_ck_selector(const query::clustering_row_ranges& ranges) {
// Like a PK range, an empty row range should be considered an "exclude all" restriction
return ranges.empty() || std::any_of(ranges.begin(), ranges.end(), [](auto& r) {
return !r.is_full();
});
}
void
mutation_partition::query_compacted(query::result::partition_writer& pw, const schema& s, uint32_t limit) const {
uint32_t
mutation_partition::query(query::result::partition_writer& pw,
const schema& s,
gc_clock::time_point now,
uint32_t limit) const
{
const query::partition_slice& slice = pw.slice();
if (limit == 0) {
pw.retract();
return;
return 0;
}
auto static_cells_wr = pw.start().start_static_row().start_cells();
if (!slice.static_columns.empty()) {
if (pw.requested_result()) {
get_compacted_row_slice(s, slice, column_kind::static_column, static_row(), slice.static_columns, static_cells_wr);
}
if (pw.requested_digest()) {
::feed_hash(pw.digest(), partition_tombstone());
hash_row_slice(pw.digest(), s, column_kind::static_column, static_row(), slice.static_columns);
}
get_row_slice(s, slice, column_kind::static_column, static_row(), slice.static_columns, partition_tombstone(),
now, static_cells_wr);
}
auto rows_wr = std::move(static_cells_wr).end_cells()
@@ -664,37 +516,37 @@ mutation_partition::query_compacted(query::result::partition_writer& pw, const s
uint32_t row_count = 0;
// Like a PK range, an empty row range should be considered an "exclude all" restriction
bool has_ck_selector = pw.ranges().empty();
auto is_reversed = slice.options.contains(query::partition_slice::option::reversed);
auto send_ck = slice.options.contains(query::partition_slice::option::send_clustering_key);
for_each_row(s, query::clustering_range::make_open_ended_both_sides(), is_reversed, [&] (const rows_entry& e) {
auto& row = e.row();
auto row_tombstone = tombstone_for_row(s, e);
if (pw.requested_digest()) {
e.key().feed_hash(pw.digest(), s);
::feed_hash(pw.digest(), row_tombstone);
hash_row_slice(pw.digest(), s, column_kind::regular_column, row.cells(), slice.regular_columns);
for (auto&& row_range : pw.ranges()) {
if (limit == 0) {
break;
}
if (row.is_live(s)) {
if (pw.requested_result()) {
auto cells_wr = [&] {
if (send_ck) {
return rows_wr.add().write_key(e.key()).start_cells().start_cells();
} else {
return rows_wr.add().skip_key().start_cells().start_cells();
}
}();
get_compacted_row_slice(s, slice, column_kind::regular_column, row.cells(), slice.regular_columns, cells_wr);
has_ck_selector |= !row_range.is_full();
// FIXME: Optimize for a full-tuple singular range. mutation_partition::range()
// does two lookups to form a range, even for singular range. We need
// only one lookup for a full-tuple singular range though.
for_each_row(s, row_range, is_reversed, [&] (const rows_entry& e) {
auto& row = e.row();
auto row_tombstone = tombstone_for_row(s, e);
if (row.is_live(s, row_tombstone, now)) {
auto cells_wr = rows_wr.add().write_key(e.key()).start_cells().start_cells();
get_row_slice(s, slice, column_kind::regular_column, row.cells(), slice.regular_columns, row_tombstone,
now, cells_wr);
std::move(cells_wr).end_cells().end_cells().end_qr_clustered_row();
++row_count;
if (--limit == 0) {
return stop_iteration::yes;
}
}
++row_count;
if (--limit == 0) {
return stop_iteration::yes;
}
}
return stop_iteration::no;
});
return stop_iteration::no;
});
}
// If we got no rows, but have live static columns, we should only
// give them back IFF we did not have any CK restrictions.
@@ -702,11 +554,17 @@ mutation_partition::query_compacted(query::result::partition_writer& pw, const s
// If ck:s exist, and we do a restriction on them, we either have matching
// rows, or return nothing, since cql does not allow "is null".
if (row_count == 0
&& (has_ck_selector(pw.ranges())
|| !has_any_live_data(s, column_kind::static_column, static_row()))) {
&& (has_ck_selector
|| !has_any_live_data(s, column_kind::static_column,
static_row(), _tombstone, now))) {
pw.retract();
return 0;
} else {
std::move(rows_wr).end_rows().end_qr_partition();
// The partition is live. If there are no clustered rows, there
// must be something live in the static row, which counts as one row.
return std::max<uint32_t>(row_count, 1);
}
}
@@ -766,7 +624,7 @@ operator<<(std::ostream& os, const mutation_partition& mp) {
constexpr gc_clock::duration row_marker::no_ttl;
constexpr gc_clock::duration row_marker::dead;
int compare_row_marker_for_merge(const row_marker& left, const row_marker& right) noexcept {
int compare_row_marker_for_merge(const row_marker& left, const row_marker& right) {
if (left.timestamp() != right.timestamp()) {
return left.timestamp() > right.timestamp() ? 1 : -1;
}
@@ -802,18 +660,6 @@ deletable_row::equal(column_kind kind, const schema& s, const deletable_row& oth
return _cells.equal(kind, s, other._cells, other_schema);
}
void deletable_row::apply_reversibly(const schema& s, deletable_row& src) {
_cells.apply_reversibly(s, column_kind::regular_column, src._cells);
_deleted_at.apply_reversibly(src._deleted_at); // noexcept
_marker.apply_reversibly(src._marker); // noexcept
}
void deletable_row::revert(const schema& s, deletable_row& src) {
_cells.revert(s, column_kind::regular_column, src._cells);
_deleted_at.revert(src._deleted_at);
_marker.revert(src._marker);
}
bool
rows_entry::equal(const schema& s, const rows_entry& other) const {
return equal(s, other, s);
@@ -858,123 +704,42 @@ bool mutation_partition::equal(const schema& this_schema, const mutation_partiti
}
void
apply_reversibly(const column_definition& def, atomic_cell_or_collection& dst, atomic_cell_or_collection& src) {
merge_column(const column_definition& def,
atomic_cell_or_collection& old,
atomic_cell_or_collection&& neww) {
// Must be run via with_linearized_managed_bytes() context, but assume it is
// provided via an upper layer
if (def.is_atomic()) {
auto&& src_ac = src.as_atomic_cell_ref();
if (compare_atomic_cell_for_merge(dst.as_atomic_cell(), src.as_atomic_cell()) < 0) {
std::swap(dst, src);
src_ac.set_revert(true);
} else {
src_ac.set_revert(false);
if (compare_atomic_cell_for_merge(old.as_atomic_cell(), neww.as_atomic_cell()) < 0) {
old = std::move(neww);
}
} else {
auto ct = static_pointer_cast<const collection_type_impl>(def.type);
src = ct->merge(dst.as_collection_mutation(), src.as_collection_mutation());
std::swap(dst, src);
}
}
void
revert(const column_definition& def, atomic_cell_or_collection& dst, atomic_cell_or_collection& src) noexcept {
static_assert(std::is_nothrow_move_constructible<atomic_cell_or_collection>::value
&& std::is_nothrow_move_assignable<atomic_cell_or_collection>::value,
"for std::swap() to be noexcept");
if (def.is_atomic()) {
auto&& ac = src.as_atomic_cell_ref();
if (ac.is_revert_set()) {
ac.set_revert(false);
std::swap(dst, src);
}
} else {
std::swap(dst, src);
old = ct->merge(old.as_collection_mutation(), neww.as_collection_mutation());
}
}
void
row::apply(const column_definition& column, const atomic_cell_or_collection& value) {
// FIXME: Optimize
atomic_cell_or_collection tmp(value);
apply(column, std::move(tmp));
}
void
row::apply(const column_definition& column, atomic_cell_or_collection&& value) {
apply_reversibly(column, value);
}
template<typename Func, typename Rollback>
void row::for_each_cell(Func&& func, Rollback&& rollback) {
static_assert(noexcept(rollback(std::declval<column_id>(), std::declval<atomic_cell_or_collection&>())),
"rollback must be noexcept");
if (_type == storage_type::vector) {
unsigned i = 0;
try {
for (; i < _storage.vector.v.size(); i++) {
if (_storage.vector.present.test(i)) {
func(i, _storage.vector.v[i]);
}
}
} catch (...) {
while (i) {
--i;
if (_storage.vector.present.test(i)) {
rollback(i, _storage.vector.v[i]);
}
}
throw;
}
} else {
auto i = _storage.set.begin();
try {
while (i != _storage.set.end()) {
func(i->id(), i->cell());
++i;
}
} catch (...) {
while (i != _storage.set.begin()) {
--i;
rollback(i->id(), i->cell());
}
throw;
}
}
}
template<typename Func>
void row::for_each_cell(Func&& func) {
if (_type == storage_type::vector) {
for (auto i : bitsets::for_each_set(_storage.vector.present)) {
func(i, _storage.vector.v[i]);
}
} else {
for (auto& cell : _storage.set) {
func(cell.id(), cell.cell());
}
}
}
void
row::apply_reversibly(const column_definition& column, atomic_cell_or_collection& value) {
static_assert(std::is_nothrow_move_constructible<atomic_cell_or_collection>::value
&& std::is_nothrow_move_assignable<atomic_cell_or_collection>::value,
"noexcept required for atomicity");
// our mutations are not yet immutable
auto id = column.id;
if (_type == storage_type::vector && id < max_vector_size) {
if (id >= _storage.vector.v.size()) {
_storage.vector.v.resize(id);
_storage.vector.v.emplace_back(std::move(value));
_storage.vector.present.set(id);
if (id >= _storage.vector.size()) {
_storage.vector.resize(id);
_storage.vector.emplace_back(std::move(value));
_size++;
} else if (!bool(_storage.vector.v[id])) {
_storage.vector.v[id] = std::move(value);
_storage.vector.present.set(id);
} else if (!bool(_storage.vector[id])) {
_storage.vector[id] = std::move(value);
_size++;
} else {
::apply_reversibly(column, _storage.vector.v[id], value);
merge_column(column, _storage.vector[id], std::move(value));
}
} else {
if (_type == storage_type::vector) {
@@ -982,37 +747,11 @@ row::apply_reversibly(const column_definition& column, atomic_cell_or_collection
}
auto i = _storage.set.lower_bound(id, cell_entry::compare());
if (i == _storage.set.end() || i->id() != id) {
cell_entry* e = current_allocator().construct<cell_entry>(id);
std::swap(e->_cell, value);
auto e = current_allocator().construct<cell_entry>(id, std::move(value));
_storage.set.insert(i, *e);
_size++;
} else {
::apply_reversibly(column, i->cell(), value);
}
}
}
void
row::revert(const column_definition& column, atomic_cell_or_collection& src) noexcept {
auto id = column.id;
if (_type == storage_type::vector) {
auto& dst = _storage.vector.v[id];
if (!src) {
std::swap(dst, src);
_storage.vector.present.reset(id);
--_size;
} else {
::revert(column, dst, src);
}
} else {
auto i = _storage.set.find(id, cell_entry::compare());
auto& dst = i->cell();
if (!src) {
std::swap(dst, src);
_storage.set.erase_and_dispose(i, current_deleter<cell_entry>());
--_size;
} else {
::revert(column, dst, src);
merge_column(column, i->cell(), std::move(value));
}
}
}
@@ -1020,9 +759,8 @@ row::revert(const column_definition& column, atomic_cell_or_collection& src) noe
void
row::append_cell(column_id id, atomic_cell_or_collection value) {
if (_type == storage_type::vector && id < max_vector_size) {
_storage.vector.v.resize(id);
_storage.vector.v.emplace_back(std::move(value));
_storage.vector.present.set(id);
_storage.vector.resize(id);
_storage.vector.emplace_back(std::move(value));
} else {
if (_type == storage_type::vector) {
vector_to_set();
@@ -1036,10 +774,10 @@ row::append_cell(column_id id, atomic_cell_or_collection value) {
const atomic_cell_or_collection*
row::find_cell(column_id id) const {
if (_type == storage_type::vector) {
if (id >= _storage.vector.v.size() || !_storage.vector.present.test(id)) {
if (id >= _storage.vector.size() || !bool(_storage.vector[id])) {
return nullptr;
}
return &_storage.vector.v[id];
return &_storage.vector[id];
} else {
auto i = _storage.set.find(id, cell_entry::compare());
if (i == _storage.set.end()) {
@@ -1060,24 +798,15 @@ void mutation_partition::trim_rows(const schema& s,
auto last = reversal_traits<reversed>::begin(_rows);
auto deleter = current_deleter<rows_entry>();
auto range_begin = [this, &s] (const query::clustering_range& range) {
return reversed ? upper_bound(s, range) : lower_bound(s, range);
};
auto range_end = [this, &s] (const query::clustering_range& range) {
return reversed ? lower_bound(s, range) : upper_bound(s, range);
};
for (auto&& row_range : row_ranges) {
if (stop) {
break;
}
last = reversal_traits<reversed>::erase_and_dispose(_rows, last,
reversal_traits<reversed>::maybe_reverse(_rows, range_begin(row_range)), deleter);
auto it_range = reversal_traits<reversed>::maybe_reverse(_rows, range(s, row_range));
last = reversal_traits<reversed>::erase_and_dispose(_rows, last, it_range.begin(), deleter);
auto end = reversal_traits<reversed>::maybe_reverse(_rows, range_end(row_range));
while (last != end) {
while (last != it_range.end()) {
rows_entry& e = *last;
if (func(e) == stop_iteration::yes) {
stop = true;
@@ -1149,7 +878,10 @@ uint32_t mutation_partition::do_compact(const schema& s,
// #589 - Do not add extra row for statics unless we did a CK range-less query.
// See comment in query
if (row_count == 0 && static_row_live && !has_ck_selector(row_ranges)) {
if (row_count == 0 && static_row_live
&& std::any_of(row_ranges.begin(), row_ranges.end(), [](auto& r) {
return r.is_full();
})) {
++row_count;
}
@@ -1202,7 +934,7 @@ bool mutation_partition::empty() const
}
bool
deletable_row::is_live(const schema& s, tombstone base_tombstone, gc_clock::time_point query_time) const {
deletable_row::is_live(const schema& s, tombstone base_tombstone, gc_clock::time_point query_time = gc_clock::time_point::min()) const {
// _created_at corresponds to the row marker cell, present for rows
// created with the 'insert' statement. If row marker is live, we know the
// row is live. Otherwise, a row is considered live if it has any cell
@@ -1259,7 +991,7 @@ row::row(const row& o)
, _size(o._size)
{
if (_type == storage_type::vector) {
new (&_storage.vector) vector_storage(o._storage.vector);
new (&_storage.vector) vector_type(o._storage.vector);
} else {
auto cloner = [] (const auto& x) {
return current_allocator().construct<std::remove_const_t<std::remove_reference_t<decltype(x)>>>(x);
@@ -1276,14 +1008,14 @@ row::row(const row& o)
row::~row() {
if (_type == storage_type::vector) {
_storage.vector.~vector_storage();
_storage.vector.~vector_type();
} else {
_storage.set.clear_and_dispose(current_deleter<cell_entry>());
_storage.set.~map_type();
}
}
row::cell_entry::cell_entry(const cell_entry& o)
row::cell_entry::cell_entry(const cell_entry& o) noexcept
: _id(o._id)
, _cell(o._cell)
{ }
@@ -1310,20 +1042,15 @@ void row::vector_to_set()
{
assert(_type == storage_type::vector);
map_type set;
try {
for (auto i : bitsets::for_each_set(_storage.vector.present)) {
auto& c = _storage.vector.v[i];
for (unsigned i = 0; i < _storage.vector.size(); i++) {
auto& c = _storage.vector[i];
if (!bool(c)) {
continue;
}
auto e = current_allocator().construct<cell_entry>(i, std::move(c));
set.insert(set.end(), *e);
}
} catch (...) {
set.clear_and_dispose([this, del = current_deleter<cell_entry>()] (cell_entry* ce) noexcept {
_storage.vector.v[ce->id()] = std::move(ce->cell());
del(ce);
});
throw;
}
_storage.vector.~vector_storage();
_storage.vector.~vector_type();
new (&_storage.set) map_type(std::move(set));
_type = storage_type::set;
}
@@ -1334,7 +1061,7 @@ void row::reserve(column_id last_column)
if (last_column >= max_vector_size) {
vector_to_set();
} else {
_storage.vector.v.reserve(last_column);
_storage.vector.reserve(last_column);
}
}
}
@@ -1387,13 +1114,13 @@ bool row::equal(column_kind kind, const schema& this_schema, const row& other, c
}
row::row() {
new (&_storage.vector) vector_storage;
new (&_storage.vector) vector_type;
}
row::row(row&& other)
: _type(other._type), _size(other._size) {
if (_type == storage_type::vector) {
new (&_storage.vector) vector_storage(std::move(other._storage.vector));
new (&_storage.vector) vector_type(std::move(other._storage.vector));
} else {
new (&_storage.set) map_type(std::move(other._storage.set));
}
@@ -1407,25 +1134,27 @@ row& row::operator=(row&& other) {
return *this;
}
void row::apply_reversibly(const schema& s, column_kind kind, row& other) {
if (other.empty()) {
return;
}
void row::merge(const schema& s, column_kind kind, const row& other) {
if (other._type == storage_type::vector) {
reserve(other._storage.vector.v.size() - 1);
reserve(other._storage.vector.size() - 1);
} else {
reserve(other._storage.set.rbegin()->id());
}
other.for_each_cell([&] (column_id id, atomic_cell_or_collection& cell) {
apply_reversibly(s.column_at(kind, id), cell);
}, [&] (column_id id, atomic_cell_or_collection& cell) noexcept {
revert(s.column_at(kind, id), cell);
other.for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
apply(s.column_at(kind, id), cell);
});
}
void row::revert(const schema& s, column_kind kind, row& other) noexcept {
other.for_each_cell([&] (column_id id, atomic_cell_or_collection& cell) noexcept {
revert(s.column_at(kind, id), cell);
void row::merge(const schema& s, column_kind kind, row&& other) {
if (other._type == storage_type::vector) {
reserve(other._storage.vector.size() - 1);
} else {
reserve(other._storage.set.rbegin()->id());
}
// FIXME: Optimize when 'other' is a set. We could move whole entries, not only cells.
other.for_each_cell_until([&] (column_id id, atomic_cell_or_collection& cell) {
apply(s.column_at(kind, id), std::move(cell));
return stop_iteration::no;
});
}
@@ -1576,15 +1305,3 @@ mutation_partition::upgrade(const schema& old_schema, const schema& new_schema)
accept(old_schema, v);
*this = std::move(tmp);
}
void row_marker::apply_reversibly(row_marker& rm) noexcept {
if (compare_row_marker_for_merge(*this, rm) < 0) {
std::swap(*this, rm);
} else {
rm = *this;
}
}
void row_marker::revert(row_marker& rm) noexcept {
std::swap(*this, rm);
}

@@ -28,8 +28,6 @@
#include <boost/range/adaptor/indexed.hpp>
#include <boost/range/adaptor/filtered.hpp>
#include <seastar/core/bitset-iter.hh>
#include "schema.hh"
#include "tombstone.hh"
#include "keys.hh"
@@ -60,11 +58,8 @@ class row {
: _id(id)
, _cell(std::move(cell))
{ }
cell_entry(column_id id)
: _id(id)
{ }
cell_entry(cell_entry&&) noexcept;
cell_entry(const cell_entry&);
cell_entry(const cell_entry&) noexcept;
column_id id() const { return _id; }
const atomic_cell_or_collection& cell() const { return _cell; }
@@ -101,16 +96,11 @@ public:
private:
using vector_type = managed_vector<atomic_cell_or_collection, internal_count, size_type>;
struct vector_storage {
std::bitset<max_vector_size> present;
vector_type v;
};
union storage {
storage() { }
~storage() { }
map_type set;
vector_storage vector;
vector_type vector;
} _storage;
public:
row();
@@ -119,7 +109,6 @@ public:
row(row&& other);
row& operator=(row&& other);
size_t size() const { return _size; }
bool empty() const { return _size == 0; }
void reserve(column_id);
@@ -131,14 +120,13 @@ private:
template<typename Func>
void remove_if(Func&& func) {
if (_type == storage_type::vector) {
for (unsigned i = 0; i < _storage.vector.v.size(); i++) {
if (!_storage.vector.present.test(i)) {
for (unsigned i = 0; i < _storage.vector.size(); i++) {
auto& c = _storage.vector[i];
if (!bool(c)) {
continue;
}
auto& c = _storage.vector.v[i];
if (func(i, c)) {
c = atomic_cell_or_collection();
_storage.vector.present.reset(i);
_size--;
}
}
@@ -158,12 +146,11 @@ private:
private:
auto get_range_vector() const {
auto id_range = boost::irange<column_id>(0, _storage.vector.v.size());
return boost::combine(id_range, _storage.vector.v)
| boost::adaptors::filtered([this] (const boost::tuple<const column_id&, const atomic_cell_or_collection&>& t) {
return _storage.vector.present.test(t.get<0>());
}) | boost::adaptors::transformed([] (const boost::tuple<const column_id&, const atomic_cell_or_collection&>& t) {
return std::pair<column_id, const atomic_cell_or_collection&>(t.get<0>(), t.get<1>());
auto range = boost::make_iterator_range(_storage.vector.begin(), _storage.vector.end());
return range | boost::adaptors::filtered([] (const atomic_cell_or_collection& c) { return bool(c); })
| boost::adaptors::transformed([this] (const atomic_cell_or_collection& c) {
auto id = &c - _storage.vector.data();
return std::pair<column_id, const atomic_cell_or_collection&>(id, std::cref(c));
});
}
auto get_range_set() const {
@@ -176,23 +163,7 @@ private:
auto with_both_ranges(const row& other, Func&& func) const;
void vector_to_set();
// Calls Func(column_id, atomic_cell_or_collection&) for each cell in this row.
//
// Func() is allowed to modify the cell. Emptying a cell makes it still
// visible to for_each().
//
// In case of exception, calls Rollback(column_id, atomic_cell_or_collection&) on
// all cells on which Func() was successfully invoked in reverse order.
//
template<typename Func, typename Rollback>
void for_each_cell(Func&&, Rollback&&);
public:
// Calls Func(column_id, atomic_cell_or_collection&) for each cell in this row.
// noexcept if Func doesn't throw.
template<typename Func>
void for_each_cell(Func&&);
template<typename Func>
void for_each_cell(Func&& func) const {
for_each_cell_until([func = std::forward<Func>(func)] (column_id id, const atomic_cell_or_collection& c) {
@@ -204,8 +175,11 @@ public:
template<typename Func>
void for_each_cell_until(Func&& func) const {
if (_type == storage_type::vector) {
for (auto i : bitsets::for_each_set(_storage.vector.present)) {
auto& cell = _storage.vector.v[i];
for (unsigned i = 0; i < _storage.vector.size(); i++) {
auto& cell = _storage.vector[i];
if (!bool(cell)) {
continue;
}
if (func(i, cell) == stop_iteration::yes) {
break;
}
@@ -213,7 +187,29 @@ public:
} else {
for (auto& cell : _storage.set) {
const auto& c = cell.cell();
if (func(cell.id(), c) == stop_iteration::yes) {
if (c && func(cell.id(), c) == stop_iteration::yes) {
break;
}
}
}
}
template<typename Func>
void for_each_cell_until(Func&& func) {
if (_type == storage_type::vector) {
for (unsigned i = 0; i < _storage.vector.size(); i++) {
auto& cell = _storage.vector[i];
if (!bool(cell)) {
continue;
}
if (func(i, cell) == stop_iteration::yes) {
break;
}
}
} else {
for (auto& cell : _storage.set) {
auto& c = cell.cell();
if (c && func(cell.id(), c) == stop_iteration::yes) {
break;
}
}
@@ -226,26 +222,21 @@ public:
//
// Merges cell's value into the row.
//
// In case of exception the current object is left with a value equivalent to the original state.
//
// The external cell is left in a valid state, such that it will commute with
// current object to the same value had the exception not occurred.
// In case of exception the current object and external object (moved-from)
// are both left in some valid states, such that they still will commute to
// a state the current object would have had the exception not occurred.
//
void apply(const column_definition& column, atomic_cell_or_collection&& cell);
// Equivalent to calling apply_reversibly() with a row containing only given cell.
// See reversibly_mergeable.hh
void apply_reversibly(const column_definition& column, atomic_cell_or_collection& cell);
// See reversibly_mergeable.hh
void revert(const column_definition& column, atomic_cell_or_collection& cell) noexcept;
// Adds cell to the row. The column must not be already set.
void append_cell(column_id id, atomic_cell_or_collection cell);
// See reversibly_mergeable.hh
void apply_reversibly(const schema&, column_kind, row& src);
// See reversibly_mergeable.hh
void revert(const schema&, column_kind, row& src) noexcept;
void merge(const schema& s, column_kind kind, const row& other);
// In case of exception the current object and external object (moved-from)
// are both left in some valid states, such that they still will commute to
// a state the current object would have had the exception not occurred.
void merge(const schema& s, column_kind kind, row&& other);
// Expires cells based on query_time. Expires tombstones based on gc_before
// and max_purgeable. Removes cells covered by tomb.
@@ -267,7 +258,7 @@ public:
std::ostream& operator<<(std::ostream& os, const std::pair<column_id, const atomic_cell_or_collection&>& c);
class row_marker;
int compare_row_marker_for_merge(const row_marker& left, const row_marker& right) noexcept;
int compare_row_marker_for_merge(const row_marker& left, const row_marker& right);
class row_marker {
static constexpr gc_clock::duration no_ttl { 0 };
@@ -330,10 +321,6 @@ public:
*this = rm;
}
}
// See reversibly_mergeable.hh
void apply_reversibly(row_marker& rm) noexcept;
// See reversibly_mergeable.hh
void revert(row_marker& rm) noexcept;
// Expires cells and tombstones. Removes items covered by higher level
// tombstones.
// Returns true if row marker is live.
@@ -411,11 +398,6 @@ public:
void remove_tombstone() {
_deleted_at = tombstone();
}
// See reversibly_mergeable.hh
void apply_reversibly(const schema& s, deletable_row& src);
// See reversibly_mergeable.hh
void revert(const schema& s, deletable_row& src);
public:
tombstone deleted_at() const { return _deleted_at; }
api::timestamp_type created_at() const { return _marker.timestamp(); }
@@ -425,7 +407,7 @@ public:
row& cells() { return _cells; }
friend std::ostream& operator<<(std::ostream& os, const deletable_row& dr);
bool equal(column_kind, const schema& s, const deletable_row& other, const schema& other_schema) const;
bool is_live(const schema& s, tombstone base_tombstone = tombstone(), gc_clock::time_point query_time = gc_clock::time_point::min()) const;
bool is_live(const schema& s, tombstone base_tombstone, gc_clock::time_point query_time) const;
bool empty() const { return !_deleted_at && _marker.is_missing() && !_cells.size(); }
deletable_row difference(const schema&, column_kind, const deletable_row& other) const;
};
@@ -440,9 +422,6 @@ public:
: _prefix(std::move(prefix))
, _t(std::move(t))
{ }
row_tombstones_entry(const clustering_key_prefix& prefix)
: _prefix(prefix)
{ }
row_tombstones_entry(row_tombstones_entry&& o) noexcept;
row_tombstones_entry(const row_tombstones_entry&) = default;
clustering_key_prefix& prefix() {
@@ -451,9 +430,6 @@ public:
const clustering_key_prefix& prefix() const {
return _prefix;
}
const clustering_key_prefix& key() const {
return _prefix;
}
tombstone& t() {
return _t;
}
@@ -463,14 +439,6 @@ public:
void apply(tombstone t) {
_t.apply(t);
}
// See reversibly_mergeable.hh
void apply_reversibly(row_tombstones_entry& e) {
_t.apply_reversibly(e._t);
}
// See reversibly_mergeable.hh
void revert(row_tombstones_entry& e) noexcept {
_t.revert(e._t);
}
struct compare {
clustering_key_prefix::less_compare _c;
compare(const schema& s) : _c(s) {}
@@ -504,9 +472,6 @@ public:
friend std::ostream& operator<<(std::ostream& os, const row_tombstones_entry& rte);
bool equal(const schema& s, const row_tombstones_entry& other) const;
bool empty() const {
return !_t;
}
};
class rows_entry {
@@ -547,14 +512,6 @@ public:
void apply(tombstone t) {
_row.apply(t);
}
// See reversibly_mergeable.hh
void apply_reversibly(const schema& s, rows_entry& e) {
_row.apply_reversibly(s, e._row);
}
// See reversibly_mergeable.hh
void revert(const schema& s, rows_entry& e) noexcept {
_row.revert(s, e._row);
}
bool empty() const {
return _row.empty();
}
@@ -613,8 +570,8 @@ class mutation_partition final {
using row_tombstones_type = boost::intrusive::set<row_tombstones_entry,
boost::intrusive::member_hook<row_tombstones_entry, boost::intrusive::set_member_hook<>, &row_tombstones_entry::_link>,
boost::intrusive::compare<row_tombstones_entry::compare>>;
friend class rows_entry;
friend class row_tombstones_entry;
friend rows_entry;
friend row_tombstones_entry;
friend class size_calculator;
private:
tombstone _tombstone;
@@ -669,21 +626,19 @@ public:
// Commutative when this_schema == p_schema. If schemas differ, data in p which
// is not representable in this_schema is dropped, thus apply() loses commutativity.
//
// Strong exception guarantees.
// Basic exception guarantees. If apply() throws after being called in
// some entry state p0, the object is left in some consistent state p1 and
// it's possible that p1 != p0 + p. It holds though that p1 + p = p0 + p.
//
// FIXME: make stronger exception guarantees (p1 = p0).
void apply(const schema& this_schema, const mutation_partition& p, const schema& p_schema);
//
// Applies p to current object.
// Same guarantees as for apply(const schema&, const mutation_partition&).
//
// Commutative when this_schema == p_schema. If schemas differ, data in p which
// is not representable in this_schema is dropped, thus apply() loses commutativity.
//
// If exception is thrown, this object will be left in a state equivalent to the entry state
// and p will be left in a state which will commute with current object to the same value
// had the exception not occurred.
// In case of exception the current object and external object (moved-from)
// are both left in some valid states, such that they still will commute to
// a state the current object would have had the exception not occurred.
void apply(const schema& this_schema, mutation_partition&& p, const schema& p_schema);
// Use in case this instance and p share the same schema.
// Same guarantees as apply(const schema&, mutation_partition&&, const schema&);
void apply(const schema& s, mutation_partition&& p);
// Same guarantees and constraints as for apply(const schema&, const mutation_partition&, const schema&).
void apply(const schema& this_schema, mutation_partition_view p, const schema& p_schema);
@@ -762,16 +717,9 @@ public:
tombstone tombstone_for_row(const schema& schema, const clustering_key& key) const;
tombstone tombstone_for_row(const schema& schema, const rows_entry& e) const;
boost::iterator_range<rows_type::const_iterator> range(const schema& schema, const query::range<clustering_key_prefix>& r) const;
rows_type::const_iterator lower_bound(const schema& schema, const query::range<clustering_key_prefix>& r) const;
rows_type::const_iterator upper_bound(const schema& schema, const query::range<clustering_key_prefix>& r) const;
rows_type::iterator lower_bound(const schema& schema, const query::range<clustering_key_prefix>& r);
rows_type::iterator upper_bound(const schema& schema, const query::range<clustering_key_prefix>& r);
boost::iterator_range<rows_type::iterator> range(const schema& schema, const query::range<clustering_key_prefix>& r);
// Writes this partition using supplied query result writer.
// The partition should be first compacted with compact_for_query(), otherwise
// results may include data which is deleted/expired.
// At most row_limit CQL rows will be written and digested.
void query_compacted(query::result::partition_writer& pw, const schema& s, uint32_t row_limit) const;
// Returns the number of live CQL rows written. No more than limit.
uint32_t query(query::result::partition_writer& pw, const schema& s, gc_clock::time_point now, uint32_t limit = query::max_rows) const;
void accept(const schema&, mutation_partition_visitor&) const;
// Returns the number of live CQL rows in this partition.

@@ -55,98 +55,77 @@ bool reconcilable_result::operator!=(const reconcilable_result& other) const {
query::result
to_data_query_result(const reconcilable_result& r, schema_ptr s, const query::partition_slice& slice) {
query::result::builder builder(slice, query::result_request::only_result);
query::result::builder builder(slice);
for (const partition& p : r.partitions()) {
p.mut().unfreeze(s).query(builder, slice, gc_clock::time_point::min(), query::max_rows);
auto pb = builder.add_partition(*s, p._m.key(*s));
p.mut().unfreeze(s).partition().query(pb, *s, gc_clock::time_point::min(), query::max_rows);
}
return builder.build();
}
querying_reader::querying_reader(schema_ptr s,
const mutation_source& source,
const query::partition_range& range,
const query::partition_slice& slice,
uint32_t row_limit,
gc_clock::time_point query_time,
std::function<void(uint32_t, mutation&&)> consumer)
: _schema(std::move(s))
, _range(range)
, _slice(slice)
, _requested_limit(row_limit)
, _query_time(query_time)
, _limit(row_limit)
, _source(source)
, _consumer(std::move(consumer))
{ }
future<> querying_reader::read() {
_reader = _source(_schema, _range, service::get_local_sstable_query_read_priority());
return consume(*_reader, [this](mutation&& m) {
// FIXME: Make data sources respect row_ranges so that we don't have to filter them out here.
auto is_distinct = _slice.options.contains(query::partition_slice::option::distinct);
auto is_reversed = _slice.options.contains(query::partition_slice::option::reversed);
auto limit = !is_distinct ? _limit : 1;
auto rows_left = m.partition().compact_for_query(*m.schema(), _query_time,
_slice.row_ranges(*m.schema(), m.key()),
is_reversed, limit);
_limit -= rows_left;
if (rows_left || !m.partition().empty()) {
// NOTE: We must return all columns, regardless of what's in
// partition_slice, for the results to be reconcilable with tombstones.
// That's because row's presence depends on existence of any
// column in a row (See mutation_partition::query). We could
// optimize this case and only send cell timestamps, without data,
// for the cells which are not queried for (TODO).
_consumer(rows_left, std::move(m));
}
return _limit ? stop_iteration::no : stop_iteration::yes;
});
}
class reconcilable_result_builder {
querying_reader _reader;
std::vector<partition> _result;
uint32_t _total = 0;
public:
reconcilable_result_builder(schema_ptr s,
const mutation_source& source,
const query::partition_range& range,
const query::partition_slice& slice,
uint32_t row_limit,
gc_clock::time_point query_time)
: _reader(std::move(s), source, range, slice, row_limit, query_time, [this] (uint32_t live_rows, mutation&& m) {
_result.emplace_back(partition{live_rows, freeze(m)});
_total += live_rows;
})
{ }
reconcilable_result_builder(reconcilable_result_builder&&) = delete; // this captured
future<reconcilable_result> build() {
return _reader.read().then([this] {
return make_ready_future<reconcilable_result>(reconcilable_result(_total, std::move(_result)));
});
}
};
future<reconcilable_result>
mutation_query(schema_ptr s,
const mutation_source& source,
const query::partition_range& range,
const query::partition_slice& slice,
uint32_t row_limit,
gc_clock::time_point query_time)
const mutation_source& source,
const query::partition_range& range,
const query::partition_slice& slice,
uint32_t row_limit,
gc_clock::time_point query_time)
{
struct query_state {
const query::partition_range& range;
const query::partition_slice& slice;
uint32_t requested_limit;
gc_clock::time_point query_time;
uint32_t limit;
mutation_reader reader;
std::vector<partition> result;
query_state(
const query::partition_range& range,
const query::partition_slice& slice,
uint32_t requested_limit,
gc_clock::time_point query_time
)
: range(range)
, slice(slice)
, requested_limit(requested_limit)
, query_time(query_time)
, limit(requested_limit)
{ }
};
if (row_limit == 0) {
return make_ready_future<reconcilable_result>(reconcilable_result());
}
auto b_ptr = std::make_unique<reconcilable_result_builder>(std::move(s), source, range, slice, row_limit, query_time);
auto& b = *b_ptr;
return b.build().finally([keep = std::move(b_ptr)] {});
return do_with(query_state(range, slice, row_limit, query_time),
[&source, s = std::move(s)] (query_state& state) -> future<reconcilable_result> {
state.reader = source(std::move(s), state.range, service::get_local_sstable_query_read_priority());
return consume(state.reader, [&state] (mutation&& m) {
// FIXME: Make data sources respect row_ranges so that we don't have to filter them out here.
auto is_distinct = state.slice.options.contains(query::partition_slice::option::distinct);
auto is_reversed = state.slice.options.contains(query::partition_slice::option::reversed);
auto limit = !is_distinct ? state.limit : 1;
auto rows_left = m.partition().compact_for_query(*m.schema(), state.query_time, state.slice.row_ranges(*m.schema(), m.key()),
is_reversed, limit);
state.limit -= rows_left;
if (rows_left || !m.partition().empty()) {
// NOTE: We must return all columns, regardless of what's in
// partition_slice, for the results to be reconcilable with tombstones.
// That's because row's presence depends on existence of any
// column in a row (See mutation_partition::query). We could
// optimize this case and only send cell timestamps, without data,
// for the cells which are not queried for (TODO).
state.result.emplace_back(partition{rows_left, freeze(m)});
}
return state.limit ? stop_iteration::no : stop_iteration::yes;
}).then([&state] {
return make_ready_future<reconcilable_result>(
reconcilable_result(state.requested_limit - state.limit, std::move(state.result)));
});
});
}
std::ostream& operator<<(std::ostream& out, const reconcilable_result::printer& pr) {

@@ -114,26 +114,3 @@ future<reconcilable_result> mutation_query(
const query::partition_slice& slice,
uint32_t row_limit,
gc_clock::time_point query_time);
class querying_reader {
schema_ptr _schema;
const query::partition_range& _range;
const query::partition_slice& _slice;
uint32_t _requested_limit;
gc_clock::time_point _query_time;
uint32_t _limit;
const mutation_source& _source;
std::function<void(uint32_t, mutation&&)> _consumer;
std::experimental::optional<mutation_reader> _reader;
public:
querying_reader(schema_ptr s,
const mutation_source& source,
const query::partition_range& range,
const query::partition_slice& slice,
uint32_t row_limit,
gc_clock::time_point query_time,
std::function<void(uint32_t, mutation&&)> consumer);
future<> read();
};

@@ -127,15 +127,3 @@ partition_slice_builder::reversed() {
_options.set<query::partition_slice::option::reversed>();
return *this;
}
partition_slice_builder&
partition_slice_builder::without_partition_key_columns() {
_options.remove<query::partition_slice::option::send_partition_key>();
return *this;
}
partition_slice_builder&
partition_slice_builder::without_clustering_key_columns() {
_options.remove<query::partition_slice::option::send_clustering_key>();
return *this;
}

@@ -50,8 +50,6 @@ public:
partition_slice_builder& with_regular_column(bytes name);
partition_slice_builder& with_no_regular_columns();
partition_slice_builder& with_range(query::clustering_range range);
partition_slice_builder& without_partition_key_columns();
partition_slice_builder& without_clustering_key_columns();
partition_slice_builder& reversed();
query::partition_slice build();

@@ -201,7 +201,7 @@ result_set::from_raw_result(schema_ptr s, const partition_slice& slice, const re
result_set::result_set(const mutation& m) : result_set([&m] {
auto slice = partition_slice_builder(*m.schema()).build();
auto qr = mutation(m).query(slice, result_request::only_result);
auto qr = m.query(slice);
return result_set::from_raw_result(m.schema(), slice, qr);
}())
{ }

@@ -83,7 +83,6 @@ public:
}
throw null_column_value(column_name);
}
const std::unordered_map<sstring, data_value>& cells() const { return _cells; }
friend inline bool operator==(const result_set_row& x, const result_set_row& y);
friend inline bool operator!=(const result_set_row& x, const result_set_row& y);
friend std::ostream& operator<<(std::ostream& out, const result_set_row& row);

@@ -37,8 +37,8 @@
namespace query {
class result::partition_writer {
result_request _request;
ser::after_qr_partition__key _w;
const partition_slice& _slice;
// We are tasked with keeping track of the range
@@ -48,35 +48,20 @@ class result::partition_writer {
ser::query_result__partitions& _pw;
ser::vector_position _pos;
bool _static_row_added = false;
md5_hasher& _digest;
md5_hasher _digest_pos;
public:
partition_writer(
result_request request,
const partition_slice& slice,
const clustering_row_ranges& ranges,
ser::query_result__partitions& pw,
ser::vector_position pos,
ser::after_qr_partition__key w,
md5_hasher& digest)
: _request(request)
, _w(std::move(w))
ser::after_qr_partition__key w)
: _w(std::move(w))
, _slice(slice)
, _ranges(ranges)
, _pw(pw)
, _pos(std::move(pos))
, _digest(digest)
, _digest_pos(digest)
{ }
bool requested_digest() const {
return _request != result_request::only_result;
}
bool requested_result() const {
return _request != result_request::only_digest;
}
ser::after_qr_partition__key start() {
return std::move(_w);
}
@@ -85,7 +70,6 @@ public:
// Can be called at any stage of writing before this element is finalized.
// Do not use this writer after that.
void retract() {
_digest = _digest_pos;
_pw.rollback(_pos);
}
@@ -95,22 +79,16 @@ public:
const partition_slice& slice() const {
return _slice;
}
md5_hasher& digest() {
return _digest;
}
};
class result::builder {
bytes_ostream _out;
md5_hasher _digest;
const partition_slice& _slice;
ser::query_result__partitions _w;
result_request _request;
public:
builder(const partition_slice& slice, result_request request)
builder(const partition_slice& slice)
: _slice(slice)
, _w(ser::writer_of_query_result(_out).start_partitions())
, _request(request)
{ }
builder(builder&&) = delete; // _out is captured by reference
@@ -127,26 +105,12 @@ public:
return std::move(pw).skip_key();
}
}();
if (_request != result_request::only_result) {
key.feed_hash(_digest, s);
}
return partition_writer(_request, _slice, ranges, _w, std::move(pos), std::move(after_key), _digest);
return partition_writer(_slice, ranges, _w, std::move(pos), std::move(after_key));
}
result build() {
std::move(_w).end_partitions().end_query_result();
switch (_request) {
case result_request::only_result:
return result(std::move(_out));
case result_request::only_digest: {
bytes_ostream buf;
ser::writer_of_query_result(buf).start_partitions().end_partitions().end_query_result();
return result(std::move(buf), result_digest(_digest.finalize_array()));
}
case result_request::result_and_digest:
return result(std::move(_out), result_digest(_digest.finalize_array()));
}
abort();
return result(std::move(_out));
}
};

@@ -25,19 +25,9 @@
#include <cryptopp/md5.h>
#include "bytes_ostream.hh"
#include "query-request.hh"
#include "md5_hasher.hh"
#include <experimental/optional>
namespace stdx = std::experimental;
namespace query {
enum class result_request {
only_result,
only_digest,
result_and_digest,
};
class result_digest {
public:
static_assert(16 == CryptoPP::Weak::MD5::DIGESTSIZE, "MD5 digest size is all wrong");
@@ -95,7 +85,6 @@ public:
class result {
bytes_ostream _w;
stdx::optional<result_digest> _digest;
public:
class builder;
class partition_writer;
@@ -103,7 +92,6 @@ public:
result();
result(bytes_ostream&& w) : _w(std::move(w)) {}
result(bytes_ostream&& w, stdx::optional<result_digest> d) : _w(std::move(w)), _digest(d) {}
result(result&&) = default;
result(const result&) = default;
result& operator=(result&&) = default;
@@ -113,8 +101,12 @@ public:
return _w;
}
const stdx::optional<result_digest>& digest() const {
return _digest;
result_digest digest() {
CryptoPP::Weak::MD5 hash;
result_digest::type digest;
bytes_view v = _w.linearize();
hash.CalculateDigest(reinterpret_cast<unsigned char*>(digest.data()), reinterpret_cast<const unsigned char*>(v.begin()), v.size());
return result_digest(std::move(digest));
}
sstring pretty_print(schema_ptr, const query::partition_slice&) const;
};

@@ -139,17 +139,7 @@ to_partition_range(query::range<dht::token> r) {
sstring
result::pretty_print(schema_ptr s, const query::partition_slice& slice) const {
std::ostringstream out;
out << "{ result: " << result_set::from_raw_result(s, slice, *this);
out << " digest: ";
if (_digest) {
out << std::hex << std::setw(2);
for (auto&& c : _digest->get()) {
out << unsigned(c) << " ";
}
} else {
out << "{}";
}
out << " }";
out << "{" << result_set::from_raw_result(s, slice, *this) << "}";
return out.str();
}

@@ -33,7 +33,6 @@
#include <boost/algorithm/string/classification.hpp>
#include <cryptopp/sha.h>
#include <seastar/core/gate.hh>
static logging::logger logger("repair");
@@ -327,7 +326,7 @@ static future<partition_checksum> checksum_range_shard(database &db,
const ::range<dht::token>& range) {
auto& cf = db.find_column_family(keyspace_name, cf_name);
return do_with(query::to_partition_range(range), [&cf] (const auto& partition_range) {
return do_with(cf.make_reader(cf.schema(), partition_range, service::get_local_streaming_read_priority()), partition_checksum(),
return do_with(cf.make_reader(cf.schema(), partition_range, service::get_local_mutation_stream_priority()), partition_checksum(),
[] (auto& reader, auto& checksum) {
return repeat([&reader, &checksum] () {
return reader().then([&checksum] (auto mopt) {
@@ -416,21 +415,6 @@ static void split_and_add(std::vector<::range<dht::token>>& ranges,
ranges.push_back(halves.first);
ranges.push_back(halves.second);
}
// We don't need to wait for one checksum to finish before we start the
// next, but doing too many of these operations in parallel also doesn't
// make sense, so we limit the number of concurrent ongoing checksum
// requests with a semaphore.
//
// FIXME: We shouldn't use a magic number here, but rather bind it to
// some resource. Otherwise we'll be doing too little in some machines,
// and too much in others.
//
// FIXME: This would be better off in a repair service, or even a per-shard
// repair instance holding all repair state. However, since we are anyway
// considering ditching those semaphores for a more fine grained resource-based
// solution, let's do the simplest thing here and change it later
constexpr int parallelism = 100;
static thread_local semaphore parallelism_semaphore(parallelism);
// Repair a single cf in a single local range.
// Comparable to RepairJob in Origin.
@@ -477,14 +461,17 @@ static future<> repair_cf_range(seastar::sharded<database>& db,
split_and_add(ranges, range, estimated_partitions, 100);
}
return do_with(seastar::gate(), true, std::move(keyspace), std::move(cf), std::move(ranges),
[&db, &neighbors] (auto& completion, auto& success, const auto& keyspace, const auto& cf, const auto& ranges) {
return do_for_each(ranges, [&completion, &success, &db, &neighbors, &keyspace, &cf]
// We don't need to wait for one checksum to finish before we start the
// next, but doing too many of these operations in parallel also doesn't
// make sense, so we limit the number of concurrent ongoing checksum
// requests with a semaphore.
constexpr int parallelism = 100;
return do_with(semaphore(parallelism), true, std::move(keyspace), std::move(cf), std::move(ranges),
[&db, &neighbors, parallelism] (auto& sem, auto& success, const auto& keyspace, const auto& cf, const auto& ranges) {
return do_for_each(ranges, [&sem, &success, &db, &neighbors, &keyspace, &cf]
(const auto& range) {
check_in_shutdown();
return parallelism_semaphore.wait(1).then([&completion, &success, &db, &neighbors, &keyspace, &cf, &range] {
return sem.wait(1).then([&sem, &success, &db, &neighbors, &keyspace, &cf, &range] {
// Ask this node, and all neighbors, to calculate checksums in
// this range. When all are done, compare the results, and if
// there are any differences, sync the content of this range.
@@ -496,8 +483,6 @@ static future<> repair_cf_range(seastar::sharded<database>& db,
net::get_local_messaging_service().send_repair_checksum_range(
net::msg_addr{neighbor},keyspace, cf, range));
}
completion.enter();
when_all(checksums.begin(), checksums.end()).then(
[&db, &keyspace, &cf, &range, &neighbors, &success]
(std::vector<future<partition_checksum>> checksums) {
@@ -543,13 +528,10 @@ static future<> repair_cf_range(seastar::sharded<database>& db,
// tell the caller.
success = false;
logger.warn("Failed sync of range {}: {}", range, eptr);
}).finally([&completion] {
parallelism_semaphore.signal(1);
completion.leave(); // notify do_for_each that we're done
});
}).finally([&sem] { sem.signal(1); });
});
}).finally([&success, &completion] {
return completion.close().then([&success] {
}).finally([&sem, &success, parallelism] {
return sem.wait(parallelism).then([&success] {
return success ? make_ready_future<>() :
make_exception_future<>(std::runtime_error("Checksum or sync of partial range failed"));
});
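The comment block above explains the concurrency cap that replaced the gate: a counting semaphore limits how many checksum requests are in flight, and re-acquiring all units at the end serves as "wait for everything to finish". Below is a standalone sketch of that pattern in plain C++ threads rather than Seastar fibers; the `semaphore` class and `run_tasks` function are illustrative stand-ins, not Scylla code.

```cpp
#include <condition_variable>
#include <mutex>
#include <thread>
#include <vector>

// Minimal counting semaphore with the same wait(n)/signal(n) shape as the
// Seastar semaphore used in repair_cf_range() above.
class semaphore {
    std::mutex _m;
    std::condition_variable _cv;
    int _count;
public:
    explicit semaphore(int count) : _count(count) {}
    void wait(int n) {
        std::unique_lock<std::mutex> lk(_m);
        _cv.wait(lk, [&] { return _count >= n; });
        _count -= n;
    }
    void signal(int n) {
        { std::lock_guard<std::mutex> lk(_m); _count += n; }
        _cv.notify_all();
    }
};

int run_tasks(int n_tasks, int parallelism) {
    semaphore sem(parallelism);
    std::mutex done_m;
    int completed = 0;
    std::vector<std::thread> workers;
    for (int i = 0; i < n_tasks; ++i) {
        sem.wait(1);                  // block while `parallelism` tasks are in flight
        workers.emplace_back([&] {
            {   // stand-in for the per-range checksum work
                std::lock_guard<std::mutex> lk(done_m);
                ++completed;
            }
            sem.signal(1);            // release our unit when done
        });
    }
    for (auto& t : workers) { t.join(); }
    sem.wait(parallelism);            // all units back => every task completed
    return completed;
}
```

The final `sem.wait(parallelism)` mirrors the diff's `finally` clause: once all units can be reacquired, no task can still be running.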

@@ -1,69 +0,0 @@
/*
* Copyright (C) 2016 Cloudius Systems, Ltd.
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "utils/allocation_strategy.hh"
#include <seastar/util/defer.hh>
//
// ~~ Definitions ~~
//
// Mergeable type is a type which has an associated "apply" binary operation (T x T -> T)
// which forms a commutative semigroup with instances of that type.
//
// ReversiblyMergeable type is a Mergeable type which has two binary operations associated,
// "apply_reversibly" and "revert", both working on objects of that type (T x T -> T x T)
// with the following properties:
//
// apply_reversibly(x, y) = (x', y')
// revert(x', y') = (x'', y'')
//
// x' = apply(x, y)
// x'' = x
// apply(x'', y'') = apply(x, y)
//
// Note that it is not guaranteed that y'' = y and the state of y' is unspecified.
//
// ~~ API ~~
//
// "apply_reversibly" and "revert" are usually implemented as instance methods or functions
// mutating both arguments to store the result of the operation in them.
//
// "revert" is not allowed to throw. If "apply_reversibly" throws the objects on which it operates
// are left in valid states, with guarantees the same as if a successful apply_reversibly() was
// followed by revert().
//
template<typename T>
struct default_reversible_applier {
void operator()(T& dst, T& src) const {
dst.apply_reversibly(src);
}
};
template<typename T>
struct default_reverter {
void operator()(T& dst, T& src) const noexcept {
dst.revert(src);
}
};
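A minimal sketch of a type satisfying the ReversiblyMergeable contract documented above. `counter` is a hypothetical example, not part of the codebase: `apply_reversibly()` merges `src` into `dst`, and `revert()` undoes it without throwing; as the contract allows, `src`'s state after the round trip is unspecified.

```cpp
// Hypothetical ReversiblyMergeable type: "apply" is addition, which is
// commutative and associative as the semigroup definition requires.
struct counter {
    long value = 0;

    // x' = apply(x, y); src is left holding exactly the delta that was
    // applied, which is all revert() needs to restore dst.
    void apply_reversibly(counter& src) {
        value += src.value;
    }

    // Restores dst to its pre-apply value. Must not throw.
    void revert(counter& src) noexcept {
        value -= src.value;
    }
};
```

This type works with the `default_reversible_applier` and `default_reverter` adaptors above, since it exposes the expected member functions.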

@@ -203,15 +203,12 @@ future<> schema_registry_entry::maybe_sync(std::function<future<>()> syncer) {
return make_ready_future<>();
case schema_registry_entry::sync_state::SYNCING:
return _synced_future;
case schema_registry_entry::sync_state::NOT_SYNCED: {
case schema_registry_entry::sync_state::NOT_SYNCED:
logger.debug("Syncing {}", _version);
_synced_promise = {};
auto f = do_with(std::move(syncer), [] (auto& syncer) {
do_with(std::move(syncer), [] (auto& syncer) {
return syncer();
});
_synced_future = _synced_promise.get_future();
_sync_state = schema_registry_entry::sync_state::SYNCING;
f.then_wrapped([this, self = shared_from_this()] (auto&& f) {
}).then_wrapped([this, self = shared_from_this()] (auto&& f) {
if (_sync_state != sync_state::SYNCING) {
return;
}
@@ -225,8 +222,9 @@ future<> schema_registry_entry::maybe_sync(std::function<future<>()> syncer) {
_synced_promise.set_value();
}
});
_synced_future = _synced_promise.get_future();
_sync_state = schema_registry_entry::sync_state::SYNCING;
return _synced_future;
}
default:
assert(0);
}
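The hunk above fixes `maybe_sync()` so that `_synced_future` and the SYNCING state are installed before the syncer's continuation can observe them. A standalone sketch of that NOT_SYNCED → SYNCING → SYNCED state machine, using `std::future` and a detached thread instead of Seastar futures; all names here are illustrative.

```cpp
#include <future>
#include <mutex>
#include <thread>

enum class sync_state { NOT_SYNCED, SYNCING, SYNCED };

class registry_entry {
    std::mutex _m;
    sync_state _state = sync_state::NOT_SYNCED;
    std::shared_future<void> _synced_future;
public:
    // Concurrent callers share one future; the syncer runs at most once.
    template <typename Syncer>
    std::shared_future<void> maybe_sync(Syncer syncer) {
        std::lock_guard<std::mutex> lk(_m);
        if (_state == sync_state::SYNCED) {
            std::promise<void> ready;
            ready.set_value();
            return ready.get_future().share();
        }
        if (_state == sync_state::SYNCING) {
            return _synced_future;     // join the in-flight sync
        }
        // NOT_SYNCED: publish the future and state *first*, mirroring the
        // ordering fix above, then start the work.
        std::promise<void> p;
        _synced_future = p.get_future().share();
        _state = sync_state::SYNCING;
        std::thread([this, p = std::move(p), syncer]() mutable {
            syncer();
            {
                std::lock_guard<std::mutex> lk2(_m);
                _state = sync_state::SYNCED;
            }
            p.set_value();
        }).detach();
        return _synced_future;
    }
};
```

A second `maybe_sync()` call issued after the first future resolves sees SYNCED and returns a ready future without invoking its syncer again.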

@@ -36,29 +36,14 @@ done
. /etc/os-release
if [ "$ID" = "ubuntu" ]; then
echo "#!/bin/sh" >> /usr/sbin/policy-rc.d
echo "exit 101" >> /usr/sbin/policy-rc.d
chmod +x /usr/sbin/policy-rc.d
cp /etc/hosts /etc/hosts.orig
echo 127.0.0.1 `hostname` >> /etc/hosts
if [ $UNSTABLE -eq 0 ]; then
echo "deb http://s3.amazonaws.com/downloads.scylladb.com/deb/ubuntu trusty/scylladb multiverse" > /etc/apt/sources.list.d/scylla.list
else
echo "deb https://s3.amazonaws.com/downloads.scylladb.com/deb/unstable/ubuntu/master/latest trusty/scylladb multiverse" > /etc/apt/sources.list.d/scylla.list
fi
apt-get update
if [ "$LOCAL_PKG" = "" ]; then
echo "deb http://s3.amazonaws.com/downloads.scylladb.com/deb/ubuntu trusty/scylladb multiverse" > /etc/apt/sources.list.d/scylla.list
apt-get update
apt-get install -y --force-yes scylla-server scylla-jmx scylla-tools
else
if [ ! -f /usr/bin/gdebi ]; then
apt-get install -y --force-yes gdebi-core
fi
echo Y | gdebi $LOCAL_PKG/scylla-server*.deb
echo Y | gdebi $LOCAL_PKG/scylla-jmx*.deb
echo Y | gdebi $LOCAL_PKG/scylla-tools*.deb
apt-get install -y --force-yes gdebi-core
gdebi $LOCAL_PKG/scylla-server*.deb $LOCAL_PKG/scylla-jmx*.deb $LOCAL_PKG/scylla-tools*.deb
fi
mv /etc/hosts.orig /etc/hosts
rm /usr/sbin/policy-rc.d
else
if [ "$ID" = "fedora" ]; then
if [ $UNSTABLE -eq 0 ]; then

Submodule seastar updated: 0225940222...0739576bd6

@@ -94,9 +94,6 @@ void migration_manager::init_messaging_service()
// keep local proxy alive
});
});
ms.register_schema_check([] {
return make_ready_future<utils::UUID>(service::get_local_storage_service().db().local().get_version());
});
}
void migration_manager::uninit_messaging_service()
@@ -104,7 +101,6 @@ void migration_manager::uninit_messaging_service()
auto& ms = net::get_local_messaging_service();
ms.unregister_migration_request();
ms.unregister_definitions_update();
ms.unregister_schema_check();
}
void migration_manager::register_listener(migration_listener* listener)
@@ -686,28 +682,20 @@ public static class MigrationsSerializer implements IVersionedSerializer<Collect
//
// The endpoint is the node from which 's' originated.
//
// FIXME: Avoid the sync if the source was/is synced by schema_tables::merge_schema().
static future<> maybe_sync(const schema_ptr& s, net::messaging_service::msg_addr endpoint) {
if (s->is_synced()) {
return make_ready_future<>();
}
return s->registry_entry()->maybe_sync([s, endpoint] {
auto merge = [gs = global_schema_ptr(s), endpoint] {
schema_ptr s = gs.get();
// Serialize schema sync by always doing it on shard 0.
return smp::submit_to(0, [gs = global_schema_ptr(s), endpoint] {
schema_ptr s = gs.get();
schema_registry_entry& e = *s->registry_entry();
return e.maybe_sync([endpoint, s] {
logger.debug("Syncing schema of {}.{} (v={}) with {}", s->ks_name(), s->cf_name(), s->version(), endpoint);
return get_local_migration_manager().merge_schema_from(endpoint);
};
// Serialize schema sync by always doing it on shard 0.
if (engine().cpu_id() == 0) {
return merge();
} else {
return smp::submit_to(0, [gs = global_schema_ptr(s), endpoint, merge] {
schema_ptr s = gs.get();
schema_registry_entry& e = *s->registry_entry();
return e.maybe_sync(merge);
});
}
});
});
}

@@ -26,8 +26,7 @@ namespace service {
class priority_manager {
::io_priority_class _commitlog_priority;
::io_priority_class _mt_flush_priority;
::io_priority_class _stream_read_priority;
::io_priority_class _stream_write_priority;
::io_priority_class _mut_stream_priority;
::io_priority_class _sstable_query_read;
::io_priority_class _compaction_priority;
@@ -43,13 +42,8 @@ public:
}
const ::io_priority_class&
streaming_read_priority() {
return _stream_read_priority;
}
const ::io_priority_class&
streaming_write_priority() {
return _stream_write_priority;
mutation_stream_priority() {
return _mut_stream_priority;
}
const ::io_priority_class&
@@ -65,8 +59,7 @@ public:
priority_manager()
: _commitlog_priority(engine().register_one_priority_class("commitlog", 100))
, _mt_flush_priority(engine().register_one_priority_class("memtable_flush", 100))
, _stream_read_priority(engine().register_one_priority_class("streaming_read", 20))
, _stream_write_priority(engine().register_one_priority_class("streaming_write", 20))
, _mut_stream_priority(engine().register_one_priority_class("streaming", 100))
, _sstable_query_read(engine().register_one_priority_class("query", 100))
, _compaction_priority(engine().register_one_priority_class("compaction", 100))
@@ -85,13 +78,8 @@ get_local_memtable_flush_priority() {
}
const inline ::io_priority_class&
get_local_streaming_read_priority() {
return get_local_priority_manager().streaming_read_priority();
}
const inline ::io_priority_class&
get_local_streaming_write_priority() {
return get_local_priority_manager().streaming_write_priority();
get_local_mutation_stream_priority() {
return get_local_priority_manager().mutation_stream_priority();
}
const inline ::io_priority_class&

@@ -374,11 +374,6 @@ storage_proxy::storage_proxy(distributed<database>& db) : _db(db) {
, "queue_length", "background reads")
, scollectd::make_typed(scollectd::data_type::GAUGE, _stats.background_reads)
),
scollectd::add_polled_metric(scollectd::type_instance_id("storage_proxy"
, scollectd::per_cpu_plugin_instance
, "total_operations", "read retries")
, scollectd::make_typed(scollectd::data_type::DERIVE, _stats.read_retries)
),
scollectd::add_polled_metric(scollectd::type_instance_id("storage_proxy"
, scollectd::per_cpu_plugin_instance
, "total_operations", "write timeouts")
@@ -835,15 +830,6 @@ storage_proxy::mutate_locally(std::vector<mutation> mutations) {
});
}
future<>
storage_proxy::mutate_streaming_mutation(const schema_ptr& s, const frozen_mutation& m) {
auto shard = _db.local().shard_of(m);
return _db.invoke_on(shard, [&m, gs = global_schema_ptr(s)] (database& db) mutable -> future<> {
return db.apply_streaming_mutation(gs, m);
});
}
/**
* Helper for create_write_response_handler, shared across mutate/mutate_atomically.
* Both methods do roughly the same thing, with the latter intermixing batch log ops
@@ -885,7 +871,7 @@ storage_proxy::create_write_response_handler(const mutation& m, db::consistency_
std::partition_copy(all.begin(), all.end(), std::inserter(live_endpoints, live_endpoints.begin()), std::back_inserter(dead_endpoints),
std::bind1st(std::mem_fn(&gms::failure_detector::is_alive), &gms::get_local_failure_detector()));
db::assure_sufficient_live_nodes(cl, ks, live_endpoints, pending_endpoints);
db::assure_sufficient_live_nodes(cl, ks, live_endpoints);
return create_write_response_handler(m.schema(), ks, cl, type, freeze(m), std::move(live_endpoints), pending_endpoints, std::move(dead_endpoints));
}
@@ -1445,8 +1431,6 @@ public:
std::rethrow_exception(eptr);
} catch (rpc::closed_error&) {
return; // do not report connection closed exception, gossiper does that
} catch (rpc::timeout_error&) {
return; // do not report timeouts, the whole operation will timeout and be reported
} catch(std::exception& e) {
why = e.what();
} catch(...) {
@@ -1482,7 +1466,7 @@ public:
void add_data(gms::inet_address from, foreign_ptr<lw_shared_ptr<query::result>> result) {
if (!_timedout) {
// if only one target was queried digest_check() will be skipped so we can also skip digest calculation
_digest_results.emplace_back(_targets_count == 1 ? query::result_digest() : *result->digest());
_digest_results.emplace_back(_targets_count == 1 ? query::result_digest() : result->digest());
if (!_data_result) {
_data_result = std::move(result);
}
@@ -1546,13 +1530,7 @@ class data_read_resolver : public abstract_read_resolver {
partition par;
version(gms::inet_address from_, partition par_) : from(std::move(from_)), par(std::move(par_)) {}
};
struct mutation_and_live_row_count {
mutation mut;
size_t live_row_count;
};
using row_address = std::pair<dht::decorated_key, stdx::optional<clustering_key>>;
size_t _total_live_count = 0;
uint32_t _max_live_count = 0;
std::vector<reply> _data_results;
std::unordered_map<gms::inet_address, std::vector<mutation>> _diffs;
@@ -1565,114 +1543,6 @@ private:
return _data_results.size();
}
std::vector<row_address> get_last_rows(schema_ptr schema, const query::read_command& cmd) {
class get_last_row final : public mutation_partition_visitor {
stdx::optional<clustering_key> _last_ck;
bool _is_reversed;
public:
explicit get_last_row(bool is_reversed) : _is_reversed(is_reversed) { }
virtual void accept_partition_tombstone(tombstone) override { }
virtual void accept_static_cell(column_id, atomic_cell_view) override { }
virtual void accept_static_cell(column_id, collection_mutation_view) override { }
virtual void accept_row_tombstone(clustering_key_prefix_view, tombstone) override { }
virtual void accept_row(clustering_key_view key, tombstone, const row_marker&) override {
if (!_is_reversed || !_last_ck) {
_last_ck = clustering_key(key);
}
}
virtual void accept_row_cell(column_id id, atomic_cell_view) override { }
virtual void accept_row_cell(column_id id, collection_mutation_view) override { }
auto last_clustering_key() {
return std::move(_last_ck);
}
};
std::vector<row_address> vec;
vec.reserve(_data_results.size());
for (auto& reply : _data_results) {
const auto& result = *reply.result;
if (result.row_count() < cmd.row_limit) {
continue;
}
assert(!result.partitions().empty());
auto& p = result.partitions().back();
auto is_reversed = cmd.slice.options.contains(query::partition_slice::option::reversed);
get_last_row glr(is_reversed);
p.mut().partition().accept(*schema, glr);
vec.emplace_back(p.mut().decorated_key(*schema), std::move(glr.last_clustering_key()));
}
return vec;
}
template<typename ReconciledPartitions>
bool got_incomplete_information(schema_ptr schema, const query::read_command& cmd, uint32_t original_row_limit, const ReconciledPartitions& rp, const std::vector<row_address>& rows) const {
// We need to check whether the reconciled result contains all information from all available
// replicas. It is possible that some of the nodes have returned fewer rows (because the limit
// was set and they had some tombstones missing) than the others. In such cases we cannot just
// merge all results and return that to the client, as the replicas that returned fewer rows
// may have newer data for the rows they did not send than any other node in the cluster.
//
// This function is responsible for detecting whether such a problem may happen. We get partition
// and clustering keys of the last row that is going to be returned to the client and check if
// it is in range of rows returned by each replica that returned as many rows as it was
// asked for (if a replica returned fewer rows it means it returned everything it has).
auto is_reversed = cmd.slice.options.contains(query::partition_slice::option::reversed);
auto last_row = [&] {
auto limit = original_row_limit;
for (auto&& m_a_rc : rp) {
auto row_count = m_a_rc.live_row_count;
if (row_count < limit) {
limit -= row_count;
continue;
}
const auto& m = m_a_rc.mut;
auto mp = m.partition();
auto&& ranges = cmd.slice.row_ranges(*schema, m.key());
auto rc = mp.compact_for_query(*schema, cmd.timestamp, ranges, is_reversed, limit);
assert(rc == limit);
stdx::optional<clustering_key> ck;
if (!mp.clustered_rows().empty()) {
if (is_reversed) {
ck = mp.clustered_rows().begin()->key();
} else {
ck = mp.clustered_rows().rbegin()->key();
}
}
return std::make_pair(m.decorated_key(), ck);
}
abort();
}();
clustering_key::less_compare ck_compare(*schema);
for (auto&& row : rows) {
auto pk_compare = row.first.tri_compare(*schema, last_row.first);
if (pk_compare < 0) {
return true;
} else if (pk_compare > 0) {
continue;
}
if (!last_row.second) {
continue;
} else if (!row.second) {
return true;
}
if (is_reversed) {
if (ck_compare(*last_row.second, *row.second)) {
return true;
}
} else {
if (ck_compare(*row.second, *last_row.second)) {
return true;
}
}
}
return false;
}
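The short-read check removed above can be reduced to a toy model: plain integers stand in for (partition key, clustering key) positions. A replica that returned exactly as many rows as asked covers positions up to its last returned row; if the reconciled result's last row lies past any such replica's coverage, that replica may hold newer data for rows it never sent. This is purely illustrative, not the actual key-comparison logic.

```cpp
#include <vector>

// `reconciled_last_row` is the position of the last row the coordinator
// would return; `full_replica_last_rows` holds the last position sent by
// each replica that hit its row limit (short replicas sent everything, so
// they are excluded). Returns true when the merged result may be missing
// newer data from one of those replicas.
bool got_incomplete_information(int reconciled_last_row,
                                const std::vector<int>& full_replica_last_rows) {
    for (int replica_last : full_replica_last_rows) {
        if (replica_last < reconciled_last_row) {
            return true;   // replica stopped short of the row we would return
        }
    }
    return false;
}
```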
public:
data_read_resolver(db::consistency_level cl, size_t targets_count, std::chrono::steady_clock::time_point timeout) : abstract_read_resolver(cl, targets_count, timeout) {
_data_results.reserve(targets_count);
@@ -1690,12 +1560,10 @@ public:
uint32_t max_live_count() const {
return _max_live_count;
}
stdx::optional<reconcilable_result> resolve(schema_ptr schema, const query::read_command& cmd, uint32_t original_row_limit) {
reconcilable_result resolve(schema_ptr schema) {
assert(_data_results.size());
const auto& s = *schema;
auto last_rows = get_last_rows(schema, cmd);
// return true if lh > rh
auto cmp = [&s](reply& lh, reply& rh) {
if (lh.result->partitions().size() == 0) {
@@ -1735,22 +1603,20 @@ public:
}
} while(true);
std::vector<mutation_and_live_row_count> reconciled_partitions;
std::vector<mutation> reconciled_partitions;
reconciled_partitions.reserve(versions.size());
// reconcile all versions
boost::range::transform(boost::make_iterator_range(versions.begin(), versions.end()), std::back_inserter(reconciled_partitions), [this, schema] (std::vector<version>& v) {
auto m = boost::accumulate(v, mutation(v.front().par.mut().key(*schema), schema), [this, schema] (mutation& m, const version& ver) {
return boost::accumulate(v, mutation(v.front().par.mut().key(*schema), schema), [this, schema] (mutation& m, const version& ver) {
m.partition().apply(*schema, ver.par.mut().partition(), *schema);
return std::move(m);
});
auto live_row_count = m.live_row_count();
return mutation_and_live_row_count { std::move(m), live_row_count };
});
// calculate differences
for (auto z : boost::combine(versions, reconciled_partitions)) {
const mutation& m = z.get<1>().mut;
const mutation& m = z.get<1>();
for (const version& v : z.get<0>()) {
auto diff = m.partition().difference(schema, v.par.mut().unfreeze(schema).partition());
if (!diff.empty()) {
@@ -1759,24 +1625,13 @@ public:
}
}
_total_live_count = boost::accumulate(reconciled_partitions, size_t(0), [] (size_t count, const auto& m_a_rc) {
return count + m_a_rc.live_row_count;
});
if (!_diffs.empty()) {
if (_total_live_count >= original_row_limit && got_incomplete_information(schema, cmd, original_row_limit,
reconciled_partitions | boost::adaptors::reversed, last_rows)) {
return {};
}
}
// build reconcilable_result from reconciled data
// traverse backwards since large keys are at the start
std::vector<partition> vec;
using acc_type = std::pair<uint32_t, std::reference_wrapper<std::vector<partition>>>;
acc_type acc(0, std::ref(vec));
auto r = boost::accumulate(reconciled_partitions | boost::adaptors::reversed, acc, [] (acc_type& a, const mutation_and_live_row_count& m_a_rc) {
const auto& m = m_a_rc.mut;
auto count = m_a_rc.live_row_count;
auto r = boost::accumulate(reconciled_partitions | boost::adaptors::reversed, acc, [] (acc_type& a, const mutation& m) {
auto count = m.live_row_count();
a.first += count;
a.second.get().emplace_back(partition(count, freeze(m)));
return a;
@@ -1784,12 +1639,11 @@ public:
return reconcilable_result(r.first, std::move(r.second.get()));
}
auto total_live_count() const {
return _total_live_count;
}
auto get_diffs_for_repair() {
return std::move(_diffs);
}
};
class abstract_read_executor : public enable_shared_from_this<abstract_read_executor> {
@@ -1797,7 +1651,6 @@ protected:
using targets_iterator = std::vector<gms::inet_address>::iterator;
using digest_resolver_ptr = ::shared_ptr<digest_read_resolver>;
using data_resolver_ptr = ::shared_ptr<data_read_resolver>;
using clock_type = std::chrono::steady_clock;
schema_ptr _schema;
shared_ptr<storage_proxy> _proxy;
@@ -1820,37 +1673,37 @@ public:
};
protected:
future<foreign_ptr<lw_shared_ptr<reconcilable_result>>> make_mutation_data_request(lw_shared_ptr<query::read_command> cmd, gms::inet_address ep, clock_type::time_point timeout) {
future<foreign_ptr<lw_shared_ptr<reconcilable_result>>> make_mutation_data_request(lw_shared_ptr<query::read_command> cmd, gms::inet_address ep) {
if (is_me(ep)) {
return _proxy->query_mutations_locally(_schema, cmd, _partition_range);
} else {
auto& ms = net::get_local_messaging_service();
return ms.send_read_mutation_data(net::messaging_service::msg_addr{ep, 0}, timeout, *cmd, _partition_range).then([this](reconcilable_result&& result) {
return ms.send_read_mutation_data(net::messaging_service::msg_addr{ep, 0}, *cmd, _partition_range).then([this](reconcilable_result&& result) {
return make_foreign(::make_lw_shared<reconcilable_result>(std::move(result)));
});
}
}
future<foreign_ptr<lw_shared_ptr<query::result>>> make_data_request(gms::inet_address ep, clock_type::time_point timeout) {
future<foreign_ptr<lw_shared_ptr<query::result>>> make_data_request(gms::inet_address ep) {
if (is_me(ep)) {
return _proxy->query_singular_local(_schema, _cmd, _partition_range);
} else {
auto& ms = net::get_local_messaging_service();
return ms.send_read_data(net::messaging_service::msg_addr{ep, 0}, timeout, *_cmd, _partition_range).then([this](query::result&& result) {
return ms.send_read_data(net::messaging_service::msg_addr{ep, 0}, *_cmd, _partition_range).then([this](query::result&& result) {
return make_foreign(::make_lw_shared<query::result>(std::move(result)));
});
}
}
future<query::result_digest> make_digest_request(gms::inet_address ep, clock_type::time_point timeout) {
future<query::result_digest> make_digest_request(gms::inet_address ep) {
if (is_me(ep)) {
return _proxy->query_singular_local_digest(_schema, _cmd, _partition_range);
} else {
auto& ms = net::get_local_messaging_service();
return ms.send_read_digest(net::messaging_service::msg_addr{ep, 0}, timeout, *_cmd, _partition_range);
return ms.send_read_digest(net::messaging_service::msg_addr{ep, 0}, *_cmd, _partition_range);
}
}
future<> make_mutation_data_requests(lw_shared_ptr<query::read_command> cmd, data_resolver_ptr resolver, targets_iterator begin, targets_iterator end, clock_type::time_point timeout) {
return parallel_for_each(begin, end, [this, &cmd, resolver = std::move(resolver), timeout] (gms::inet_address ep) {
return make_mutation_data_request(cmd, ep, timeout).then_wrapped([resolver, ep] (future<foreign_ptr<lw_shared_ptr<reconcilable_result>>> f) {
future<> make_mutation_data_requests(lw_shared_ptr<query::read_command> cmd, data_resolver_ptr resolver, targets_iterator begin, targets_iterator end) {
return parallel_for_each(begin, end, [this, &cmd, resolver = std::move(resolver)] (gms::inet_address ep) {
return make_mutation_data_request(cmd, ep).then_wrapped([resolver, ep] (future<foreign_ptr<lw_shared_ptr<reconcilable_result>>> f) {
try {
resolver->add_mutate_data(ep, f.get0());
} catch(...) {
@@ -1859,9 +1712,9 @@ protected:
});
});
}
future<> make_data_requests(digest_resolver_ptr resolver, targets_iterator begin, targets_iterator end, clock_type::time_point timeout) {
return parallel_for_each(begin, end, [this, resolver = std::move(resolver), timeout] (gms::inet_address ep) {
return make_data_request(ep, timeout).then_wrapped([resolver, ep] (future<foreign_ptr<lw_shared_ptr<query::result>>> f) {
future<> make_data_requests(digest_resolver_ptr resolver, targets_iterator begin, targets_iterator end) {
return parallel_for_each(begin, end, [this, resolver = std::move(resolver)] (gms::inet_address ep) {
return make_data_request(ep).then_wrapped([resolver, ep] (future<foreign_ptr<lw_shared_ptr<query::result>>> f) {
try {
resolver->add_data(ep, f.get0());
} catch(...) {
@@ -1870,9 +1723,9 @@ protected:
});
});
}
future<> make_digest_requests(digest_resolver_ptr resolver, targets_iterator begin, targets_iterator end, clock_type::time_point timeout) {
return parallel_for_each(begin, end, [this, resolver = std::move(resolver), timeout] (gms::inet_address ep) {
return make_digest_request(ep, timeout).then_wrapped([resolver, ep] (future<query::result_digest> f) {
future<> make_digest_requests(digest_resolver_ptr resolver, targets_iterator begin, targets_iterator end) {
return parallel_for_each(begin, end, [this, resolver = std::move(resolver)] (gms::inet_address ep) {
return make_digest_request(ep).then_wrapped([resolver, ep] (future<query::result_digest> f) {
try {
resolver->add_digest(ep, f.get0());
} catch(...) {
@@ -1881,10 +1734,10 @@ protected:
});
});
}
virtual future<> make_requests(digest_resolver_ptr resolver, clock_type::time_point timeout) {
virtual future<> make_requests(digest_resolver_ptr resolver) {
resolver->add_wait_targets(_targets.size());
return when_all(make_data_requests(resolver, _targets.begin(), _targets.begin() + 1, timeout),
make_digest_requests(resolver, _targets.begin() + 1, _targets.end(), timeout)).discard_result();
return when_all(make_data_requests(resolver, _targets.begin(), _targets.begin() + 1),
make_digest_requests(resolver, _targets.begin() + 1, _targets.end())).discard_result();
}
virtual void got_cl() {}
uint32_t original_row_limit() const {
@@ -1894,18 +1747,18 @@ protected:
data_resolver_ptr data_resolver = ::make_shared<data_read_resolver>(cl, _targets.size(), timeout);
auto exec = shared_from_this();
make_mutation_data_requests(cmd, data_resolver, _targets.begin(), _targets.end(), timeout).finally([exec]{});
make_mutation_data_requests(cmd, data_resolver, _targets.begin(), _targets.end()).finally([exec]{});
data_resolver->done().then_wrapped([this, exec, data_resolver, cmd = std::move(cmd), cl, timeout] (future<> f) {
try {
f.get();
auto rr_opt = data_resolver->resolve(_schema, *cmd, original_row_limit()); // reconciliation happens here
auto rr = data_resolver->resolve(_schema); // reconciliation happens here
// We generate a retry if at least one node replied with count live columns but after merge we have fewer
// than the total number of columns we are interested in (which may be < count on a retry).
// So in particular, if no host returned count live columns, we know it's not a short read.
if (rr_opt && (data_resolver->max_live_count() < cmd->row_limit || rr_opt->row_count() >= original_row_limit())) {
auto result = ::make_foreign(::make_lw_shared(to_data_query_result(std::move(*rr_opt), _schema, _cmd->slice)));
if (data_resolver->max_live_count() < cmd->row_limit || rr.row_count() >= original_row_limit()) {
auto result = ::make_foreign(::make_lw_shared(to_data_query_result(std::move(rr), _schema, _cmd->slice)));
// wait for write to complete before returning result to prevent multiple concurrent read requests to
// trigger repair multiple times and to prevent a quorum read from returning an old value, even after
// another quorum read had returned a newer value (but the newer value had not yet been sent to the other replicas)
@@ -1922,13 +1775,12 @@ protected:
}
});
} else {
_proxy->_stats.read_retries++;
_retry_cmd = make_lw_shared<query::read_command>(*cmd);
// We asked t (= _cmd->row_limit) live columns and got l (=data_resolver->total_live_count) ones.
// We asked t (= _cmd->row_limit) live columns and got l (=rr.row_count) ones.
// From that, we can estimate that on this row, for x requested
// columns, only l/t end up live after reconciliation. So for the next
// round we want to ask for x columns so that x * (l/t) == t, i.e. x = t^2/l.
_retry_cmd->row_limit = data_resolver->total_live_count() == 0 ? cmd->row_limit + 1 : ((cmd->row_limit * cmd->row_limit) / data_resolver->total_live_count()) + 1;
_retry_cmd->row_limit = rr.row_count() == 0 ? cmd->row_limit + 1 : ((cmd->row_limit * cmd->row_limit) / rr.row_count()) + 1;
reconcile(cl, timeout, _retry_cmd);
}
} catch(read_timeout_exception& ex) {
@@ -1945,12 +1797,11 @@ public:
digest_resolver_ptr digest_resolver = ::make_shared<digest_read_resolver>(_cl, _block_for, timeout);
auto exec = shared_from_this();
make_requests(digest_resolver, timeout).finally([exec]() {
make_requests(digest_resolver).finally([exec]() {
// hold on to executor until all queries are complete
});
digest_resolver->has_cl().then_wrapped([exec, digest_resolver, timeout] (future<foreign_ptr<lw_shared_ptr<query::result>>, bool> f) mutable {
bool background_repair_check = false;
digest_resolver->has_cl().then_wrapped([exec, digest_resolver, timeout] (future<foreign_ptr<lw_shared_ptr<query::result>>, bool> f) {
try {
exec->got_cl();
@@ -1960,8 +1811,26 @@ public:
if (digests_match) {
exec->_result_promise.set_value(std::move(result));
auto done = digest_resolver->done();
if (exec->_block_for < exec->_targets.size()) { // if there are more targets than needed for cl, check digest in background
background_repair_check = true;
exec->_proxy->_stats.background_reads++;
done.then_wrapped([exec, digest_resolver, timeout] (future<>&& f){
if (f.failed()) {
f.ignore_ready_future(); // ignore all exceptions except digest mismatch during the background check
} else {
if (!digest_resolver->digests_match()) {
exec->_proxy->_stats.read_repair_repaired_background++;
exec->_result_promise = promise<foreign_ptr<lw_shared_ptr<query::result>>>();
exec->reconcile(exec->_cl, timeout);
exec->_result_promise.get_future().then_wrapped([exec] (future<foreign_ptr<lw_shared_ptr<query::result>>> f) {
f.ignore_ready_future(); // ignore any failures during background repair
exec->_proxy->_stats.background_reads--;
});
} else {
exec->_proxy->_stats.background_reads--;
}
}
});
}
} else { // digest mismatch
exec->reconcile(exec->_cl, timeout);
@@ -1970,22 +1839,6 @@ public:
} catch (read_timeout_exception& ex) {
exec->_result_promise.set_exception(ex);
}
exec->_proxy->_stats.background_reads++;
digest_resolver->done().then([exec, digest_resolver, timeout, background_repair_check] () mutable {
if (background_repair_check && !digest_resolver->digests_match()) {
exec->_proxy->_stats.read_repair_repaired_background++;
exec->_result_promise = promise<foreign_ptr<lw_shared_ptr<query::result>>>();
exec->reconcile(exec->_cl, timeout);
return exec->_result_promise.get_future().discard_result();
} else {
return make_ready_future<>();
}
}).handle_exception([] (std::exception_ptr eptr) {
// ignore any failures during background repair
}).then([exec] {
exec->_proxy->_stats.background_reads--;
});
});
return _result_promise.get_future();
@@ -2001,10 +1854,10 @@ public:
class always_speculating_read_executor : public abstract_read_executor {
public:
using abstract_read_executor::abstract_read_executor;
virtual future<> make_requests(digest_resolver_ptr resolver, std::chrono::steady_clock::time_point timeout) {
virtual future<> make_requests(digest_resolver_ptr resolver) {
resolver->add_wait_targets(_targets.size());
return when_all(make_data_requests(resolver, _targets.begin(), _targets.begin() + 2, timeout),
make_digest_requests(resolver, _targets.begin() + 2, _targets.end(), timeout)).discard_result();
return when_all(make_data_requests(resolver, _targets.begin(), _targets.begin() + 2),
make_digest_requests(resolver, _targets.begin() + 2, _targets.end())).discard_result();
}
};
@@ -2013,19 +1866,19 @@ class speculating_read_executor : public abstract_read_executor {
timer<> _speculate_timer;
public:
using abstract_read_executor::abstract_read_executor;
virtual future<> make_requests(digest_resolver_ptr resolver, std::chrono::steady_clock::time_point timeout) {
_speculate_timer.set_callback([this, resolver, timeout] {
virtual future<> make_requests(digest_resolver_ptr resolver) {
_speculate_timer.set_callback([this, resolver] {
if (!resolver->is_completed()) { // by the time the callback runs, the request may already be completed
resolver->add_wait_targets(1); // we send one more request so wait for it too
future<> f = resolver->has_data() ?
make_digest_requests(resolver, _targets.end() - 1, _targets.end(), timeout) :
make_data_requests(resolver, _targets.end() - 1, _targets.end(), timeout);
make_digest_requests(resolver, _targets.end() - 1, _targets.end()) :
make_data_requests(resolver, _targets.end() - 1, _targets.end());
f.finally([exec = shared_from_this()]{});
}
});
// FIXME: the timeout should come from previous latency statistics for a partition
auto speculate_timeout = std::chrono::steady_clock::now() + std::chrono::milliseconds(_proxy->get_db().local().get_config().read_request_timeout_in_ms()/2);
_speculate_timer.arm(speculate_timeout);
auto timeout = std::chrono::steady_clock::now() + std::chrono::milliseconds(_proxy->get_db().local().get_config().read_request_timeout_in_ms()/2);
_speculate_timer.arm(timeout);
// if CL + RR result in covering all replicas, getReadExecutor forces AlwaysSpeculating. So we know
// that the last replica in our list is "extra."
@@ -2034,13 +1887,13 @@ public:
// We're hitting additional targets for read repair. Since our "extra" replica is the least-
// preferred by the snitch, we do an extra data read to start with against a replica more
// likely to reply; better to let RR fail than the entire query.
return when_all(make_data_requests(resolver, _targets.begin(), _targets.begin() + 2, timeout),
make_digest_requests(resolver, _targets.begin() + 2, _targets.end(), timeout)).discard_result();
return when_all(make_data_requests(resolver, _targets.begin(), _targets.begin() + 2),
make_digest_requests(resolver, _targets.begin() + 2, _targets.end())).discard_result();
} else {
// not doing read repair; all replies are important, so it doesn't matter which nodes we
// perform data reads against vs digest.
return when_all(make_data_requests(resolver, _targets.begin(), _targets.begin() + 1, timeout),
make_digest_requests(resolver, _targets.begin() + 1, _targets.end() - 1, timeout)).discard_result();
return when_all(make_data_requests(resolver, _targets.begin(), _targets.begin() + 1),
make_digest_requests(resolver, _targets.begin() + 1, _targets.end() - 1)).discard_result();
}
}
virtual void got_cl() override {
@@ -2127,16 +1980,16 @@ db::read_repair_decision storage_proxy::new_read_repair_decision(const schema& s
future<query::result_digest>
storage_proxy::query_singular_local_digest(schema_ptr s, lw_shared_ptr<query::read_command> cmd, const query::partition_range& pr) {
return query_singular_local(std::move(s), std::move(cmd), pr, query::result_request::only_digest).then([] (foreign_ptr<lw_shared_ptr<query::result>> result) {
return *result->digest();
return query_singular_local(std::move(s), std::move(cmd), pr).then([] (foreign_ptr<lw_shared_ptr<query::result>> result) {
return result->digest();
});
}
future<foreign_ptr<lw_shared_ptr<query::result>>>
storage_proxy::query_singular_local(schema_ptr s, lw_shared_ptr<query::read_command> cmd, const query::partition_range& pr, query::result_request request) {
storage_proxy::query_singular_local(schema_ptr s, lw_shared_ptr<query::read_command> cmd, const query::partition_range& pr) {
unsigned shard = _db.local().shard_of(pr.start()->value().token());
return _db.invoke_on(shard, [gs = global_schema_ptr(s), prv = std::vector<query::partition_range>({pr}) /* FIXME: pr is copied */, cmd, request] (database& db) {
return db.query(gs, *cmd, request, prv).then([](auto&& f) {
return _db.invoke_on(shard, [gs = global_schema_ptr(s), prv = std::vector<query::partition_range>({pr}) /* FIXME: pr is copied */, cmd] (database& db) {
return db.query(gs, *cmd, prv).then([](auto&& f) {
return make_foreign(std::move(f));
});
});
@@ -2913,12 +2766,18 @@ void storage_proxy::init_messaging_service() {
});
});
ms.register_replication_finished([] (gms::inet_address from) {
return get_local_storage_service().confirm_replication(from);
});
ms.register_get_schema_version([] (unsigned shard, table_schema_version v) {
return get_storage_proxy().invoke_on(shard, [v] (auto&& sp) {
logger.debug("Schema version request for {}", v);
return local_schema_registry().get_frozen(v);
});
});
ms.register_schema_check([] {
return make_ready_future<utils::UUID>(get_local_storage_service().db().local().get_version());
});
}
void storage_proxy::uninit_messaging_service() {
@@ -2929,6 +2788,8 @@ void storage_proxy::uninit_messaging_service() {
ms.unregister_read_mutation_data();
ms.unregister_read_digest();
ms.unregister_truncate();
ms.unregister_replication_finished();
ms.unregister_schema_check();
}
// Merges reconcilable_result:s from different shards into one

@@ -98,7 +98,6 @@ public:
uint64_t queued_write_bytes = 0;
uint64_t reads = 0;
uint64_t background_reads = 0; // client no longer waits for the read
uint64_t read_retries = 0; // read is retried with new limit
};
private:
distributed<database>& _db;
@@ -143,8 +142,7 @@ private:
std::vector<gms::inet_address> get_live_sorted_endpoints(keyspace& ks, const dht::token& token);
db::read_repair_decision new_read_repair_decision(const schema& s);
::shared_ptr<abstract_read_executor> get_read_executor(lw_shared_ptr<query::read_command> cmd, query::partition_range pr, db::consistency_level cl);
future<foreign_ptr<lw_shared_ptr<query::result>>> query_singular_local(schema_ptr, lw_shared_ptr<query::read_command> cmd, const query::partition_range& pr,
query::result_request request = query::result_request::result_and_digest);
future<foreign_ptr<lw_shared_ptr<query::result>>> query_singular_local(schema_ptr, lw_shared_ptr<query::read_command> cmd, const query::partition_range& pr);
future<query::result_digest> query_singular_local_digest(schema_ptr, lw_shared_ptr<query::read_command> cmd, const query::partition_range& pr);
future<foreign_ptr<lw_shared_ptr<query::result>>> query_partition_key_range(lw_shared_ptr<query::read_command> cmd, query::partition_range&& range, db::consistency_level cl);
std::vector<query::partition_range> get_restricted_ranges(keyspace& ks, const schema& s, query::partition_range range);
@@ -181,8 +179,6 @@ public:
future<> mutate_locally(const schema_ptr&, const frozen_mutation& m);
future<> mutate_locally(std::vector<mutation> mutations);
future<> mutate_streaming_mutation(const schema_ptr&, const frozen_mutation& m);
/**
* Use this method to have these Mutations applied
* across all replicas. This method will take care

@@ -197,7 +197,12 @@ void storage_service::prepare_to_join() {
auto& gossiper = gms::get_local_gossiper();
gossiper.register_(this->shared_from_this());
auto generation_number = db::system_keyspace::increment_and_get_generation().get0();
gossiper.start_gossiping(generation_number, app_states).get();
gossiper.start_gossiping(generation_number, app_states).then([this] {
#if SS_DEBUG
gms::get_local_gossiper().debug_show();
_token_metadata.debug_show();
#endif
}).get();
// gossip snitch infos (local DC and rack)
gossip_snitch_info().get();
@@ -260,7 +265,7 @@ void storage_service::join_token_ring(int delay) {
}
set_mode(mode::JOINING, "schema complete, ready to bootstrap", true);
set_mode(mode::JOINING, "waiting for pending range calculation", true);
update_pending_ranges().get();
block_until_update_pending_ranges_finished().get();
set_mode(mode::JOINING, "calculation complete, ready to bootstrap", true);
logger.debug("... got ring + schema info");
@@ -287,7 +292,7 @@ void storage_service::join_token_ring(int delay) {
set_mode(mode::JOINING, "waiting for schema information to complete", true);
sleep(std::chrono::seconds(1)).get();
}
update_pending_ranges().get();
block_until_update_pending_ranges_finished().get();
}
logger.info("Checking bootstrapping/leaving/moving nodes: ok");
@@ -749,14 +754,14 @@ void storage_service::on_join(gms::inet_address endpoint, gms::endpoint_state ep
on_change(endpoint, e.first, e.second);
}
get_local_migration_manager().schedule_schema_pull(endpoint, ep_state).handle_exception([endpoint] (auto ep) {
logger.warn("Fail to pull schema from {}: {}", endpoint, ep);
logger.warn("Fail to pull schmea from {}: {}", endpoint, ep);
});
}
void storage_service::on_alive(gms::inet_address endpoint, gms::endpoint_state state) {
logger.debug("endpoint={} on_alive", endpoint);
get_local_migration_manager().schedule_schema_pull(endpoint, state).handle_exception([endpoint] (auto ep) {
logger.warn("Fail to pull schema from {}: {}", endpoint, ep);
logger.warn("Fail to pull schmea from {}: {}", endpoint, ep);
});
if (_token_metadata.is_member(endpoint)) {
#if 0
@@ -813,7 +818,7 @@ void storage_service::on_change(inet_address endpoint, application_state state,
do_update_system_peers_table(endpoint, state, value);
if (state == application_state::SCHEMA) {
get_local_migration_manager().schedule_schema_pull(endpoint, *ep_state).handle_exception([endpoint] (auto ep) {
logger.warn("Failed to pull schema from {}: {}", endpoint, ep);
logger.warn("Fail to pull schmea from {}: {}", endpoint, ep);
});
}
}
@@ -1059,9 +1064,6 @@ future<> storage_service::drain_on_shutdown() {
future<> storage_service::init_server(int delay) {
return seastar::async([this, delay] {
get_storage_service().invoke_on_all([] (auto& ss) {
ss.init_messaging_service();
}).get();
auto& gossiper = gms::get_local_gossiper();
#if 0
logger.info("Cassandra version: {}", FBUtilities.getReleaseVersionString());
@@ -1139,74 +1141,22 @@ future<> storage_service::init_server(int delay) {
});
}
// should run under _replicate_task lock
future<> storage_service::replicate_tm_only() {
_shadow_token_metadata = _token_metadata;
return get_storage_service().invoke_on_all([this](storage_service& local_ss){
if (engine().cpu_id() != 0) {
local_ss._token_metadata = _shadow_token_metadata;
}
});
}
// should run under _replicate_task and gossiper::timer_callback locks
future<> storage_service::replicate_tm_and_ep_map(shared_ptr<gms::gossiper> g0) {
// sanity: check that gossiper is fully initialized like we expect it to be
return get_storage_service().invoke_on_all([](storage_service& local_ss) {
if (!gms::get_gossiper().local_is_initialized()) {
auto err = sprint("replicate_to_all_cores is called before gossiper is fully initialized");
logger.warn(err.c_str());
throw std::runtime_error(err);
}
}).then([this, g0] {
_shadow_token_metadata = _token_metadata;
g0->shadow_endpoint_state_map = g0->endpoint_state_map;
return get_storage_service().invoke_on_all([g0, this](storage_service& local_ss) {
if (engine().cpu_id() != 0) {
gms::get_local_gossiper().endpoint_state_map = g0->shadow_endpoint_state_map;
local_ss._token_metadata = _shadow_token_metadata;
}
});
});
}
future<> storage_service::replicate_to_all_cores() {
// sanity checks: this function is supposed to be run on shard 0 only and
// when gossiper has already been initialized.
if (engine().cpu_id() != 0) {
auto err = sprint("replicate_to_all_cores is not run on cpu zero");
logger.warn(err.c_str());
throw std::runtime_error(err);
}
if (!gms::get_gossiper().local_is_initialized()) {
auto err = sprint("replicate_to_all_cores is called before gossiper on shard0 is initialized");
logger.warn(err.c_str());
throw std::runtime_error(err);
}
// FIXME: There is no back pressure. If the remote cores are slow, and
// replication is called often, it will queue tasks to the semaphore
// without end.
return _replicate_task.wait().then([this] {
auto g0 = gms::get_local_gossiper().shared_from_this();
return g0->timer_callback_lock().then([this, g0] {
bool endpoint_map_changed = g0->shadow_endpoint_state_map != g0->endpoint_state_map;
if (endpoint_map_changed) {
return replicate_tm_and_ep_map(g0).finally([g0] {
g0->timer_callback_unlock();
});
} else {
g0->timer_callback_unlock();
return replicate_tm_only();
return _the_storage_service.invoke_on_all([tm = _token_metadata] (storage_service& local_ss) {
if (engine().cpu_id() != 0) {
local_ss._token_metadata = tm;
}
});
}).then_wrapped([this, ss0 = this->shared_from_this()](auto&& f){
}).then_wrapped([this] (auto&& f) {
try {
_replicate_task.signal();
f.get();
@@ -1229,7 +1179,6 @@ future<> storage_service::gossip_snitch_info() {
}
future<> storage_service::stop() {
uninit_messaging_service();
return make_ready_future<>();
}
@@ -1378,17 +1327,8 @@ future<std::map<gms::inet_address, float>> storage_service::effective_ownership(
for (const gms::inet_address& endpoint : endpoints.second) {
float ownership = 0.0f;
for (range<token> r : ss.get_ranges_for_endpoint(keyspace_name, endpoint)) {
// get_ranges_for_endpoint will unwrap the first range.
// With t0 t1 t2 t3, the first range (t3,t0] will be split
// into (min,t0] and (t3,max]. Skipping the range (t3,max],
// we will get the correct ownership number as if the first
// range had not been split.
if (!r.end()) {
continue;
}
auto end_token = r.end()->value();
if (token_ownership.find(end_token) != token_ownership.end()) {
ownership += token_ownership[end_token];
if (token_ownership.find(r.end().value().value()) != token_ownership.end()) {
ownership += token_ownership[r.end().value().value()];
}
}
final_ownership[endpoint] = ownership;
@@ -1722,15 +1662,14 @@ future<> storage_service::start_rpc_server() {
auto& cfg = ss._db.local().get_config();
auto port = cfg.rpc_port();
auto addr = cfg.rpc_address();
auto keepalive = cfg.rpc_keepalive();
return dns::gethostbyname(addr).then([&ss, tserver, addr, port, keepalive] (dns::hostent e) {
return dns::gethostbyname(addr).then([&ss, tserver, addr, port] (dns::hostent e) {
auto ip = e.addresses[0].in.s_addr;
return tserver->start(std::ref(ss._db)).then([tserver, port, addr, ip, keepalive] {
return tserver->start(std::ref(ss._db)).then([tserver, port, addr, ip] {
// #293 - do not stop anything
//engine().at_exit([tserver] {
// return tserver->stop();
//});
return tserver->invoke_on_all(&thrift_server::listen, ipv4_addr{ip, port}, keepalive);
return tserver->invoke_on_all(&thrift_server::listen, ipv4_addr{ip, port});
});
}).then([addr, port] {
print("Thrift server listening on %s:%s ...\n", addr, port);
@@ -1775,11 +1714,10 @@ future<> storage_service::start_native_transport() {
auto port = cfg.native_transport_port();
auto addr = cfg.rpc_address();
auto ceo = cfg.client_encryption_options();
auto keepalive = cfg.rpc_keepalive();
transport::cql_load_balance lb = transport::parse_load_balance(cfg.load_balance());
return dns::gethostbyname(addr).then([cserver, addr, port, lb, keepalive, ceo = std::move(ceo)] (dns::hostent e) {
return dns::gethostbyname(addr).then([cserver, addr, port, lb, ceo = std::move(ceo)] (dns::hostent e) {
auto ip = e.addresses[0].in.s_addr;
return cserver->start(std::ref(service::get_storage_proxy()), std::ref(cql3::get_query_processor()), lb).then([cserver, port, addr, ip, ceo, keepalive]() {
return cserver->start(std::ref(service::get_storage_proxy()), std::ref(cql3::get_query_processor()), lb).then([cserver, port, addr, ip, ceo]() {
// #293 - do not stop anything
//engine().at_exit([cserver] {
// return cserver->stop();
@@ -1794,8 +1732,8 @@ future<> storage_service::start_native_transport() {
cred = ::make_shared<seastar::tls::server_credentials>(::make_shared<seastar::tls::dh_params>(seastar::tls::dh_params::level::MEDIUM));
f = cred->set_x509_key_file(ceo.at("certificate"), ceo.at("keyfile"), seastar::tls::x509_crt_format::PEM);
}
return f.then([cserver, addr, cred, keepalive] {
return cserver->invoke_on_all(&transport::cql_server::listen, addr, cred, keepalive);
return f.then([cserver, addr, cred] {
return cserver->invoke_on_all(&transport::cql_server::listen, addr, cred);
});
});
}).then([addr, port] {
@@ -1855,28 +1793,27 @@ future<> storage_service::decommission() {
}
}
logger.info("DECOMMISSIONING: starts");
logger.debug("DECOMMISSIONING");
ss.start_leaving().get();
// FIXME: long timeout = Math.max(RING_DELAY, BatchlogManager.instance.getBatchlogTimeout());
auto timeout = ss.get_ring_delay();
ss.set_mode(mode::LEAVING, sprint("sleeping %s ms for batch processing and pending range setup", timeout.count()), true);
sleep(timeout).get();
logger.info("DECOMMISSIONING: unbootstrap starts");
logger.debug("DECOMMISSIONING: unbootstrap starts");
ss.unbootstrap();
logger.info("DECOMMISSIONING: unbootstrap done");
logger.debug("DECOMMISSIONING: unbootstrap done");
// FIXME: proper shutdown
ss.shutdown_client_servers().get();
logger.info("DECOMMISSIONING: shutdown rpc and cql server done");
logger.debug("DECOMMISSIONING: shutdown rpc and cql server done");
gms::get_local_gossiper().stop_gossiping().get();
logger.info("DECOMMISSIONING: stop_gossiping done");
logger.debug("DECOMMISSIONING: stop_gossiping done");
ss.do_stop_ms().get();
logger.info("DECOMMISSIONING: stop messaging_service done");
// StageManager.shutdownNow();
db::system_keyspace::set_bootstrap_state(db::system_keyspace::bootstrap_state::DECOMMISSIONED).get();
logger.info("DECOMMISSIONING: set_bootstrap_state done");
logger.debug("DECOMMISSIONING: set_bootstrap_state done");
ss.set_mode(mode::DECOMMISSIONED, true);
logger.info("DECOMMISSIONING: done");
// let op be responsible for killing the process
});
});
@@ -2481,7 +2418,7 @@ void storage_service::add_expire_time_if_found(inet_address endpoint, int64_t ex
// in there.
future<> storage_service::load_new_sstables(sstring ks_name, sstring cf_name) {
class max_element {
int64_t _result = 0;
int64_t _result = 1;
public:
future<> operator()(int64_t value) {
_result = std::max(value, _result);
@@ -2498,8 +2435,6 @@ future<> storage_service::load_new_sstables(sstring ks_name, sstring cf_name) {
_loading_new_sstables = true;
}
logger.info("Loading new SSTables for {}.{}...", ks_name, cf_name);
// First, we need to stop SSTable creation for that CF in all shards. This is a really horrible
// thing to do, because under normal circumstances this can make dirty memory go up to the point
// of explosion.
@@ -2514,78 +2449,38 @@ future<> storage_service::load_new_sstables(sstring ks_name, sstring cf_name) {
auto& cf = db.find_column_family(ks_name, cf_name);
return cf.disable_sstable_write();
}).then([this, cf_name, ks_name] (int64_t max_seen_sstable) {
logger.debug("Loading new sstables with generation numbers greater than or equal to {}", max_seen_sstable);
// Then, we will reshuffle the tables to make sure that the generation numbers don't go too high.
// We will do it all on the same CPU, to make sure that we won't have two parallel shufflers stepping
// on each other.
class all_generations {
std::set<int64_t> _result;
public:
future<> operator()(std::set<int64_t> value) {
_result.insert(value.begin(), value.end());
return make_ready_future<>();
}
std::set<int64_t> get() && {
return _result;
}
};
// We provide to reshuffle_sstables() the generation of all existing sstables, such that it will
// easily know which sstables are new.
return _db.map_reduce(all_generations(), [ks_name, cf_name] (database& db) {
//
// Note that this will reshuffle all tables, including existing ones. Figuring out which of the tables
// are new would require coordination between all shards, so it is simpler this way. Renaming an existing
// SSTable shouldn't be that bad, and we are assuming empty directory for normal operation anyway.
auto shard = std::hash<sstring>()(cf_name) % smp::count;
return _db.invoke_on(shard, [ks_name, cf_name, max_seen_sstable] (database& db) {
auto& cf = db.find_column_family(ks_name, cf_name);
std::set<int64_t> generations;
for (auto& p : *(cf.get_sstables())) {
generations.insert(p.second->generation());
}
return make_ready_future<std::set<int64_t>>(std::move(generations));
}).then([this, max_seen_sstable, ks_name, cf_name] (std::set<int64_t> all_generations) {
auto shard = std::hash<sstring>()(cf_name) % smp::count;
return _db.invoke_on(shard, [ks_name, cf_name, max_seen_sstable, all_generations = std::move(all_generations)] (database& db) {
auto& cf = db.find_column_family(ks_name, cf_name);
return cf.reshuffle_sstables(std::move(all_generations), max_seen_sstable + 1);
});
return cf.reshuffle_sstables(max_seen_sstable);
});
}).then_wrapped([this, ks_name, cf_name] (future<std::vector<sstables::entry_descriptor>> f) {
std::vector<sstables::entry_descriptor> new_tables;
std::exception_ptr eptr;
int64_t new_gen = -1;
try {
new_tables = f.get0();
} catch(std::exception& e) {
logger.error("Loading of new tables failed to {}.{} due to {}", ks_name, cf_name, e.what());
eptr = std::current_exception();
} catch(...) {
logger.error("Loading of new tables failed to {}.{} due to unexpected reason", ks_name, cf_name);
eptr = std::current_exception();
}
}).then([this, ks_name, cf_name] (std::vector<sstables::entry_descriptor> new_tables) {
int64_t new_gen = 1;
if (new_tables.size() > 0) {
new_gen = new_tables.back().generation;
}
if (new_tables.empty() && !eptr) {
logger.info("No new SSTables were found for {}.{}", ks_name, cf_name);
}
logger.debug("Now accepting writes for sstables with generation larger or equal than {}", new_gen);
return _db.invoke_on_all([ks_name, cf_name, new_gen] (database& db) {
auto& cf = db.find_column_family(ks_name, cf_name);
auto disabled = std::chrono::duration_cast<std::chrono::microseconds>(cf.enable_sstable_write(new_gen)).count();
logger.info("CF {}.{} at shard {} had SSTables writes disabled for {} usec", ks_name, cf_name, engine().cpu_id(), disabled);
logger.info("CF {} at shard {} had SSTables writes disabled for {} usec", cf_name, engine().cpu_id(), disabled);
return make_ready_future<>();
}).then([new_tables = std::move(new_tables), eptr = std::move(eptr)] {
if (eptr) {
return make_exception_future<std::vector<sstables::entry_descriptor>>(eptr);
}
return make_ready_future<std::vector<sstables::entry_descriptor>>(std::move(new_tables));
}).then([new_tables = std::move(new_tables)] {
return std::move(new_tables);
});
}).then([this, ks_name, cf_name] (std::vector<sstables::entry_descriptor> new_tables) {
return _db.invoke_on_all([ks_name = std::move(ks_name), cf_name = std::move(cf_name), new_tables = std::move(new_tables)] (database& db) {
auto& cf = db.find_column_family(ks_name, cf_name);
return cf.load_new_sstables(new_tables).then([ks_name = std::move(ks_name), cf_name = std::move(cf_name)] {
logger.info("Done loading new SSTables for {}.{}", ks_name, cf_name);
});
return cf.load_new_sstables(new_tables);
});
}).finally([this] {
_loading_new_sstables = false;
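The shard pinning used for the reshuffle above (`std::hash<sstring>()(cf_name) % smp::count`) can be sketched in plain C++; `shard_for` and `smp_count` are hypothetical stand-ins for the Seastar names:

```cpp
#include <cassert>
#include <functional>
#include <string>

// Sketch of the shard pinning used for reshuffling: hashing the column
// family name picks one deterministic CPU, so two callers racing on the
// same CF always serialize on the same shard.
unsigned shard_for(const std::string& cf_name, unsigned smp_count) {
    return std::hash<std::string>()(cf_name) % smp_count;
}
```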
@@ -2863,7 +2758,7 @@ future<> storage_service::move(token new_token) {
auto keyspaces_to_process = ss._db.local().get_non_system_keyspaces();
ss.update_pending_ranges().get();
ss.block_until_update_pending_ranges_finished().get();
// checking if data is moving to this node
for (auto keyspace_name : keyspaces_to_process) {
@@ -2933,6 +2828,15 @@ future<> storage_service::update_pending_ranges() {
});
}
future<> storage_service::block_until_update_pending_ranges_finished() {
// We want to be sure the job we're blocking for is actually finished and we can't trust the TPE's active job count
return smp::submit_to(0, [] {
return do_until(
[] { return !(get_local_storage_service()._update_jobs > 0); },
[] { return sleep(std::chrono::milliseconds(100)); });
});
}
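The wait above boils down to polling a counter until it reaches zero, sleeping between checks; a sketch with `std::atomic` standing in for the shard-0 `_update_jobs` counter and `std::this_thread::sleep_for` for Seastar's `sleep`:

```cpp
#include <atomic>
#include <chrono>
#include <thread>

// Sketch of the polling loop in block_until_update_pending_ranges_finished():
// spin until the job counter drops to zero, sleeping between checks rather
// than busy-waiting. The counter is a stand-in for _update_jobs on shard 0.
std::atomic<int> update_jobs{0};

void block_until_update_jobs_finished() {
    while (update_jobs.load() > 0) {
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }
}
```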
future<> storage_service::keyspace_changed(const sstring& ks_name) {
// Update pending ranges since keyspace can be changed after we calculate pending ranges.
return update_pending_ranges().handle_exception([ks_name] (auto ep) {
@@ -2940,17 +2844,5 @@ future<> storage_service::keyspace_changed(const sstring& ks_name) {
});
}
void storage_service::init_messaging_service() {
auto& ms = net::get_local_messaging_service();
ms.register_replication_finished([] (gms::inet_address from) {
return get_local_storage_service().confirm_replication(from);
});
}
void storage_service::uninit_messaging_service() {
auto& ms = net::get_local_messaging_service();
ms.unregister_replication_finished();
}
} // namespace service


@@ -128,15 +128,11 @@ public:
// Needed by distributed<>
future<> stop();
void init_messaging_service();
void uninit_messaging_service();
private:
void do_update_pending_ranges();
public:
future<> keyspace_changed(const sstring& ks_name);
void do_update_pending_ranges();
future<> update_pending_ranges();
future<> block_until_update_pending_ranges_finished();
const locator::token_metadata& get_token_metadata() const {
return _token_metadata;
@@ -162,7 +158,6 @@ private:
}
/* This abstraction maintains the token/endpoint metadata information */
token_metadata _token_metadata;
token_metadata _shadow_token_metadata;
public:
std::chrono::milliseconds get_ring_delay();
gms::versioned_value::factory value_factory;
@@ -737,30 +732,6 @@ private:
future<> replicate_to_all_cores();
semaphore _replicate_task{1};
private:
/**
* Replicates token_metadata contents on shard0 instance to other shards.
*
* Should be called with a _replicate_task semaphore taken.
* Should run on shard 0 only.
*
* @return a ready future when replication is complete.
*/
future<> replicate_tm_only();
/**
* Replicates token_metadata and gossiper::endpoint_state_map contents on
* shard0 instances to other shards.
*
* Should be called with a _replicate_task and a gossiper::timer_callback
* semaphores taken.
* Should run on shard 0 only.
*
* @param g0 a "shared_from_this()" pointer to a gossiper instance on shard0
*
* @return a ready future when replication is complete.
*/
future<> replicate_tm_and_ep_map(shared_ptr<gms::gossiper> g0);
/**
* Handle node bootstrap
*


@@ -139,7 +139,7 @@ compact_sstables(std::vector<shared_sstable> sstables, column_family& cf, std::f
db::replay_position rp;
auto all_sstables = cf.get_sstables_including_compacted_undeleted();
auto all_sstables = cf.get_sstables();
std::sort(sstables.begin(), sstables.end(), [] (const shared_sstable& x, const shared_sstable& y) {
return x->generation() < y->generation();
});


@@ -322,7 +322,7 @@ future<> compaction_manager::remove(column_family* cf) {
return cf == entry;
}),
_cfs_to_cleanup.end());
_stats.pending_tasks = _cfs_to_compact.size() + _cfs_to_cleanup.size();
_stats.pending_tasks = _cfs_to_compact.size();
cf->set_compaction_manager_queued(false);
// We need to guarantee that a task being stopped will not re-queue the
// column family being removed.


@@ -22,9 +22,8 @@
#include <stdexcept>
#include <cstdlib>
#include <seastar/core/align.hh>
#include <seastar/core/unaligned.hh>
#include <seastar/core/fstream.hh>
#include "core/align.hh"
#include "core/unaligned.hh"
#include "compress.hh"
@@ -218,52 +217,23 @@ size_t compress_max_size_snappy(size_t input_len) {
}
class compressed_file_data_source_impl : public data_source_impl {
input_stream<char> _input_stream;
file _file;
sstables::compression* _compression_metadata;
uint64_t _pos;
uint64_t _beg_pos;
uint64_t _end_pos;
uint64_t _pos = 0;
const io_priority_class* _pc;
public:
compressed_file_data_source_impl(file f, sstables::compression* cm,
uint64_t pos, size_t len, file_input_stream_options options)
: _compression_metadata(cm)
{
_beg_pos = pos;
if (pos > _compression_metadata->data_len) {
throw std::runtime_error("attempt to uncompress beyond end");
}
if (len == 0 || pos == _compression_metadata->data_len) {
// Nothing to read
_end_pos = _pos = _beg_pos;
return;
}
if (len <= _compression_metadata->data_len - pos) {
_end_pos = pos + len;
} else {
_end_pos = _compression_metadata->data_len;
}
// _beg_pos and _end_pos specify positions in the compressed stream.
// We need to translate them into a range of uncompressed chunks,
// and open a file_input_stream to read that range.
auto start = _compression_metadata->locate(_beg_pos);
auto end = _compression_metadata->locate(_end_pos - 1);
_input_stream = make_file_input_stream(std::move(f),
start.chunk_start,
end.chunk_start + end.chunk_len - start.chunk_start,
std::move(options));
_pos = _beg_pos;
}
compressed_file_data_source_impl(file f, const io_priority_class& pc,
sstables::compression* cm, uint64_t pos)
: _file(std::move(f)), _compression_metadata(cm)
, _pos(pos)
, _pc(&pc)
{}
virtual future<temporary_buffer<char>> get() override {
if (_pos >= _end_pos) {
if (_pos >= _compression_metadata->data_len) {
return make_ready_future<temporary_buffer<char>>();
}
auto addr = _compression_metadata->locate(_pos);
// Uncompress the next chunk. We need to skip part of the first
// chunk, but then continue to read from beginning of chunks.
if (_pos != _beg_pos && addr.offset != 0) {
throw std::runtime_error("compressed reader out of sync");
}
return _input_stream.read_exactly(addr.chunk_len).
return _file.dma_read_exactly<char>(addr.chunk_start, addr.chunk_len, *_pc).
then([this, addr](temporary_buffer<char> buf) {
// The last 4 bytes of the chunk are the adler32 checksum
// of the rest of the (compressed) chunk.
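The checksum layout described in the comment above (last 4 bytes of each chunk are the Adler-32 of the rest) can be illustrated with a plain Adler-32 (RFC 1950) implementation; this is a generic sketch, not Scylla's checksumming code:

```cpp
#include <cstddef>
#include <cstdint>

// Plain Adler-32 (RFC 1950): two running sums modulo 65521, packed as
// (b << 16) | a. The sstable code compares this against the 4 checksum
// bytes trailing each compressed chunk.
uint32_t adler32(const uint8_t* p, size_t n) {
    uint32_t a = 1, b = 0;
    for (size_t i = 0; i < n; i++) {
        a = (a + p[i]) % 65521;
        b = (b + a) % 65521;
    }
    return (b << 16) | a;
}
```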
@@ -295,17 +265,16 @@ public:
class compressed_file_data_source : public data_source {
public:
compressed_file_data_source(file f, sstables::compression* cm,
uint64_t offset, size_t len, file_input_stream_options options)
compressed_file_data_source(file f, const io_priority_class& pc,
sstables::compression* cm, uint64_t offset)
: data_source(std::make_unique<compressed_file_data_source_impl>(
std::move(f), cm, offset, len, std::move(options)))
std::move(f), pc, cm, offset))
{}
};
input_stream<char> make_compressed_file_input_stream(
file f, sstables::compression* cm, uint64_t offset, size_t len,
file_input_stream_options options)
file f, sstables::compression* cm, const io_priority_class& pc, uint64_t offset)
{
return input_stream<char>(compressed_file_data_source(
std::move(f), cm, offset, len, std::move(options)));
std::move(f), pc, cm, offset));
}
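What `locate()` has to provide for this reader can be sketched as follows, assuming fixed-size uncompressed chunks and a table of compressed chunk start offsets. The names and layout here are illustrative, not the actual `sstables::compression` API:

```cpp
#include <cstdint>
#include <stdexcept>
#include <vector>

// Sketch of compression metadata lookup: map an uncompressed position to
// the compressed chunk holding it, plus the offset inside that chunk.
struct chunk_addr {
    uint64_t chunk_start;  // offset of the compressed chunk in the file
    uint64_t chunk_len;    // compressed length of the chunk
    uint64_t offset;       // offset of pos inside the uncompressed chunk
};

// chunk_starts holds the file offset of each compressed chunk, plus a final
// sentinel entry at end-of-data so every chunk's length can be computed.
chunk_addr locate(uint64_t pos, uint64_t uncompressed_chunk_len,
                  const std::vector<uint64_t>& chunk_starts) {
    uint64_t i = pos / uncompressed_chunk_len;
    if (i + 1 >= chunk_starts.size()) {
        throw std::runtime_error("attempt to uncompress beyond end");
    }
    return {chunk_starts[i], chunk_starts[i + 1] - chunk_starts[i],
            pos % uncompressed_chunk_len};
}
```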


@@ -202,4 +202,4 @@ public:
// as long as we have *sstables* work in progress, we need to keep the whole
// sstable alive, and the compression metadata is only a part of it.
input_stream<char> make_compressed_file_input_stream(
file f, sstables::compression *cm, uint64_t offset, size_t len, class file_input_stream_options options);
file f, sstables::compression *cm, const io_priority_class& pc, uint64_t offset = 0);


@@ -42,16 +42,10 @@ public:
}
};
// IndexConsumer is a concept that implements:
//
// bool should_continue();
// void consume_entry(index_entry&& ie);
template <class IndexConsumer>
class index_consume_entry_context: public data_consumer::continuous_data_consumer<index_consume_entry_context<IndexConsumer>> {
class index_consume_entry_context: public data_consumer::continuous_data_consumer<index_consume_entry_context> {
using proceed = data_consumer::proceed;
using continuous_data_consumer = data_consumer::continuous_data_consumer<index_consume_entry_context<IndexConsumer>>;
private:
IndexConsumer& _consumer;
index_consumer& _consumer;
enum class state {
START,
@@ -72,7 +66,7 @@ public:
bool non_consuming() const {
return ((_state == state::CONSUME_ENTRY) || (_state == state::START) ||
((_state == state::PROMOTED_BYTES) && (continuous_data_consumer::_prestate == continuous_data_consumer::prestate::NONE)));
((_state == state::PROMOTED_BYTES) && (_prestate == prestate::NONE)));
}
proceed process_state(temporary_buffer<char>& data) {
@@ -85,32 +79,32 @@ public:
_state = state::KEY_SIZE;
break;
case state::KEY_SIZE:
if (this->read_16(data) != continuous_data_consumer::read_status::ready) {
if (read_16(data) != read_status::ready) {
_state = state::KEY_BYTES;
break;
}
case state::KEY_BYTES:
if (this->read_bytes(data, this->_u16, _key) != continuous_data_consumer::read_status::ready) {
if (read_bytes(data, _u16, _key) != read_status::ready) {
_state = state::POSITION;
break;
}
case state::POSITION:
if (this->read_64(data) != continuous_data_consumer::read_status::ready) {
if (read_64(data) != read_status::ready) {
_state = state::PROMOTED_SIZE;
break;
}
case state::PROMOTED_SIZE:
if (this->read_32(data) != continuous_data_consumer::read_status::ready) {
if (read_32(data) != read_status::ready) {
_state = state::PROMOTED_BYTES;
break;
}
case state::PROMOTED_BYTES:
if (this->read_bytes(data, this->_u32, _promoted) != continuous_data_consumer::read_status::ready) {
if (read_bytes(data, _u32, _promoted) != read_status::ready) {
_state = state::CONSUME_ENTRY;
break;
}
case state::CONSUME_ENTRY:
_consumer.consume_entry(index_entry(std::move(_key), this->_u64, std::move(_promoted)));
_consumer.consume_entry(index_entry(std::move(_key), _u64, std::move(_promoted)));
_state = state::START;
break;
default:
@@ -119,7 +113,7 @@ public:
return proceed::yes;
}
index_consume_entry_context(IndexConsumer& consumer,
index_consume_entry_context(index_consumer& consumer,
input_stream<char>&& input, uint64_t maxlen)
: continuous_data_consumer(std::move(input), maxlen)
, _consumer(consumer)
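The resumable-parse idea behind this state machine can be sketched in isolation: a toy parser for 16-bit-length-prefixed keys that can be fed arbitrary buffer slices and resumes exactly where it stopped. The real context also parses the 64-bit position and the promoted index; this is a deliberately reduced sketch:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

// Toy resumable parser: each entry is a big-endian 16-bit length followed
// by that many key bytes. Both the length and the key may straddle buffer
// boundaries; the parser carries its state across feed() calls.
class key_parser {
    enum class state { KEY_SIZE, KEY_BYTES } _state = state::KEY_SIZE;
    uint16_t _len = 0;
    size_t _have = 0;   // length bytes consumed so far
    std::string _key;   // key bytes accumulated so far
public:
    std::vector<std::string> keys;
    void feed(const uint8_t* p, size_t n) {
        size_t i = 0;
        while (i < n) {
            if (_state == state::KEY_SIZE) {
                _len = uint16_t((_len << 8) | p[i++]);
                if (++_have == 2) {
                    _have = 0;
                    _key.clear();
                    _state = state::KEY_BYTES;
                }
            } else {
                size_t take = std::min(n - i, size_t(_len) - _key.size());
                _key.append(reinterpret_cast<const char*>(p + i), take);
                i += take;
                if (_key.size() == _len) {
                    keys.push_back(_key);
                    _len = 0;
                    _state = state::KEY_SIZE;
                }
            }
        }
    }
};
```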


@@ -25,6 +25,8 @@
#include "core/future.hh"
#include "database_fwd.hh"
class partition_key_view;
namespace sstables {
class key_view {
@@ -57,18 +59,21 @@ enum class composite_marker : bytes::value_type {
end_range = 1,
};
inline void check_marker(bytes_view component) {
inline void check_marker(bytes_view component, composite_marker expected) {
auto found = composite_marker(component.back());
switch (found) {
case composite_marker::none:
case composite_marker::start_range:
case composite_marker::end_range:
break;
default:
throw runtime_exception(sprint("Unexpected marker. Found %d, expected %d\n", uint16_t(uint8_t(found))));
if (found != expected) {
throw runtime_exception(sprint("Unexpected marker. Found %d, expected %d\n", uint8_t(found), uint8_t(expected)));
}
}
inline void check_marker(bytes_view component, composite_marker expected, composite_marker alternative) {
auto found = composite_marker(component.back());
if ((found == expected) || (found == alternative)) {
return;
}
throw runtime_exception(sprint("Unexpected marker. Found %d, expected %d or %d\n", uint8_t(found), uint8_t(expected), uint8_t(alternative)));
}
// Our internal representation differs slightly (in the way it serializes) from Origin.
// In order to be able to achieve read and write compatibility for sstables - so they can
// be imported and exported - we need to always convert a key to this representation.


@@ -735,7 +735,11 @@ public:
if (Sets.intersection(candidates, compacting).isEmpty())
return candidates;
#endif
return candidates;
if (candidates.size() < 2) {
return {};
} else {
return candidates;
}
}
// all the sstables were suspect or overlapped with something suspect


@@ -249,139 +249,6 @@ class mp_row_consumer : public row_consumer {
_pending_collection = {};
}
}
class range_merger {
bytes _data;
bytes _end;
sstables::deletion_time _deletion_time;
public:
bytes&& data() {
return std::move(_data);
}
explicit operator bool() const noexcept {
return !_data.empty();
}
explicit operator sstring() const {
if (*this) {
return to_hex(_data) + sprint(" deletion (%x,%lx)", _deletion_time.local_deletion_time, _deletion_time.marked_for_delete_at);
} else {
return sstring("(null)");
}
}
explicit operator bytes_view() const {
return _data;
}
bool operator==(const range_merger& candidate) {
if (!candidate) {
return false;
}
bytes_view a(_data);
bytes_view b(candidate._data);
a.remove_suffix(1);
b.remove_suffix(1);
return ((a == b) && (_deletion_time == candidate._deletion_time));
}
bool operator!=(const range_merger& candidate) {
return !(*this == candidate);
}
bool is_prefix_of(const range_merger& candidate) {
bytes_view a(_data);
bytes_view b(candidate._data);
a.remove_suffix(1);
b.remove_suffix(1);
return b.compare(0, a.size(), a) == 0;
}
bool end_matches(bytes_view candidate, sstables::deletion_time deltime) {
if (_deletion_time != deltime) {
return false;
}
bytes_view my_end(_end);
my_end.remove_suffix(1);
candidate.remove_suffix(1);
return my_end == candidate;
}
void set_end(bytes_view end) {
_end = to_bytes(end);
}
range_merger(bytes_view start, bytes_view end, sstables::deletion_time d)
: _data(to_bytes(start))
, _end(to_bytes(end))
, _deletion_time(d)
{}
range_merger() : _data(), _end(), _deletion_time() {}
};
// Variables for tracking tombstone merging in consume_range_tombstone().
// All of these hold serialized composites.
std::stack<range_merger> _starts;
void reset_range_tombstone_merger() {
// Will throw if there is a current merger that hasn't finished.
// This will be called at the start and end of any row.
// This check is crucial to our goal of not falsely reporting a real range tombstone as a
// merger.
if (!_starts.empty()) {
auto msg = sstring("RANGE DELETE not implemented. Tried to merge, but row finished before we could finish the merge. Starts found: (");
while (!_starts.empty()) {
msg += sstring(_starts.top());
_starts.pop();
if (!_starts.empty()) {
msg += sstring(" , ");
}
}
msg += sstring(")");
throw malformed_sstable_exception(msg);
}
}
bytes close_merger_range() {
// We closed a larger enclosing row.
auto ret = _starts.top().data();
_starts.pop();
return ret;
}
bytes update_range_tombstone_merger(bytes_view _start, bytes_view end,
sstables::deletion_time deltime) {
range_merger start(_start, end, deltime);
range_merger empty;
// If we're processing a range (_starts is not empty), it's fine to start
// processing another, but only as long as we're nesting. We then check
// that the current range being processed is a prefix of the new one.
if (!_starts.empty() && !_starts.top().is_prefix_of(start)) {
auto msg = sstring("RANGE DELETE not implemented. Tried to merge, but existing range not a prefix of new one. Current range: ");
msg += sstring(_starts.top());
msg += ". new range: " + sstring(start);
throw malformed_sstable_exception(msg);
}
range_merger& prev = empty;
if (!_starts.empty()) {
prev = _starts.top();
}
_starts.push(start);
if (prev.end_matches(bytes_view(start), deltime)) {
// If _contig_deletion_end, we're in the middle of trying to merge
// several contiguous range tombstones. If there's a gap, we cannot
// represent this range in Scylla.
prev.set_end(end);
// We pop what we have just inserted, because that's not starting the
// processing of any new range.
_starts.pop();
}
if (_starts.top().end_matches(end, deltime)) {
return close_merger_range();
}
return {};
}
public:
mutation_opt mut;
@@ -499,77 +366,39 @@ public:
}
}
virtual proceed consume_row_end() override {
reset_range_tombstone_merger();
if (mut) {
flush_pending_collection(*_schema, *mut);
}
return proceed::no;
}
// Partial support for range tombstones read from sstables:
//
// Currently, Scylla does not support generic range tombstones: Only
// ranges which are a complete clustering-key prefix are supported because
// our in-memory data structure only allows deleted rows (prefixes).
// In principle, this is good enough because in Cassandra 2 (whose
// sstables we support) and using only CQL, there is no way to delete a
// generic range, because the DELETE and UPDATE statement's "WHERE" only
// takes the "=" operator, leading to a deletion of entire rows.
//
// However, in one important case the sstable written by Cassandra does
// have a generic range tombstone, which we can and must handle:
// Consider two tombstones, one deleting a bigger prefix than the other:
//
// create table tab (pk text, ck1 text, ck2 text, data text, primary key(pk, ck1, ck2));
// delete from tab where pk = 'pk' and ck1 = 'aaa';
// delete from tab where pk = 'pk' and ck1 = 'aaa' and ck2 = 'bbb';
//
// The first deletion covers the second, but nevertheless we cannot drop the
// smaller one because the two deletions have different timestamps.
// Currently in Scylla, we simply keep both tombstones separately.
// But Cassandra does something different: Cassandra does not want to have
// overlapping range tombstones, so it converts them into non-overlapping
// range tombstones (see RangeTombstoneList.java). In the above example,
// the resulting sstable is (sstable2json format)
//
// {"key": "pk",
// "cells": [["aaa:_","aaa:bbb:_",1459334681228103,"t",1459334681],
// ["aaa:bbb:_","aaa:bbb:!",1459334681244989,"t",1459334681],
// ["aaa:bbb:!","aaa:!",1459334681228103,"t",1459334681]]}
// ]
//
// In this sstable, the first and third tombstones look like "generic" ranges,
// not covering an entire prefix, so we cannot represent these three
// tombstones in our in-memory data structure. Instead, we need to convert the
// three non-overlapping tombstones to two overlapping whole-prefix tombstones,
// the two we started with in the "delete" commands above.
// This is what the code below does. If after trying to recombine split
// tombstones we are still left with a generic range we cannot represent,
// we fail the read.
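A much-simplified sketch of the contiguous-merge step described above, with strings standing in for serialized composites and an int for `deletion_time`. This only glues touching pieces with equal deletion times back together; the real code additionally handles nesting via a stack of open starts (`_starts`):

```cpp
#include <string>
#include <vector>

// Sketch: adjacent range-tombstone pieces that touch (end of one equals
// start of the next) and share a deletion time are recombined into one
// range, undoing Cassandra's split into non-overlapping pieces.
struct tomb { std::string start, end; int deltime; };

std::vector<tomb> recombine(const std::vector<tomb>& in) {
    std::vector<tomb> out;
    for (const auto& t : in) {
        if (!out.empty() && out.back().end == t.start &&
            out.back().deltime == t.deltime) {
            out.back().end = t.end;  // extend the previous range
        } else {
            out.push_back(t);
        }
    }
    return out;
}
```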
virtual void consume_range_tombstone(
bytes_view start_col, bytes_view end_col,
sstables::deletion_time deltime) override {
// We used to check that start_col has composite_marker:start_range
// and end_col has composite_marker::end_range. But this check is
// incorrect. start_col may have composite_marker::none in sstables
// from older versions of Cassandra (see CASSANDRA-7593) and we also
// saw composite_marker::none in end_col. Also, when a larger range
// tombstone was split (see explanation above), we can have a
// start_range in end_col or end_range in start_col.
// So we don't check the markers' content at all here, only if they
// are sane.
check_marker(start_col);
check_marker(end_col);
check_marker(end_col, composite_marker::end_range);
// Some versions of Cassandra will write a 0 to mark the start of the range.
// CASSANDRA-7593 discusses that.
check_marker(start_col, composite_marker::start_range, composite_marker::none);
bytes new_start = {};
new_start = update_range_tombstone_merger(start_col, end_col, deltime);
if (new_start.empty()) {
return;
// FIXME: CASSANDRA-6237 says support will be added to things like this.
//
// The check below represents a range with a different start and end
// clustering key. Cassandra-generated files (to the moment) will
// generate multi-row deletes, but they always have the same clustering
// key. This is basically because one can't (yet) write delete
// statements in which the WHERE clause looks like WHERE clustering_key >= x.
//
// We don't really have it in our model ATM, so let's just mark this unimplemented.
//
// The only expected difference between them, is the final marker. We
// will remove it from end_col to ease the comparison, but will leave
// start_col untouched to make sure explode() still works.
end_col.remove_suffix(1);
if (start_col.compare(0, end_col.size(), end_col)) {
fail(unimplemented::cause::RANGE_DELETES);
}
start_col = bytes_view(new_start);
auto start = composite_view(column::fix_static_name(start_col)).explode();
// Note how this is slightly different from the check in is_collection. Collection tombstones
// do not have extra data.
//
@@ -637,13 +466,6 @@ sstables::sstable::read_row(schema_ptr schema, const sstables::key& key, const i
auto token = partitioner.get_token(key_view(key));
auto& summary = _summary;
if (token < partitioner.get_token(key_view(summary.first_key.value))
|| token > partitioner.get_token(key_view(summary.last_key.value))) {
_filter_tracker.add_false_positive();
return make_ready_future<mutation_opt>();
}
auto summary_idx = adjust_binary_search_index(binary_search(summary.entries, key, token));
if (summary_idx < 0) {
_filter_tracker.add_false_positive();


@@ -361,13 +361,9 @@ future<> data_consume_context::read() {
data_consume_context sstable::data_consume_rows(
row_consumer& consumer, uint64_t start, uint64_t end) {
// TODO: The second "end - start" below is redundant: The first one tells
// data_stream() to stop at the "end" byte, which allows optimal read-
// ahead and avoiding over-read at the end. The second one tells the
// consumer to stop at exactly the same place, and forces the consumer
// to maintain its own byte count.
auto estimated_size = std::min(uint64_t(sstable_buffer_size), align_up(end - start, uint64_t(8 << 10)));
return std::make_unique<data_consume_context::impl>(
consumer, data_stream(start, end - start, consumer.io_priority()), end - start);
consumer, data_stream_at(start, std::max<size_t>(estimated_size, 8192), consumer.io_priority()), end - start);
}
data_consume_context sstable::data_consume_rows(row_consumer& consumer) {


@@ -30,7 +30,6 @@
#include "core/shared_ptr.hh"
#include "core/do_with.hh"
#include "core/thread.hh"
#include <seastar/core/shared_future.hh>
#include <iterator>
#include "types.hh"
@@ -45,9 +44,6 @@
#include <boost/filesystem/operations.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/range/adaptor/map.hpp>
#include <boost/range/algorithm_ext/insert.hpp>
#include <boost/range/algorithm_ext/push_back.hpp>
#include <boost/range/algorithm/set_algorithm.hpp>
#include <regex>
#include <core/align.hh>
#include "utils/phased_barrier.hh"
@@ -63,12 +59,7 @@ future<file> new_sstable_component_file(sstring name, open_flags flags) {
});
}
future<file> new_sstable_component_file(sstring name, open_flags flags, file_open_options options) {
return open_file_dma(name, flags, options).handle_exception([name] (auto ep) {
sstlog.error("Could not create SSTable component {}. Found exception: {}", name, ep);
return make_exception_future<file>(ep);
});
}
thread_local std::unordered_map<sstring, std::unordered_set<unsigned>> sstable::_shards_agreeing_to_remove_sstable;
static utils::phased_barrier& background_jobs() {
static thread_local utils::phased_barrier gate;
@@ -691,10 +682,6 @@ inline void write(file_writer& out, estimated_histogram& eh) {
// This is small enough, and well-defined. Easier to just read it all
// at once
future<> sstable::read_toc() {
if (_components.size()) {
return make_ready_future<>();
}
auto file_path = filename(sstable::component_type::TOC);
sstlog.debug("Reading TOC file {} ", file_path);
@@ -725,7 +712,6 @@ future<> sstable::read_toc() {
try {
_components.insert(reverse_map(c, _component_map));
} catch (std::out_of_range& oor) {
_components.clear(); // so subsequent read_toc will be forced to fail again
throw malformed_sstable_exception("Unrecognized TOC component: " + c);
}
}
@@ -862,21 +848,22 @@ future<index_list> sstable::read_indexes(uint64_t summary_idx, const io_priority
uint64_t quantity = downsampling::get_effective_index_interval_after_index(summary_idx, _summary.header.sampling_level,
_summary.header.min_index_interval);
uint64_t end;
uint64_t estimated_size;
if (++summary_idx >= _summary.header.size) {
end = index_size();
estimated_size = index_size() - position;
} else {
end = _summary.entries[summary_idx].position;
estimated_size = _summary.entries[summary_idx].position - position;
}
return do_with(index_consumer(quantity), [this, position, end, &pc] (index_consumer& ic) {
estimated_size = std::min(uint64_t(sstable_buffer_size), align_up(estimated_size, uint64_t(8 << 10)));
estimated_size = std::max<size_t>(estimated_size, 8192);
return do_with(index_consumer(quantity), [this, position, estimated_size, &pc] (index_consumer& ic) {
file_input_stream_options options;
options.buffer_size = sstable_buffer_size;
options.buffer_size = estimated_size;
options.io_priority_class = pc;
auto stream = make_file_input_stream(this->_index_file, position, end - position, std::move(options));
// TODO: it's redundant to constrain the consumer here to stop at
// index_size()-position, the input stream is already constrained.
auto ctx = make_lw_shared<index_consume_entry_context<index_consumer>>(ic, std::move(stream), this->index_size() - position);
auto stream = make_file_input_stream(this->_index_file, position, std::move(options));
auto ctx = make_lw_shared<index_consume_entry_context>(ic, std::move(stream), this->index_size() - position);
return ctx->consume_input(*ctx).then([ctx, &ic] {
return make_ready_future<index_list>(std::move(ic.indexes));
});
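The buffer sizing above (round the estimated read size up to 8 KiB, clamp between 8 KiB and the sstable buffer size) can be sketched as follows, assuming a 128 KiB `sstable_buffer_size` for illustration:

```cpp
#include <algorithm>
#include <cstdint>

// Illustrative value; the real constant is sstable_buffer_size in sstables.cc.
constexpr uint64_t sstable_buffer_size = 128 << 10;

// Round v up to the next multiple of a (a must be a power of two).
uint64_t align_up(uint64_t v, uint64_t a) {
    return (v + a - 1) & ~(a - 1);
}

// Sketch of the estimated_size computation in read_indexes(): align the
// estimate up to 8 KiB, cap it at the sstable buffer size, and never go
// below 8 KiB.
uint64_t read_buffer_size(uint64_t estimated) {
    uint64_t sz = std::min(sstable_buffer_size,
                           align_up(estimated, uint64_t(8 << 10)));
    return std::max<uint64_t>(sz, 8192);
}
```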
@@ -948,25 +935,6 @@ void sstable::write_statistics(const io_priority_class& pc) {
write_simple<component_type::Statistics>(_statistics, pc);
}
future<> sstable::read_summary(const io_priority_class& pc) {
if (_summary) {
return make_ready_future<>();
}
return read_toc().then([this, &pc] {
// We'll try to keep the main code path exception free, but if an exception does happen
// we can try to regenerate the Summary.
if (has_component(sstable::component_type::Summary)) {
return read_simple<component_type::Summary>(_summary, pc).handle_exception([this, &pc] (auto ep) {
sstlog.warn("Couldn't read summary file %s: %s. Recreating it.", this->filename(component_type::Summary), ep);
return this->generate_summary(pc);
});
} else {
return generate_summary(pc);
}
});
}
future<> sstable::open_data() {
return when_all(open_file_dma(filename(component_type::Index), open_flags::ro),
open_file_dma(filename(component_type::Data), open_flags::ro)).then([this] (auto files) {
@@ -997,10 +965,8 @@ future<> sstable::open_data() {
future<> sstable::create_data() {
auto oflags = open_flags::wo | open_flags::create | open_flags::exclusive;
file_open_options opt;
opt.extent_allocation_size_hint = 32 << 20;
return when_all(new_sstable_component_file(filename(component_type::Index), oflags),
new_sstable_component_file(filename(component_type::Data), oflags, opt)).then([this] (auto files) {
new_sstable_component_file(filename(component_type::Data), oflags)).then([this] (auto files) {
// FIXME: If both files could not be created, the first get below will
// throw an exception, and second get() will not be attempted, and
// we'll get a warning about the second future being destructed
@@ -1074,7 +1040,7 @@ static inline void update_cell_stats(column_stats& c_stats, uint64_t timestamp)
// Intended to write all cell components that follow column name.
void sstable::write_cell(file_writer& out, atomic_cell_view cell) {
// FIXME: counter cell isn't supported yet.
// FIXME: range tombstone and counter cells aren't supported yet.
uint64_t timestamp = cell.timestamp();
@@ -1237,9 +1203,10 @@ static void write_index_entry(file_writer& out, disk_string_view<uint16_t>& key,
write(out, key, pos, promoted_index_size);
}
static void prepare_summary(summary& s, uint64_t expected_partition_count, uint32_t min_index_interval) {
static void prepare_summary(summary& s, uint64_t expected_partition_count, const schema& schema) {
assert(expected_partition_count >= 1);
auto min_index_interval = schema.min_index_interval();
s.header.min_index_interval = min_index_interval;
s.header.sampling_level = downsampling::BASE_SAMPLING_LEVEL;
uint64_t max_expected_entries =
@@ -1256,7 +1223,8 @@ static void prepare_summary(summary& s, uint64_t expected_partition_count, uint3
static void seal_summary(summary& s,
std::experimental::optional<key>&& first_key,
std::experimental::optional<key>&& last_key) {
std::experimental::optional<key>&& last_key,
const schema& schema) {
s.header.size = s.entries.size();
s.header.size_at_full_sampling = s.header.size;
@@ -1345,7 +1313,7 @@ void sstable::do_write_components(::mutation_reader mr,
auto filter_fp_chance = schema->bloom_filter_fp_chance();
_filter = utils::i_filter::get_filter(estimated_partitions, filter_fp_chance);
prepare_summary(_summary, estimated_partitions, schema->min_index_interval());
prepare_summary(_summary, estimated_partitions, *schema);
// FIXME: we may need to set repaired_at stats at this point.
@@ -1425,7 +1393,7 @@ void sstable::do_write_components(::mutation_reader mr,
}
}
seal_summary(_summary, std::move(first_key), std::move(last_key));
seal_summary(_summary, std::move(first_key), std::move(last_key), *schema);
index->close().get();
_index_file = file(); // index->close() closed _index_file
@@ -1498,60 +1466,6 @@ future<> sstable::write_components(::mutation_reader mr,
});
}
future<> sstable::generate_summary(const io_priority_class& pc) {
if (_summary) {
return make_ready_future<>();
}
sstlog.info("Summary file {} not found. Generating Summary...", filename(sstable::component_type::Summary));
class summary_generator {
summary& _summary;
public:
std::experimental::optional<key> first_key, last_key;
summary_generator(summary& s) : _summary(s) {}
bool should_continue() {
return true;
}
void consume_entry(index_entry&& ie) {
maybe_add_summary_entry(_summary, ie.get_key_bytes(), ie.position());
if (!first_key) {
first_key = key(to_bytes(ie.get_key_bytes()));
} else {
last_key = key(to_bytes(ie.get_key_bytes()));
}
}
};
return open_file_dma(filename(component_type::Index), open_flags::ro).then([this, &pc] (file index_file) {
return do_with(std::move(index_file), [this, &pc] (file index_file) {
return index_file.size().then([this, &pc, index_file] (auto size) {
// An upper bound; the actual number of partitions is surely less than this.
auto estimated_partitions = size / sizeof(uint64_t);
// Since we don't have a summary, use a default min_index_interval, and if needed we'll resample
// later.
prepare_summary(_summary, estimated_partitions, 0x80);
file_input_stream_options options;
options.buffer_size = sstable_buffer_size;
options.io_priority_class = pc;
auto stream = make_file_input_stream(index_file, 0, size, std::move(options));
return do_with(summary_generator(_summary), [this, &pc, stream = std::move(stream), size] (summary_generator& s) mutable {
auto ctx = make_lw_shared<index_consume_entry_context<summary_generator>>(s, std::move(stream), size);
return ctx->consume_input(*ctx).then([this, ctx, &s] {
seal_summary(_summary, std::move(s.first_key), std::move(s.last_key));
});
});
}).then([index_file] () mutable {
return index_file.close().handle_exception([] (auto ep) {
sstlog.warn("sstable close index_file failed: {}", ep);
return make_exception_future<>(std::move(ep));
});
});
});
});
}
uint64_t sstable::data_size() const {
if (has_component(sstable::component_type::CompressionInfo)) {
return _compression.data_len;
@@ -1692,36 +1606,25 @@ sstable::component_type sstable::component_from_sstring(sstring &s) {
return reverse_map(s, _component_map);
}
// NOTE: Prefer using data_stream() if you know the byte position at which the
// read will stop. Knowing the end allows data_stream() to use a large
// read-ahead buffer before reaching the end without over-reading past it, so
// data_stream() is more efficient than data_stream_at().
input_stream<char> sstable::data_stream_at(uint64_t pos, uint64_t buf_size, const io_priority_class& pc) {
file_input_stream_options options;
options.buffer_size = buf_size;
options.io_priority_class = pc;
if (_compression) {
return make_compressed_file_input_stream(_data_file, &_compression,
pos, _compression.data_len - pos, std::move(options));
return make_compressed_file_input_stream(
_data_file, &_compression, pc, pos);
} else {
file_input_stream_options options;
options.buffer_size = buf_size;
options.io_priority_class = pc;
return make_file_input_stream(_data_file, pos, std::move(options));
}
}
input_stream<char> sstable::data_stream(uint64_t pos, size_t len, const io_priority_class& pc) {
file_input_stream_options options;
options.buffer_size = sstable_buffer_size;
options.io_priority_class = pc;
if (_compression) {
return make_compressed_file_input_stream(_data_file, &_compression,
pos, len, std::move(options));
} else {
return make_file_input_stream(_data_file, pos, len, std::move(options));
}
}
// FIXME: to read a specific byte range, we shouldn't use the input stream
// interface - it may cause too much read when we intend to read a small
// range, and too small reads, and repeated waits, when reading a large range
// which we should have started at once.
future<temporary_buffer<char>> sstable::data_read(uint64_t pos, size_t len, const io_priority_class& pc) {
return do_with(data_stream(pos, len, pc), [len] (auto& stream) {
auto estimated_size = std::min(uint64_t(sstable_buffer_size), align_up(len, uint64_t(8 << 10)));
return do_with(data_stream_at(pos, estimated_size, pc), [len] (auto& stream) {
return stream.read_exactly(len);
});
}
@@ -1817,7 +1720,7 @@ sstable::~sstable() {
// clean up unused sstables, and because we'll never reuse the same
// generation number anyway.
try {
delete_atomically({sstable_to_delete(filename(component_type::TOC), _shared)}).handle_exception(
shared_remove_by_toc_name(filename(component_type::TOC), _shared).handle_exception(
[op = background_jobs().start()] (std::exception_ptr eptr) {
sstlog.warn("Exception when deleting sstable file: {}", eptr);
});
@@ -1833,6 +1736,26 @@ dirname(sstring fname) {
return boost::filesystem::canonical(std::string(fname)).parent_path().string();
}
future<>
sstable::shared_remove_by_toc_name(sstring toc_name, bool shared) {
if (!shared) {
return remove_by_toc_name(toc_name);
} else {
auto shard = std::hash<sstring>()(toc_name) % smp::count;
return smp::submit_to(shard, [toc_name, src_shard = engine().cpu_id()] {
auto& remove_set = _shards_agreeing_to_remove_sstable[toc_name];
remove_set.insert(src_shard);
auto counter = remove_set.size();
if (counter == smp::count) {
_shards_agreeing_to_remove_sstable.erase(toc_name);
return remove_by_toc_name(toc_name);
} else {
return make_ready_future<>();
}
});
}
}
future<>
fsync_directory(sstring fname) {
return open_directory(dirname(fname)).then([] (file f) {
@@ -1845,23 +1768,16 @@ fsync_directory(sstring fname) {
future<>
remove_by_toc_name(sstring sstable_toc_name) {
return seastar::async([sstable_toc_name] {
sstring prefix = sstable_toc_name.substr(0, sstable_toc_name.size() - TOC_SUFFIX.size());
auto new_toc_name = prefix + TEMPORARY_TOC_SUFFIX;
sstring dir;
if (file_exists(sstable_toc_name).get0()) {
dir = dirname(sstable_toc_name);
rename_file(sstable_toc_name, new_toc_name).get();
fsync_directory(dir).get();
} else {
dir = dirname(new_toc_name);
}
auto toc_file = open_file_dma(new_toc_name, open_flags::ro).get0();
auto dir = dirname(sstable_toc_name);
auto toc_file = open_file_dma(sstable_toc_name, open_flags::ro).get0();
auto in = make_file_input_stream(toc_file);
auto size = toc_file.size().get0();
auto text = in.read_exactly(size).get0();
in.close().get();
sstring prefix = sstable_toc_name.substr(0, sstable_toc_name.size() - TOC_SUFFIX.size());
auto new_toc_name = prefix + TEMPORARY_TOC_SUFFIX;
rename_file(sstable_toc_name, new_toc_name).get();
fsync_directory(dir).get();
std::vector<sstring> components;
sstring all(text.begin(), text.end());
boost::split(components, all, boost::is_any_of("\n"));
@@ -1874,58 +1790,13 @@ remove_by_toc_name(sstring sstable_toc_name) {
// already deleted
return make_ready_future<>();
}
auto fname = prefix + component;
return remove_file(prefix + component).then_wrapped([fname = std::move(fname)] (future<> f) {
// forgive ENOENT, since the component may not have been written;
try {
f.get();
} catch (std::system_error& e) {
if (e.code() != std::error_code(ENOENT, std::system_category())) {
throw;
}
sstlog.debug("Forgiving ENOENT when deleting file {}", fname);
}
return make_ready_future<>();
});
return remove_file(prefix + component);
}).get();
fsync_directory(dir).get();
remove_file(new_toc_name).get();
});
}
future<>
sstable::mark_for_deletion_on_disk() {
mark_for_deletion();
auto toc_name = filename(component_type::TOC);
auto shard = std::hash<sstring>()(toc_name) % smp::count;
return smp::submit_to(shard, [toc_name] {
static thread_local std::unordered_set<sstring> renaming;
if (renaming.count(toc_name) > 0) {
return make_ready_future<>();
}
renaming.emplace(toc_name);
return seastar::async([toc_name] {
if (!file_exists(toc_name).get0()) {
return; // already gone
}
auto dir = dirname(toc_name);
auto toc_file = open_file_dma(toc_name, open_flags::ro).get0();
sstring prefix = toc_name.substr(0, toc_name.size() - TOC_SUFFIX.size());
auto new_toc_name = prefix + TEMPORARY_TOC_SUFFIX;
rename_file(toc_name, new_toc_name).get();
fsync_directory(dir).get();
}).finally([toc_name] {
renaming.erase(toc_name);
});
});
}
future<>
sstable::remove_sstable_with_temp_toc(sstring ks, sstring cf, sstring dir, int64_t generation, version_types v, format_types f) {
return seastar::async([ks, cf, dir, generation, v, f] {
@@ -1968,11 +1839,12 @@ sstable::remove_sstable_with_temp_toc(sstring ks, sstring cf, sstring dir, int64
}
future<range<partition_key>>
sstable::get_sstable_key_range(const schema& s) {
auto fut = read_summary(default_priority_class());
return std::move(fut).then([this, &s] () mutable {
auto first = get_first_partition_key(s);
auto last = get_last_partition_key(s);
sstable::get_sstable_key_range(const schema& s, sstring ks, sstring cf, sstring dir, int64_t generation, version_types v, format_types f) {
auto sst = std::make_unique<sstable>(ks, cf, dir, generation, v, f);
auto fut = sst->read_summary(default_priority_class());
return std::move(fut).then([sst = std::move(sst), &s] () mutable {
auto first = sst->get_first_partition_key(s);
auto last = sst->get_last_partition_key(s);
return make_ready_future<range<partition_key>>(range<partition_key>::make(first, last));
});
}
@@ -1982,170 +1854,4 @@ void sstable::mark_sstable_for_deletion(sstring ks, sstring cf, sstring dir, int
sst.mark_for_deletion();
}
std::ostream&
operator<<(std::ostream& os, const sstable_to_delete& std) {
return os << std.name << "(" << (std.shared ? "shared" : "unshared") << ")";
}
using shards_agreeing_to_delete_sstable_type = std::unordered_set<shard_id>;
using sstables_to_delete_atomically_type = std::set<sstring>;
struct pending_deletion {
sstables_to_delete_atomically_type names;
std::vector<lw_shared_ptr<promise<>>> completions;
};
static thread_local bool g_atomic_deletions_cancelled = false;
static thread_local std::list<lw_shared_ptr<pending_deletion>> g_atomic_deletion_sets;
static thread_local std::unordered_map<sstring, shards_agreeing_to_delete_sstable_type> g_shards_agreeing_to_delete_sstable;
static logging::logger deletion_logger("sstable-deletion");
static
future<>
do_delete_atomically(std::vector<sstable_to_delete> atomic_deletion_set, unsigned deleting_shard) {
// runs on shard 0 only
deletion_logger.debug("shard {} atomically deleting {}", deleting_shard, atomic_deletion_set);
if (g_atomic_deletions_cancelled) {
deletion_logger.debug("atomic deletions disabled, erroring out");
throw std::runtime_error(sprint("atomic deletions disabled; not deleting %s", atomic_deletion_set));
}
// Insert atomic_deletion_set into the list of sets pending deletion. If the new set
// overlaps with an existing set, merge them (the merged set will be deleted atomically).
std::list<lw_shared_ptr<pending_deletion>> new_atomic_deletion_sets;
auto merged_set = make_lw_shared(pending_deletion());
for (auto&& sst_to_delete : atomic_deletion_set) {
merged_set->names.insert(sst_to_delete.name);
if (!sst_to_delete.shared) {
for (auto shard : boost::irange<shard_id>(0, smp::count)) {
g_shards_agreeing_to_delete_sstable[sst_to_delete.name].insert(shard);
}
}
}
merged_set->completions.push_back(make_lw_shared<promise<>>());
auto ret = merged_set->completions.back()->get_future();
for (auto&& old_set : g_atomic_deletion_sets) {
auto intersection = sstables_to_delete_atomically_type();
boost::set_intersection(merged_set->names, old_set->names, std::inserter(intersection, intersection.end()));
if (intersection.empty()) {
// We copy old_set to avoid corrupting g_atomic_deletion_sets if we fail
// further on.
new_atomic_deletion_sets.push_back(old_set);
} else {
deletion_logger.debug("merging with {}", old_set->names);
merged_set->names.insert(old_set->names.begin(), old_set->names.end());
boost::push_back(merged_set->completions, old_set->completions);
}
}
deletion_logger.debug("new atomic set: {}", merged_set->names);
new_atomic_deletion_sets.push_back(merged_set);
// can now exception-safely commit:
g_atomic_deletion_sets = std::move(new_atomic_deletion_sets);
// Mark each sstable as being deleted from deleting_shard. We have to do
// this in a separate pass, so that the decision below on whether we can
// delete sees all the data from this pass.
for (auto&& sst : atomic_deletion_set) {
g_shards_agreeing_to_delete_sstable[sst.name].insert(deleting_shard);
}
// Figure out if the (possibly merged) set can be deleted
for (auto&& sst : merged_set->names) {
if (g_shards_agreeing_to_delete_sstable[sst].size() != smp::count) {
// Not everyone agrees, leave the set pending
deletion_logger.debug("deferring deletion until all shards agree");
return ret;
}
}
// Cannot recover from a failed deletion
g_atomic_deletion_sets.pop_back();
for (auto&& name : merged_set->names) {
g_shards_agreeing_to_delete_sstable.erase(name);
}
// Everyone agrees, let's delete
// FIXME: this needs to be done atomically (using a log file of sstables we intend to delete)
parallel_for_each(merged_set->names, [] (sstring name) {
deletion_logger.debug("deleting {}", name);
return remove_by_toc_name(name);
}).then_wrapped([merged_set] (future<> result) {
deletion_logger.debug("atomic deletion completed: {}", merged_set->names);
shared_future<> sf(std::move(result));
for (auto&& comp : merged_set->completions) {
sf.get_future().forward_to(std::move(*comp));
}
});
return ret;
}
struct pending_shard_deletes {
std::unordered_map<int, promise<>> pending_deletes;
int idgen = 0;
future<> delete_atomically(std::vector<sstable_to_delete> atomic_deletion_set);
void acknowledge(int id, std::exception_ptr ex);
};
static thread_local pending_shard_deletes this_shard_deletes;
future<>
pending_shard_deletes::delete_atomically(std::vector<sstable_to_delete> atomic_deletion_set) {
auto i = pending_deletes.emplace(idgen++, promise<>()).first;
auto idx = i->first;
auto fut = i->second.get_future();
auto deleting_shard = engine().cpu_id();
smp::submit_to(0, [atomic_deletion_set, deleting_shard, idx] {
futurize<void>::apply(do_delete_atomically, atomic_deletion_set, deleting_shard).then_wrapped([deleting_shard, idx] (future<> ret) {
std::exception_ptr ex;
if (ret.failed()) {
ex = ret.get_exception();
}
return smp::submit_to(deleting_shard, [idx, ex] () mutable {
this_shard_deletes.acknowledge(idx, ex);
});
});
});
return fut;
}
void
pending_shard_deletes::acknowledge(int idx, std::exception_ptr ex) {
auto i = pending_deletes.find(idx);
auto& pr = i->second;
if (ex) {
pr.set_exception(ex);
} else {
pr.set_value();
}
pending_deletes.erase(i);
}
future<>
delete_atomically(std::vector<sstable_to_delete> ssts) {
return this_shard_deletes.delete_atomically(std::move(ssts));
}
future<>
delete_atomically(std::vector<shared_sstable> ssts) {
std::vector<sstable_to_delete> sstables_to_delete_atomically;
for (auto&& sst : ssts) {
sstables_to_delete_atomically.push_back({sst->toc_filename(), sst->is_shared()});
}
return delete_atomically(std::move(sstables_to_delete_atomically));
}
void
cancel_atomic_deletions() {
g_atomic_deletions_cancelled = true;
for (auto&& pd : g_atomic_deletion_sets) {
for (auto&& c : pd->completions) {
c->set_exception(std::runtime_error(sprint("Atomic sstable deletions cancelled; not deleting %s", pd->names)));
}
}
g_atomic_deletion_sets.clear();
g_shards_agreeing_to_delete_sstable.clear();
}
}


@@ -245,8 +245,6 @@ public:
_marked_for_deletion = true;
}
future<> mark_for_deletion_on_disk();
bool marked_for_deletion() const {
return _marked_for_deletion;
}
@@ -341,9 +339,11 @@ private:
void prepare_write_components(::mutation_reader mr,
uint64_t estimated_partitions, schema_ptr schema, uint64_t max_sstable_size,
const io_priority_class& pc);
static future<> shared_remove_by_toc_name(sstring toc_name, bool shared);
static std::unordered_map<version_types, sstring, enum_hash<version_types>> _version_string;
static std::unordered_map<format_types, sstring, enum_hash<format_types>> _format_string;
static std::unordered_map<component_type, sstring, enum_hash<component_type>> _component_map;
static thread_local std::unordered_map<sstring, std::unordered_set<unsigned>> _shards_agreeing_to_remove_sstable;
std::unordered_set<component_type, enum_hash<component_type>> _components;
@@ -397,16 +397,13 @@ private:
void write_filter(const io_priority_class& pc);
future<> read_summary(const io_priority_class& pc);
future<> read_summary(const io_priority_class& pc) {
return read_simple<component_type::Summary>(_summary, pc);
}
void write_summary(const io_priority_class& pc) {
write_simple<component_type::Summary>(_summary, pc);
}
// To be called when we try to load an SSTable that lacks a Summary. Could
// happen if old tools are being used.
future<> generate_summary(const io_priority_class& pc);
future<> read_statistics(const io_priority_class& pc);
void write_statistics(const io_priority_class& pc);
@@ -416,16 +413,6 @@ private:
input_stream<char> data_stream_at(uint64_t pos, uint64_t buf_size, const io_priority_class& pc);
// Return an input_stream which reads exactly the specified byte range
// from the data file (after uncompression, if the file is compressed).
// Unlike data_read() below, this method does not read the entire byte
// range into memory all at once. Rather, this method allows reading the
// data incrementally as a stream. Knowing in advance the exact amount
// of bytes to be read using this stream, we can make better choices
// about the buffer size to read, and where exactly to stop reading
// (even when a large buffer size is used).
input_stream<char> data_stream(uint64_t pos, size_t len, const io_priority_class& pc);
// Read exactly the specific byte range from the data file (after
// uncompression, if the file is compressed). This can be used to read
// a specific row from the data file (its position and length can be
@@ -536,8 +523,8 @@ public:
}
// Return sstable key range as range<partition_key> reading only the summary component.
future<range<partition_key>>
get_sstable_key_range(const schema& s);
static future<range<partition_key>>
get_sstable_key_range(const schema& s, sstring ks, sstring cf, sstring dir, int64_t generation, version_types v, format_types f);
// Used to mark a sstable for deletion that is not relevant to the current shard.
// It doesn't mean that the sstable will be deleted, but that the sstable is not
@@ -584,31 +571,4 @@ future<> await_background_jobs();
// Invokes await_background_jobs() on all shards
future<> await_background_jobs_on_all_shards();
struct sstable_to_delete {
sstable_to_delete(sstring name, bool shared) : name(std::move(name)), shared(shared) {}
sstring name;
bool shared = false;
friend std::ostream& operator<<(std::ostream& os, const sstable_to_delete& std);
};
// When we compact sstables, we have to atomically instantiate the new
// sstable and delete the old ones. Otherwise, if we compact A+B into C,
// and if A contained some data that was tombstoned by B, and if B was
// deleted but A survived, then data from A will be resurrected.
//
// There are two violators of the requirement to atomically delete
// sstables: first sstable instantiation and deletion on disk is atomic
// only wrt. itself, not other sstables, and second when an sstable is
// shared among shards, so actual on-disk deletion of an sstable is deferred
// until all shards agree it can be deleted.
//
// This function only solves the second problem for now.
future<> delete_atomically(std::vector<shared_sstable> ssts);
future<> delete_atomically(std::vector<sstable_to_delete> ssts);
// Cancel any deletions scheduled by delete_atomically() and make their
// futures complete
void cancel_atomic_deletions();
}


@@ -144,10 +144,6 @@ struct summary_ka {
uint64_t memory_footprint() const {
return sizeof(summary_entry) * entries.size() + sizeof(uint32_t) * positions.size() + sizeof(*this);
}
explicit operator bool() const {
return entries.size();
}
};
using summary = summary_ka;
@@ -266,13 +262,6 @@ struct deletion_time {
(marked_for_delete_at == std::numeric_limits<int64_t>::min());
}
bool operator==(const deletion_time& d) {
return local_deletion_time == d.local_deletion_time &&
marked_for_delete_at == d.marked_for_delete_at;
}
bool operator!=(const deletion_time& d) {
return !(*this == d);
}
explicit operator tombstone() {
return tombstone(marked_for_delete_at, gc_clock::time_point(gc_clock::duration(local_deletion_time)));
}


@@ -40,7 +40,6 @@
#include "streaming/stream_manager.hh"
#include "streaming/stream_result_future.hh"
#include "log.hh"
#include "streaming/stream_session_state.hh"
namespace streaming {
@@ -216,32 +215,4 @@ future<stream_bytes> stream_manager::get_progress_on_all_shards() {
);
}
void stream_manager::fail_sessions(inet_address endpoint) {
for (auto sr : get_all_streams()) {
for (auto session : sr->get_coordinator()->get_all_stream_sessions()) {
if (session->peer == endpoint) {
session->close_session(stream_session_state::FAILED);
}
}
}
}
void stream_manager::on_remove(inet_address endpoint) {
sslog.info("stream_manager: Close all stream_session with peer = {} in on_remove", endpoint);
get_stream_manager().invoke_on_all([endpoint] (auto& sm) {
sm.fail_sessions(endpoint);
}).handle_exception([endpoint] (auto ep) {
sslog.warn("stream_manager: Failed to close sessions with peer = {} in on_remove", endpoint);
});
}
void stream_manager::on_restart(inet_address endpoint, endpoint_state ep_state) {
sslog.info("stream_manager: Close all stream_session with peer = {} in on_restart", endpoint);
get_stream_manager().invoke_on_all([endpoint] (auto& sm) {
sm.fail_sessions(endpoint);
}).handle_exception([endpoint] (auto ep) {
sslog.warn("stream_manager: Failed to close sessions with peer = {} in on_restart", endpoint);
});
}
} // namespace streaming


@@ -41,10 +41,6 @@
#include "core/shared_ptr.hh"
#include "core/distributed.hh"
#include "utils/UUID.hh"
#include "gms/i_endpoint_state_change_subscriber.hh"
#include "gms/inet_address.hh"
#include "gms/endpoint_state.hh"
#include "gms/application_state.hh"
#include <seastar/core/semaphore.hh>
#include <map>
@@ -78,12 +74,8 @@ struct stream_bytes {
*
* All stream operation should be created through this class to track streaming status and progress.
*/
class stream_manager : public gms::i_endpoint_state_change_subscriber, public enable_shared_from_this<stream_manager> {
class stream_manager {
using UUID = utils::UUID;
using inet_address = gms::inet_address;
using endpoint_state = gms::endpoint_state;
using application_state = gms::application_state;
using versioned_value = gms::versioned_value;
/*
* Currently running streams. Removed after completion/failure.
* We manage them in two different maps to distinguish plan from initiated ones to
@@ -142,18 +134,6 @@ public:
future<stream_bytes> get_progress_on_all_shards(gms::inet_address peer);
future<stream_bytes> get_progress_on_all_shards();
public:
virtual void on_join(inet_address endpoint, endpoint_state ep_state) override {}
virtual void before_change(inet_address endpoint, endpoint_state current_state, application_state new_state_key, const versioned_value& new_value) override {}
virtual void on_change(inet_address endpoint, application_state state, const versioned_value& value) override {}
virtual void on_alive(inet_address endpoint, endpoint_state state) override {}
virtual void on_dead(inet_address endpoint, endpoint_state state) override {}
virtual void on_remove(inet_address endpoint) override;
virtual void on_restart(inet_address endpoint, endpoint_state ep_state) override;
private:
void fail_sessions(inet_address endpoint);
};
extern distributed<stream_manager> _the_stream_manager;


@@ -103,6 +103,8 @@ void stream_session::init_messaging_service_handler() {
auto session = get_session(plan_id, from, "PREPARE_MESSAGE");
session->init(sr);
session->dst_cpu_id = src_cpu_id;
sslog.debug("[Stream #{}] GOT PREPARE_MESSAGE from {}: get session peer={}, dst_cpu_id={}",
session->plan_id(), from, session->peer, session->dst_cpu_id);
return session->prepare(std::move(msg.requests), std::move(msg.summaries));
});
});
@@ -119,29 +121,8 @@ void stream_session::init_messaging_service_handler() {
return do_with(std::move(fm), [plan_id, from] (const auto& fm) {
auto fm_size = fm.representation().size();
get_local_stream_manager().update_progress(plan_id, from.addr, progress_info::direction::IN, fm_size);
return service::get_schema_for_write(fm.schema_version(), from).then([plan_id, from, &fm] (schema_ptr s) {
auto cf_id = fm.column_family_id();
sslog.debug("[Stream #{}] GOT STREAM_MUTATION from {}: cf_id={}", plan_id, from.addr, cf_id);
auto& db = service::get_local_storage_proxy().get_db().local();
if (!db.column_family_exists(cf_id)) {
sslog.warn("[Stream #{}] STREAM_MUTATION from {}: cf_id={} is missing, assume the table is dropped",
plan_id, from.addr, cf_id);
return make_ready_future<>();
}
return service::get_storage_proxy().local().mutate_streaming_mutation(std::move(s), fm).then_wrapped([plan_id, cf_id, from] (auto&& f) {
try {
f.get();
return make_ready_future<>();
} catch (no_such_column_family) {
sslog.warn("[Stream #{}] STREAM_MUTATION from {}: cf_id={} is missing, assume the table is dropped",
plan_id, from.addr, cf_id);
return make_ready_future<>();
} catch (...) {
throw;
}
return make_ready_future<>();
});
return service::get_schema_for_write(fm.schema_version(), from).then([&fm] (schema_ptr s) {
return service::get_storage_proxy().local().mutate_locally(std::move(s), fm);
});
});
});
@@ -149,29 +130,12 @@ void stream_session::init_messaging_service_handler() {
const auto& from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
return smp::submit_to(dst_cpu_id, [ranges = std::move(ranges), plan_id, cf_id, from] () mutable {
auto session = get_session(plan_id, from, "STREAM_MUTATION_DONE", cf_id);
return session->get_db().invoke_on_all([ranges = std::move(ranges), plan_id, from, cf_id] (database& db) {
if (!db.column_family_exists(cf_id)) {
sslog.warn("[Stream #{}] STREAM_MUTATION_DONE from {}: cf_id={} is missing, assume the table is dropped",
plan_id, from, cf_id);
return make_ready_future<>();
session->receive_task_completed(cf_id);
return session->get_db().invoke_on_all([ranges = std::move(ranges), cf_id] (database& db) {
auto& cf = db.find_column_family(cf_id);
for (auto& range : ranges) {
cf.get_row_cache().invalidate(query::to_partition_range(range));
}
std::vector<query::partition_range> query_ranges;
try {
auto& cf = db.find_column_family(cf_id);
query_ranges.reserve(ranges.size());
for (auto& range : ranges) {
query_ranges.push_back(query::to_partition_range(range));
}
return cf.flush_streaming_mutations(std::move(query_ranges));
} catch (no_such_column_family) {
sslog.warn("[Stream #{}] STREAM_MUTATION_DONE from {}: cf_id={} is missing, assume the table is dropped",
plan_id, from, cf_id);
return make_ready_future<>();
} catch (...) {
throw;
}
}).then([session, cf_id] {
session->receive_task_completed(cf_id);
});
});
});
@@ -202,13 +166,68 @@ future<> stream_session::init_streaming_service(distributed<database>& db) {
// return get_stream_manager().stop();
// });
return get_stream_manager().start().then([] {
gms::get_local_gossiper().register_(get_local_stream_manager().shared_from_this());
return _db->invoke_on_all([] (auto& db) {
init_messaging_service_handler();
});
});
}
future<> stream_session::test(distributed<cql3::query_processor>& qp) {
if (utils::fb_utilities::get_broadcast_address() == inet_address("127.0.0.1")) {
auto tester = make_shared<timer<lowres_clock>>();
tester->set_callback ([tester, &qp] {
seastar::async([&qp] {
sslog.debug("================ STREAM_PLAN TEST ==============");
auto cs = service::client_state::for_external_calls();
service::query_state qs(cs);
auto& opts = cql3::query_options::DEFAULT;
qp.local().process("CREATE KEYSPACE ks WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 };", qs, opts).get();
sslog.debug("CREATE KEYSPACE = KS DONE");
sleep(std::chrono::seconds(3)).get();
qp.local().process("CREATE TABLE ks.tb ( key text PRIMARY KEY, C0 text, C1 text, C2 text, C3 blob, C4 text);", qs, opts).get();
sslog.debug("CREATE TABLE = TB DONE");
sleep(std::chrono::seconds(3)).get();
qp.local().process("insert into ks.tb (key,c0) values ('1','1');", qs, opts).get();
sslog.debug("INSERT VALUE DONE: 1");
qp.local().process("insert into ks.tb (key,c0) values ('2','2');", qs, opts).get();
sslog.debug("INSERT VALUE DONE: 2");
qp.local().process("insert into ks.tb (key,c0) values ('3','3');", qs, opts).get();
sslog.debug("INSERT VALUE DONE: 3");
qp.local().process("insert into ks.tb (key,c0) values ('4','4');", qs, opts).get();
sslog.debug("INSERT VALUE DONE: 4");
qp.local().process("insert into ks.tb (key,c0) values ('5','5');", qs, opts).get();
sslog.debug("INSERT VALUE DONE: 5");
qp.local().process("insert into ks.tb (key,c0) values ('6','6');", qs, opts).get();
sslog.debug("INSERT VALUE DONE: 6");
}).then([] {
sleep(std::chrono::seconds(10)).then([] {
sslog.debug("================ START STREAM ==============");
auto sp = stream_plan("MYPLAN");
auto to = inet_address("127.0.0.2");
auto tb = sstring("tb");
auto ks = sstring("ks");
std::vector<query::range<token>> ranges = {query::range<token>::make_open_ended_both_sides()};
std::vector<sstring> cfs{tb};
sp.transfer_ranges(to, ks, ranges, cfs).request_ranges(to, ks, ranges, cfs).execute().then_wrapped([] (auto&& f) {
try {
auto state = f.get0();
sslog.debug("plan_id={} description={} DONE", state.plan_id, state.description);
sslog.debug("================ FINISH STREAM ==============");
} catch (const stream_exception& e) {
auto& state = e.state;
sslog.debug("plan_id={} description={} FAIL: {}", state.plan_id, state.description, e.what());
sslog.error("================ FAIL STREAM ==============");
}
});
});
});
});
tester->arm(std::chrono::seconds(10));
}
return make_ready_future<>();
}
future<> stream_session::on_initialization_complete() {
// send prepare message
set_state(stream_session_state::PREPARING);
@@ -412,7 +431,7 @@ std::vector<column_family*> stream_session::get_column_family_stores(const sstri
auto cf_name = cf.schema()->cf_name();
auto ks_name = cf.schema()->ks_name();
if (ks_name == keyspace) {
sslog.debug("Find ks={} cf={}", ks_name, cf_name);
sslog.info("Find ks={} cf={}", ks_name, cf_name);
stores.push_back(&cf);
}
}


@@ -126,12 +126,15 @@ class stream_result_future;
* session is done it is closed (closeSession()). Otherwise, the node switches to the WAIT_COMPLETE state and
* send a CompleteMessage to the other side.
*/
class stream_session : public enable_shared_from_this<stream_session> {
class stream_session : public gms::i_endpoint_state_change_subscriber, public enable_shared_from_this<stream_session> {
private:
using messaging_verb = net::messaging_verb;
using messaging_service = net::messaging_service;
using msg_addr = net::messaging_service::msg_addr;
using inet_address = gms::inet_address;
using endpoint_state = gms::endpoint_state;
using application_state = gms::application_state;
using versioned_value = gms::versioned_value;
using UUID = utils::UUID;
using token = dht::token;
using ring_position = dht::ring_position;
@@ -144,6 +147,7 @@ public:
static database& get_local_db() { return _db->local(); }
static distributed<database>& get_db() { return *_db; };
static future<> init_streaming_service(distributed<database>& db);
static future<> test(distributed<cql3::query_processor>& qp);
public:
/**
* Streaming endpoint.
@@ -256,7 +260,7 @@ public:
void add_transfer_ranges(sstring keyspace, std::vector<query::range<token>> ranges, std::vector<sstring> column_families);
std::vector<column_family*> get_column_family_stores(const sstring& keyspace, const std::vector<sstring>& column_families);
private:
void close_session(stream_session_state final_state);
public:
@@ -333,6 +337,16 @@ public:
void receive_task_completed(UUID cf_id);
void transfer_task_completed(UUID cf_id);
public:
virtual void on_join(inet_address endpoint, endpoint_state ep_state) override {}
virtual void before_change(inet_address endpoint, endpoint_state current_state, application_state new_state_key, const versioned_value& new_value) override {}
virtual void on_change(inet_address endpoint, application_state state, const versioned_value& value) override {}
virtual void on_alive(inet_address endpoint, endpoint_state state) override {}
virtual void on_dead(inet_address endpoint, endpoint_state state) override {}
virtual void on_remove(inet_address endpoint) override { close_session(stream_session_state::FAILED); }
virtual void on_restart(inet_address endpoint, endpoint_state ep_state) override { close_session(stream_session_state::FAILED); }
private:
void send_complete_message();
bool maybe_completed();


@@ -71,7 +71,6 @@ struct send_info {
uint32_t dst_cpu_id;
size_t mutations_nr{0};
semaphore mutations_done{0};
bool error_logged = false;
send_info(database& db_, utils::UUID plan_id_, utils::UUID cf_id_,
query::partition_range pr_, net::messaging_service::msg_addr id_,
uint32_t dst_cpu_id_)
@@ -88,18 +87,24 @@ future<stop_iteration> do_send_mutations(auto si, auto fm) {
return get_local_stream_manager().mutation_send_limiter().wait().then([si, fm = std::move(fm)] () mutable {
sslog.debug("[Stream #{}] SEND STREAM_MUTATION to {}, cf_id={}", si->plan_id, si->id, si->cf_id);
auto fm_size = fm.representation().size();
net::get_local_messaging_service().send_stream_mutation(si->id, si->plan_id, std::move(fm), si->dst_cpu_id).then([si, fm_size] {
sslog.debug("[Stream #{}] GOT STREAM_MUTATION Reply from {}", si->plan_id, si->id.addr);
get_local_stream_manager().update_progress(si->plan_id, si->id.addr, progress_info::direction::OUT, fm_size);
si->mutations_done.signal();
}).handle_exception([si] (auto ep) {
// There might be larger number of STREAM_MUTATION inflight.
// Log one error per column_family per range
if (!si->error_logged) {
si->error_logged = true;
sslog.error("[Stream #{}] stream_transfer_task: Fail to send STREAM_MUTATION to {}: {}", si->plan_id, si->id, ep);
net::get_local_messaging_service().send_stream_mutation(si->id, si->plan_id, std::move(fm), si->dst_cpu_id).then_wrapped([si, fm_size] (auto&& f) {
try {
f.get();
sslog.debug("[Stream #{}] GOT STREAM_MUTATION Reply from {}", si->plan_id, si->id.addr);
get_local_stream_manager().update_progress(si->plan_id, si->id.addr, progress_info::direction::OUT, fm_size);
si->mutations_done.signal();
} catch (std::exception& e) {
auto err = std::string(e.what());
// Seastar RPC does not provide exception type info, so we can not catch no_such_column_family here
// Need to compare the exception error msg
if (err.find("Can't find a column family with UUID") != std::string::npos) {
sslog.info("[Stream #{}] remote node {} does not have the cf_id = {}", si->plan_id, si->id, si->cf_id);
si->mutations_done.signal();
} else {
sslog.error("[Stream #{}] stream_transfer_task: Fail to send STREAM_MUTATION to {}: {}", si->plan_id, si->id, err);
si->mutations_done.broken();
}
}
si->mutations_done.broken();
}).finally([] {
get_local_stream_manager().mutation_send_limiter().signal();
});
@@ -109,11 +114,11 @@ future<stop_iteration> do_send_mutations(auto si, auto fm) {
future<> send_mutations(auto si) {
auto& cf = si->db.find_column_family(si->cf_id);
auto& priority = service::get_local_streaming_read_priority();
auto& priority = service::get_local_mutation_stream_priority();
return do_with(cf.make_reader(cf.schema(), si->pr, priority), [si] (auto& reader) {
return repeat([si, &reader] () {
return reader().then([si] (auto mopt) {
if (mopt && si->db.column_family_exists(si->cf_id)) {
if (mopt) {
si->mutations_nr++;
auto fm = frozen_mutation(*mopt);
return do_send_mutations(si, std::move(fm));
@@ -151,11 +156,7 @@ void stream_transfer_task::start() {
});
}).then([this, plan_id, cf_id, id] {
sslog.debug("[Stream #{}] SEND STREAM_MUTATION_DONE to {}, cf_id={}", plan_id, id, cf_id);
return session->ms().send_stream_mutation_done(id, plan_id, _ranges,
cf_id, session->dst_cpu_id).handle_exception([plan_id, id, cf_id] (auto ep) {
sslog.error("[Stream #{}] stream_transfer_task: Fail to send STREAM_MUTATION_DONE to {}: {}", plan_id, id, ep);
std::rethrow_exception(ep);
});
return session->ms().send_stream_mutation_done(id, plan_id, _ranges, cf_id, session->dst_cpu_id);
}).then([this, id, plan_id, cf_id] {
sslog.debug("[Stream #{}] GOT STREAM_MUTATION_DONE Reply from {}", plan_id, id.addr);
session->start_keep_alive_timer();
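The replacement hunk above cannot catch no_such_column_family by type, because the RPC layer delivers the remote failure flattened to its message text; the handler has to classify by substring instead. A hedged stand-alone sketch of that classification step (send_outcome and classify_send_error are hypothetical helper names, not names from the patch; only the matched message text comes from the diff):

```cpp
#include <stdexcept>
#include <string>

enum class send_outcome { missing_cf, error };

// Seastar RPC exposes only what() here, so classification is a substring
// match on the known no_such_column_family message text.
send_outcome classify_send_error(const std::exception& e) {
    std::string err = e.what();
    if (err.find("Can't find a column family with UUID") != std::string::npos) {
        return send_outcome::missing_cf; // benign: the remote dropped the table
    }
    return send_outcome::error;          // real failure: caller breaks the semaphore
}
```

In the patch the first outcome still signals mutations_done so the transfer completes quietly, while the second calls mutations_done.broken() to fail the waiting task.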


@@ -32,7 +32,6 @@ boost_tests = [
'types_test',
'keys_test',
'mutation_test',
'schema_registry_test',
'range_test',
'mutation_reader_test',
'cql_query_test',


@@ -2098,24 +2098,6 @@ SEASTAR_TEST_CASE(test_alter_table) {
});
});
}
SEASTAR_TEST_CASE(test_map_query) {
return do_with_cql_env([] (auto& e) {
return seastar::async([&e] {
e.execute_cql("CREATE TABLE xx (k int PRIMARY KEY, m map<text, int>);").get();
e.execute_cql("insert into xx (k, m) values (0, {'v2': 1});").get();
auto m_type = map_type_impl::get_instance(utf8_type, int32_type, true);
assert_that(e.execute_cql("select m from xx where k = 0;").get0())
.is_rows().with_rows({
{ make_map_value(m_type, map_type_impl::native_type({{sstring("v2"), 1}})).serialize() }
});
e.execute_cql("delete m['v2'] from xx where k = 0;").get();
assert_that(e.execute_cql("select m from xx where k = 0;").get0())
.is_rows().with_rows({{{}}});
});
});
}
SEASTAR_TEST_CASE(test_drop_table) {
return do_with_cql_env([] (auto& e) {
return seastar::async([&e] {
@@ -2127,40 +2109,6 @@ SEASTAR_TEST_CASE(test_drop_table) {
});
}
SEASTAR_TEST_CASE(test_reversed_slice_with_empty_range_before_all_rows) {
return do_with_cql_env([] (auto& e) {
return seastar::async([&e] {
e.execute_cql("CREATE TABLE test (a int, b int, c int, s1 int static, s2 int static, PRIMARY KEY (a, b));").get();
e.execute_cql("INSERT INTO test (a, b, c, s1, s2) VALUES (99, 0, 0, 17, 42);").get();
e.execute_cql("INSERT INTO test (a, b, c, s1, s2) VALUES (99, 1, 1, 17, 42);").get();
e.execute_cql("INSERT INTO test (a, b, c, s1, s2) VALUES (99, 2, 2, 17, 42);").get();
e.execute_cql("INSERT INTO test (a, b, c, s1, s2) VALUES (99, 3, 3, 17, 42);").get();
e.execute_cql("INSERT INTO test (a, b, c, s1, s2) VALUES (99, 4, 4, 17, 42);").get();
e.execute_cql("INSERT INTO test (a, b, c, s1, s2) VALUES (99, 5, 5, 17, 42);").get();
e.execute_cql("INSERT INTO test (a, b, c, s1, s2) VALUES (99, 6, 6, 17, 42);").get();
e.execute_cql("INSERT INTO test (a, b, c, s1, s2) VALUES (99, 7, 7, 17, 42);").get();
e.execute_cql("INSERT INTO test (a, b, c, s1, s2) VALUES (99, 8, 8, 17, 42);").get();
e.execute_cql("INSERT INTO test (a, b, c, s1, s2) VALUES (99, 9, 9, 17, 42);").get();
e.execute_cql("INSERT INTO test (a, b, c, s1, s2) VALUES (99, 10, 10, 17, 42);").get();
e.execute_cql("INSERT INTO test (a, b, c, s1, s2) VALUES (99, 11, 11, 17, 42);").get();
e.execute_cql("INSERT INTO test (a, b, c, s1, s2) VALUES (99, 12, 12, 17, 42);").get();
e.execute_cql("INSERT INTO test (a, b, c, s1, s2) VALUES (99, 13, 13, 17, 42);").get();
e.execute_cql("INSERT INTO test (a, b, c, s1, s2) VALUES (99, 14, 14, 17, 42);").get();
e.execute_cql("INSERT INTO test (a, b, c, s1, s2) VALUES (99, 15, 15, 17, 42);").get();
assert_that(e.execute_cql("select * from test WHERE a = 99 and b < 0 ORDER BY b DESC limit 2;").get0())
.is_rows().is_empty();
assert_that(e.execute_cql("select * from test WHERE a = 99 order by b desc;").get0())
.is_rows().with_size(16);
assert_that(e.execute_cql("select * from test;").get0())
.is_rows().with_size(16);
});
});
}
SEASTAR_TEST_CASE(test_alter_table_validation) {
return do_with_cql_env([] (auto& e) {
return e.execute_cql("create table tatv (p1 int, c1 int, c2 int, r1 int, r2 set<int>, PRIMARY KEY (p1, c1, c2));").discard_result().then_wrapped([&e] (auto f) {


@@ -333,6 +333,7 @@ public:
gms::get_gossiper().stop().get();
gms::get_failure_detector().stop().get();
net::get_messaging_service().stop().get();
_db->stop().get();
@@ -342,8 +343,6 @@ public:
sstables::await_background_jobs_on_all_shards().get();
net::get_messaging_service().stop().get();
bool old_active = true;
assert(active.compare_exchange_strong(old_active, false));
});


@@ -46,9 +46,9 @@ SEASTAR_TEST_CASE(test_boot_shutdown){
gms::get_gossiper().start().get();
gms::get_gossiper().stop().get();
gms::get_failure_detector().stop().get();
net::get_messaging_service().stop().get();
db.stop().get();
service::get_storage_service().stop().get();
net::get_messaging_service().stop().get();
locator::i_endpoint_snitch::stop_snitch().get();
});
}


@@ -108,7 +108,7 @@ SEASTAR_TEST_CASE(test_compaction_with_multiple_regions) {
}
});
size_t quarter = shard_tracker().region_occupancy().total_space() / 4;
size_t quarter = shard_tracker().occupancy().total_space() / 4;
shard_tracker().reclaim_all_free_segments();


@@ -181,7 +181,7 @@ static sizes calculate_sizes(const mutation& m) {
result.cache = tracker.region().occupancy().used_space();
result.frozen = freeze(m).representation().size();
result.canonical = canonical_mutation(m).representation().size();
result.query_result = m.query(partition_slice_builder(*s).build(), query::result_request::only_result).buf().size();
result.query_result = m.query(partition_slice_builder(*s).build()).buf().size();
tmpdir sstable_dir;
auto sst = make_lw_shared<sstables::sstable>(s->ks_name(), s->cf_name(),


@@ -298,15 +298,6 @@ static mutation_sets generate_mutation_sets() {
}
}
{
random_mutation_generator gen;
for (int i = 0; i < 10; ++i) {
auto m = gen();
result.unequal.emplace_back(mutations{m, gen()}); // collision unlikely
result.equal.emplace_back(mutations{m, m});
}
}
return result;
}
@@ -350,145 +341,3 @@ void for_each_mutation(std::function<void(const mutation&)> callback) {
}
}
}
bytes make_blob(size_t blob_size) {
static thread_local std::independent_bits_engine<std::default_random_engine, 8, uint8_t> random_bytes;
bytes big_blob(bytes::initialized_later(), blob_size);
for (auto&& b : big_blob) {
b = random_bytes();
}
return big_blob;
};
class random_mutation_generator::impl {
friend class random_mutation_generator;
const size_t _external_blob_size = 128; // Should be enough to force use of external bytes storage
const column_id column_count = row::max_vector_size * 2;
std::mt19937 _gen;
schema_ptr _schema;
std::vector<bytes> _blobs;
static gc_clock::time_point expiry_dist(auto& gen) {
static thread_local std::uniform_int_distribution<int> dist(0, 2);
return gc_clock::time_point() + std::chrono::seconds(dist(gen));
}
public:
schema_ptr make_schema() {
auto builder = schema_builder("ks", "cf")
.with_column("pk", bytes_type, column_kind::partition_key)
.with_column("ck1", bytes_type, column_kind::clustering_key)
.with_column("ck2", bytes_type, column_kind::clustering_key);
// Create enough columns so that row can overflow its vector storage
for (column_id i = 0; i < column_count; ++i) {
{
auto column_name = sprint("v%d", i);
builder.with_column(to_bytes(column_name), bytes_type, column_kind::regular_column);
}
{
auto column_name = sprint("s%d", i);
builder.with_column(to_bytes(column_name), bytes_type, column_kind::static_column);
}
}
return builder.build();
}
impl() {
_schema = make_schema();
for (int i = 0; i < 1024; ++i) {
_blobs.emplace_back(make_blob(_external_blob_size));
}
std::random_device rd;
// In case of errors, replace the seed with a fixed value to get a deterministic run.
auto seed = rd();
BOOST_TEST_MESSAGE(sprint("Random seed: %s", seed));
_gen = std::mt19937(seed);
}
mutation operator()() {
std::uniform_int_distribution<column_id> column_count_dist(1, column_count);
std::uniform_int_distribution<column_id> column_id_dist(0, column_count - 1);
std::uniform_int_distribution<size_t> value_blob_index_dist(0, 2);
std::normal_distribution<> ck_index_dist(_blobs.size() / 2, 1.5);
std::uniform_int_distribution<int> bool_dist(0, 1);
std::uniform_int_distribution<api::timestamp_type> timestamp_dist(api::min_timestamp, api::min_timestamp + 2); // 3 values
auto pkey = partition_key::from_single_value(*_schema, _blobs[0]);
mutation m(pkey, _schema);
auto set_random_cells = [&] (row& r, column_kind kind) {
auto columns_to_set = column_count_dist(_gen);
for (column_id i = 0; i < columns_to_set; ++i) {
// FIXME: generate expiring cells
auto cell = bool_dist(_gen)
? atomic_cell::make_live(timestamp_dist(_gen), _blobs[value_blob_index_dist(_gen)])
: atomic_cell::make_dead(timestamp_dist(_gen), expiry_dist(_gen));
r.apply(_schema->column_at(kind, column_id_dist(_gen)), std::move(cell));
}
};
auto random_tombstone = [&] {
return tombstone(timestamp_dist(_gen), expiry_dist(_gen));
};
auto random_row_marker = [&] {
static thread_local std::uniform_int_distribution<int> dist(0, 3);
switch (dist(_gen)) {
case 0: return row_marker();
case 1: return row_marker(random_tombstone());
case 2: return row_marker(timestamp_dist(_gen));
case 3: return row_marker(timestamp_dist(_gen), std::chrono::seconds(1), expiry_dist(_gen));
default: assert(0);
}
};
if (bool_dist(_gen)) {
m.partition().apply(random_tombstone());
}
set_random_cells(m.partition().static_row(), column_kind::static_column);
auto random_blob = [&] {
return _blobs[std::min(_blobs.size() - 1, static_cast<size_t>(std::max(0.0, ck_index_dist(_gen))))];
};
auto row_count_dist = [&] (auto& gen) {
static thread_local std::normal_distribution<> dist(32, 1.5);
return static_cast<size_t>(std::min(100.0, std::max(0.0, dist(gen))));
};
size_t row_count = row_count_dist(_gen);
for (size_t i = 0; i < row_count; ++i) {
auto ckey = clustering_key::from_exploded(*_schema, {random_blob(), random_blob()});
deletable_row& row = m.partition().clustered_row(ckey);
set_random_cells(row.cells(), column_kind::regular_column);
row.marker() = random_row_marker();
}
size_t range_tombstone_count = row_count_dist(_gen);
for (size_t i = 0; i < range_tombstone_count; ++i) {
auto key = clustering_key::from_exploded(*_schema, {random_blob()});
m.partition().apply_row_tombstone(*_schema, key, random_tombstone());
}
return m;
}
};
random_mutation_generator::~random_mutation_generator() {}
random_mutation_generator::random_mutation_generator()
: _impl(std::make_unique<random_mutation_generator::impl>())
{ }
mutation random_mutation_generator::operator()() {
return (*_impl)();
}
schema_ptr random_mutation_generator::schema() const {
return _impl->_schema;
}
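make_blob() above fills a buffer one byte at a time from a thread-local std::independent_bits_engine. The same pattern, sketched stand-alone with std::string in place of the tree's bytes type (the 8-bit engine parameters mirror the original; note that libstdc++ accepts the uint8_t result type here even though the standard only guarantees wider unsigned types):

```cpp
#include <cstddef>
#include <cstdint>
#include <random>
#include <string>

// One uniformly random byte per call; thread_local keeps the engine
// independent per thread, as in the original helper.
std::string make_blob(std::size_t blob_size) {
    static thread_local std::independent_bits_engine<
        std::default_random_engine, 8, std::uint8_t> random_bytes;
    std::string blob(blob_size, '\0');
    for (auto& b : blob) {
        b = static_cast<char>(random_bytes());
    }
    return blob;
}
```

The 128-byte blob size in the generator is chosen to exceed small-buffer thresholds and force external bytes storage, per the comment on _external_blob_size.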


@@ -36,15 +36,3 @@ void for_each_mutation_pair(std::function<void(const mutation&, const mutation&,
// Calls the provided function on mutations. Is supposed to exercise as many differences as possible.
void for_each_mutation(std::function<void(const mutation&)>);
class random_mutation_generator {
class impl;
std::unique_ptr<impl> _impl;
public:
random_mutation_generator();
~random_mutation_generator();
mutation operator()();
schema_ptr schema() const;
};
bytes make_blob(size_t blob_size);

Some files were not shown because too many files have changed in this diff.