Compare commits


197 Commits

Author SHA1 Message Date
Avi Kivity
19907fad15 sstables: fix use-after-free in read_simple()
`r` is moved from in one place and captured in a different lambda. The compiler may
choose to perform the move before the other capture, resulting in a use-after-free.

Fix by copying `r` instead of moving it.

Discovered by sstable_test in debug mode.
Message-Id: <20170702082546.20570-1-avi@scylladb.com>

(cherry picked from commit 07b8adce0e)
2018-02-01 14:28:59 +01:00
Avi Kivity
97f781c4d8 Update seastar submodule
* seastar e23b9b8...a66e0c5 (3):
  > posix.hh: add missing include
  > tls_test: Fix echo test not setting server trust store
  > tls: Actually verify client certificate if requested

Fixes #3072
2018-01-29 15:26:24 +02:00
Avi Kivity
88e69701bd Merge "Fix memory leak on zone reclaim" from Tomek
"_free_segments_in_zones is not adjusted by
segment_pool::reclaim_segments() for empty zones on reclaim under some
conditions. For instance when some zone becomes empty due to regular
free() and then reclaiming is called from the std allocator, and it is
satisfied from a zone after the one which is empty. This would result
in the free memory in such a zone appearing to be leaked due to a corrupted
free segment count, which may cause a later reclaim to fail. This
could result in bad_allocs.

The fix is to always collect such zones.

Fixes #3129
Refs #3119
Refs #3120"

* 'tgrabiec/fix-free_segments_in_zones-leak' of github.com:scylladb/seastar-dev:
  tests: lsa: Test _free_segments_in_zones is kept correct on reclaim
  lsa: Expose max_zone_segments for tests
  lsa: Expose tracker::non_lsa_used_space()
  lsa: Fix memory leak on zone reclaim

(cherry picked from commit 4ad212dc01)
2018-01-16 15:55:09 +02:00
Takuya ASADA
9007b38002 dist/common/systemd: specify correct repo file path for housekeeping service on Ubuntu/Debian
Currently scylla-housekeeping-daily.service/-restart.service hardcode
"--repo-files '/etc/yum.repos.d/scylla*.repo'" to specify the CentOS .repo file,
but we use the same .service files on Ubuntu/Debian.
That doesn't work correctly; we need to specify the .list file for Debian variants.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1513385159-15736-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit c2e87f4677)
2017-12-16 22:05:38 +02:00
Glauber Costa
f2e0affcc5 database: delete created SSTables if streaming writes fail
We have had an issue recently where failed SSTable writes left the
generated SSTables dangling in a potentially invalid state. If the write
had, for instance, started and generated tmp TOCs but not finished,
those files would be left for dead.

We had fixed this in commit b7e1575ad4,
but streaming memtables still have the same issue.

Note that we can't fix this in the common function
write_memtable_to_sstable because different flushers have different
retry policies.

Fixes #3062

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20171213011741.8156-1-glauber@scylladb.com>
(cherry picked from commit 1aabbc75ab)
2017-12-13 10:26:07 +02:00
Avi Kivity
6fce847000 Update seastar submodule
* seastar f27b240...e23b9b8 (1):
  > rpc: make sure that _write_buf stream is always properly closed

Fixes #3018.
2017-11-26 10:40:23 +02:00
Avi Kivity
f6f91a49cb Update seastar submodule
* seastar 121f468...f27b240 (1):
  > scripts/posix_net_conf.sh: supress unwanted output from get_irqs_one

Fixes #2808.
2017-10-08 16:40:00 +03:00
Tomasz Grabiec
266a45ad1e Update seastar submodule
* seastar b3ef898...121f468 (1):
  > configure: disable exception scalability hack on debug build
2017-09-25 10:13:59 +02:00
Tomasz Grabiec
7d88026f22 tests: row_cache_test: Fix test failure
Broken after 0ac2c388b6, which assigns
an empty reader to _delegate on hitting the wide partition limit. The test
assumed that the original _delegate will be invoked when the
single-partition reader is asked for the next partition, which is no
longer the case.

Message-Id: <20170912172739.6851-1-tgrabiec@scylladb.com>
2017-09-12 20:33:10 +03:00
Duarte Nunes
760af5635d tests: Remove sstable_assertions
The test using these assertions has been removed, and the
infrastructure required for them to work is absent from 1.7.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170912113714.24223-1-duarte@scylladb.com>
2017-09-12 14:41:44 +03:00
Duarte Nunes
8c18bfa8d6 sstable_mutation_test: Remove promoted index monotonicity test
The infrastructure this test relies on is not present in 1.7, so
just remove the test as backporting the required changes would be a
risky, non-trivial effort.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170912081304.10116-1-duarte@scylladb.com>
2017-09-12 11:18:05 +03:00
Avi Kivity
04e3785f77 Update seastar submodule
* seastar 688bb6f...b3ef898 (1):
  > build: fix bad merge artifacts
2017-09-11 16:27:52 +03:00
Avi Kivity
e00e6ad1b6 Update seastar submodule
* seastar 949b710...688bb6f (2):
  > build: export full cflags in pkgconfig file
  > build: disable -Wattributes when gcc -fvisibility=hidden bug strikes

Fixes build with gcc 6.4/gcc 7.
2017-09-11 15:41:52 +03:00
Pekka Enberg
5653ea9f8d release: prepare for 1.7.5 2017-09-11 14:03:12 +03:00
Avi Kivity
4dbd1b77cd Merge "Fix Scylla upgrades when counters are used" from Paweł
"A new feature flag CORRECT_COUNTER_ORDER is introduced to allow seamless
upgrade from 1.7.4 to later Scylla versions. If that feature is not
available Scylla still writes sstables and sends on-wire counters using
the old ordering so that it can be correctly understood by 1.7.4; once
the flag becomes available, Scylla switches to the correct order.

Fixes #2752."

* tag 'fix-upgrade-with-counters-1.7/v1' of https://github.com/pdziepak/scylla:
  tests/counter: verify counter_id ordering
  counter: check that utils::UUID uses int64_t
  mutation_partition_serializer: use old counter ordering if necessary
  mutation_partition_view: do not expect counter shards to be sorted
  sstables: write counter shards in the order expected by the cluster
  tests/sstables: add storage_service_for_tests to counter write test
  tests/sstables: add test for reading wrong-order counter cells
  sstables: do not expect counter shards to be sorted
  storage_service: introduce CORRECT_COUNTER_ORDER feature
  tests/counter: test 1.7.4 compatible shard ordering
  counters: add helper for retrieving shards in 1.7.4 order
  tests/counter: add tests for 1.7.4 counter shard order
  counters: add counter id comparator compatible with Scylla 1.7.4
  tests/counter: verify order of counter shards
  tests/counter: add test for sorting and deduplicating shards
  counters: add function for sorting and deduplicating counter cells
  counters: add more comparison operators
2017-09-11 13:27:01 +03:00
Paweł Dziepak
0e61212c20 tests/counter: verify counter_id ordering 2017-09-05 13:49:01 +01:00
Paweł Dziepak
6f4bc82b6e counter: check that utils::UUID uses int64_t 2017-09-05 13:49:01 +01:00
Paweł Dziepak
c1a30d3f60 mutation_partition_serializer: use old counter ordering if necessary
Until the cluster is fully upgraded from a version that uses the
incorrect counter shard ordering it is essential to keep using it lest
the old nodes corrupt the data upon receiving mutations with a counter
shard ordering they do not expect.
2017-09-05 13:49:01 +01:00
Paweł Dziepak
cbad33033f mutation_partition_view: do not expect counter shards to be sorted 2017-09-05 13:49:01 +01:00
Paweł Dziepak
1f31be9ba3 sstables: write counter shards in the order expected by the cluster
If the feature signaling that we have switched to the correct ordering
of counter shards is not enabled it means that the user still can do a
rollback to a version that expects wrong ordering. In order to avoid any
disasters when that happens write sstables using the 1.7.4 order until
we know for sure that it is no longer needed.
2017-09-05 13:49:01 +01:00
Paweł Dziepak
7e89dc3bbf tests/sstables: add storage_service_for_tests to counter write test
Writing counters to an sstable is going to require cluster feature
information, which requires accessing some singletons.
2017-09-05 13:49:01 +01:00
Paweł Dziepak
2cdcaeba6e tests/sstables: add test for reading wrong-order counter cells 2017-09-05 13:49:01 +01:00
Paweł Dziepak
55cb0cafa8 sstables: do not expect counter shards to be sorted 2017-09-05 13:49:01 +01:00
Paweł Dziepak
660572e85c storage_service: introduce CORRECT_COUNTER_ORDER feature
Scylla 1.7.4 used incorrect ordering of counter shards. In order to fix
this problem a new feature is introduced that will be used to determine
when nodes with that bug fixed can start sending counter shards in the
correct order.
2017-09-05 13:49:01 +01:00
Paweł Dziepak
b86da0c479 tests/counter: test 1.7.4 compatible shard ordering 2017-09-05 13:49:01 +01:00
Paweł Dziepak
b1b8599b1a counters: add helper for retrieving shards in 1.7.4 order 2017-09-05 13:49:00 +01:00
Paweł Dziepak
89c037dfc8 tests/counter: add tests for 1.7.4 counter shard order 2017-09-05 13:49:00 +01:00
Paweł Dziepak
25eec66935 counters: add counter id comparator compatible with Scylla 1.7.4 2017-09-05 13:49:00 +01:00
Paweł Dziepak
b5787ca640 tests/counter: verify order of counter shards 2017-09-05 13:49:00 +01:00
Paweł Dziepak
838dbd98ac tests/counter: add test for sorting and deduplicating shards 2017-09-05 13:49:00 +01:00
Paweł Dziepak
022c2ff53a counters: add function for sorting and deduplicating counter cells
Due to a bug in the implementation of UUID less-than comparison, some Scylla
versions sort counter shards in an incorrect order. Moreover, when
dealing with imported correct data, the inconsistencies in ordering
caused some counter shards to become duplicated.
2017-09-05 13:49:00 +01:00
Paweł Dziepak
b7c27d73d8 counters: add more comparison operators 2017-09-05 13:49:00 +01:00
Vlad Zolotarov
bdc0ca7064 service::storage_service: initialize auth and tracing after we joined the ring
Initialize the system_auth and system_traces keyspaces and their tables after
the Node joins the token ring, because as part of system_auth initialization
SELECT and possibly INSERT CQL statements are going to be issued.

This patch effectively reverts the d3b8b67 patch and brings the initialization order
to how it was before that patch.

Fixes #2273

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1500417217-16677-1-git-send-email-vladz@scylladb.com>
(cherry picked from commit e98adb13d5)
2017-08-30 09:33:33 +02:00
Calle Wilund
34260ce471 utils::UUID: operator< should behave as comparison of hex strings/bytes
I.e. it needs to be an unsigned comparison.
Message-Id: <1487683665-23426-1-git-send-email-calle@scylladb.com>

(cherry picked from commit 0d87f3dd7d)
2017-08-24 14:18:55 +01:00
Avi Kivity
cffe57bcc7 Merge "repair: Do not allow repair until node is in NORMAL status" from Asias
Fixes #2723.

* tag 'asias/repair_issue_2723_v1' of github.com:cloudius-systems/seastar-dev:
  repair: Do not allow repair until node is in NORMAL status
  gossip: Add is_normal helper

(cherry picked from commit 2f41ed8493)
2017-08-23 09:45:54 +03:00
Paweł Dziepak
adb9ce7f38 lsa: avoid unnecessary segment migrations during reclaim
segment_zone::migrate_all_segments() was trying to migrate all segments
inside a zone to another one, hoping that the original one could be
completely freed. This was an attempt to optimise for throughput.

However, this may unnecessarily hurt latency if the zone is large but
only a few segments are required to satisfy the reclaimer's demands.
Message-Id: <20170410171912.26821-1-pdziepak@scylladb.com>

(cherry picked from commit 0318dccafd)
2017-08-22 09:29:05 +02:00
Tomasz Grabiec
5f1fd7a0b1 schema_registry: Ensure schema_ptr is always synced on the other core
global_schema_ptr ensures that the schema object is replicated to other
cores on access. It was replicating the "synced" state as well, but
only when the shard didn't know about the schema. It could happen that
the other shard has the entry, but it's not yet synced, in which case
we would fail to replicate the "synced" state. This will result in an
exception from mutate(), which rejects attempts to mutate using an
unsynced schema.

The fix is to always replicate the "synced" state. If the entry is
syncing, we will preemptively mark it as synced earlier. The syncing
code is already prepared for this.

Refs #2617.
Message-Id: <1500555224-15825-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 65c64614aa)
2017-08-17 17:15:12 +02:00
Avi Kivity
d1f06633e0 Update seastar submodule
* seastar a4d924e...949b710 (1):
  > fstream: do not ignore unresolved future

Fixes #2697.
2017-08-16 15:12:45 +03:00
Avi Kivity
b54ea3f6cf dist: use correct repository for third-party RPMs 2017-08-16 11:24:42 +03:00
Avi Kivity
63fd65414a Update seastar submodule
* seastar e5825b5...a4d924e (1):
  > Merge "Fix crash in rpc due to access to already destroyed server socket" from Gleb

Fixes #2690
2017-08-14 16:25:03 +03:00
Avi Kivity
9790c2d229 Update seastar submodule
* seastar 8d9fd92...e5825b5 (1):
  > tls: Only recurse once in shutdown code

Fixes #2691
2017-08-14 15:12:01 +03:00
Raphael S. Carvalho
7728a8dec5 sstables: close index file when sstable writer fails
index's file output stream uses write behind but it's not closed
when sstable write fails and that may lead to crash.
It happened before for data file (which is obviously easier to
reproduce for it) and was fixed by 0977f4fdf8.

Fixes #2673.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170807171146.10243-1-raphaelsc@scylladb.com>
(cherry picked from commit dddbd34b52)
2017-08-08 09:59:10 +03:00
Duarte Nunes
1fd4a3ed34 tests/sstable_mutation_test: Don't use moved-from object
Fix a bug introduced in dbbb9e93d and exposed by gcc6 by not using a
moved-from object. Twice.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170802161033.4213-1-duarte@scylladb.com>
(cherry picked from commit 4c9206ba2f)
2017-08-03 09:46:33 +03:00
Avi Kivity
0b48863a7e Merge "Ensure correct EOC for PI block cell names" from Duarte
"This series ensures we always write correct cell names to promoted
index cell blocks, taking into account the eoc of range tombstones.

Fixes #2333"

* 'pi-cell-name/v1' of github.com:duarten/scylla:
  tests/sstable_mutation_test: Test promoted index blocks are monotonic
  sstables: Consider eoc when flushing pi block
  sstables: Extract out converting bound_kind to eoc

(cherry picked from commit db7329b1cb)
2017-08-01 18:13:19 +03:00
Gleb Natapov
aec94b926c cql transport: run accept loop in the foreground
It was meant to be run in the foreground since it is waited upon during
stop(), but as it is now, from the stop() perspective it completes
after the first connection is accepted.

Fixes #2652

Message-Id: <20170801125558.GS20001@scylladb.com>
(cherry picked from commit 1da4d5c5ee)
2017-08-01 17:07:55 +03:00
Tomasz Grabiec
0ac2c388b6 row_cache: Avoid deadlock/timeout due to sstable read concurrency limit
database::make_sstable_reader() creates a reader which will need to
obtain a semaphore permit when invoked, so that there is a limit on
sstable read concurrency (edeef03). Therefore, each read may create at
most one such reader in order to be guaranteed to make
progress. Otherwise, the creation of the second reader may deadlock
(in case of system tables) or time out (non-system tables), if a large
enough number of such readers try to do the same thing at the same time.

One instance of the problem fixed by this patch is in cache populating
reader (98c12dc) when we reach partition size limit
(max_cached_partition_size_in_kb). In that case population is
abandoned and a second read is created, while still keeping the old
one alive. We saw this causing deadlocks during schema tables parsing
when system.schema_columns contained large partitions. Fixes #2623.

Another case when this can potentially happen is when populating
readers are recreated by cache. We replace the reader there, but using
assignment, so the old reader is still alive when the new one is
created. This patch fixes two out of three of such cases. The third
one (in a scanning read) is not that easy to fix. That problem doesn't
exist in version 2.0 and master, where the cache is reworked for row
granularity.

Refs #2644.

Message-Id: <1501160300-18097-1-git-send-email-tgrabiec@scylladb.com>
2017-08-01 12:10:39 +03:00
Takuya ASADA
09ac5b57aa dist/redhat: limit metapackage dependencies to specific version of scylla packages
When we install scylla metapackage with version (ex: scylla-1.7.1),
it just always installs the newest scylla-server/-jmx/-tools from the repo,
instead of installing the specified version of the packages.

To install the same version of the packages as the metapackage, limit the
dependencies to the current package version.

Fixes #2642

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20170726193321.7399-1-syuu@scylladb.com>
(cherry picked from commit 91a75f141b)
2017-07-27 14:22:06 +03:00
Shlomi Livne
ff643e3e40 release: prepare for 1.7.4
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-07-26 17:26:33 +03:00
Asias He
a7b8d89de8 gossip: Fix nr_live_nodes calculation
We need to consider the _live_endpoints size. The nr_live_nodes should
not be larger than the _live_endpoints size, otherwise the loop to collect
the live nodes can run forever.

It is a regression introduced in commit 437899909d
(gossip: Talk to more live nodes in each gossip round).

Fixes #2637

Message-Id: <863ec3890647038ae1dfcffc73dde0163e29db20.1501026478.git.asias@scylladb.com>
(cherry picked from commit 515a744303)
2017-07-26 16:49:11 +03:00
Duarte Nunes
013fa3da14 schema: Calculate default validator
Fixes #2605

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170719105131.21455-3-duarte@scylladb.com>
2017-07-20 10:58:29 +02:00
Duarte Nunes
259cfaf8f9 thrift: Set default validator for static CFs
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170719105131.21455-2-duarte@scylladb.com>
2017-07-20 10:58:29 +02:00
Duarte Nunes
6501bf8e54 schema_tables: Recover comparator type
Fixes #2573

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170718125450.3727-1-duarte@scylladb.com>
2017-07-19 10:58:43 +02:00
Pekka Enberg
41b4055911 release: prepare for 1.7.3 2017-07-18 17:34:46 +03:00
Nadav Har'El
b594f21f91 Allow reading exactly desired byte ranges and fast_forward_to

In commit c63e88d556, support was added for
fast_forward_to() in data_consume_rows(). Because an input stream's end
cannot be changed after creation, that patch ignores the specified end
byte, and uses the end of file as the end position of the stream.

As result of this, even when we want to read a specific byte range (e.g.,
in the repair code to checksum the partitions in a given range), the code
reads an entire 128K buffer around the end byte, or significantly more, with
read-ahead enabled. This causes repair to do more than 10 times the amount
of I/O it really has to do in the checksumming phase (which in the current
implementation, reads small ranges of partitions at a time).

This patch has two levels:

1. In the lower level, sstable::data_consume_rows(), which reads all
   partitions in a given disk byte range, now gets another byte position,
   "last_end". That can be the range's end, the end of the file, or anything
   in between the two. It opens the disk stream until last_end, which means
   1. we will never read-ahead beyond last_end, and 2. fast_fordward_to() is
   not allowed beyond last_end.

2. In the upper level, we add to the various layers of sstable readers,
   mutation readers, etc., a boolean flag mutation_reader::forwarding, which
   says whether fast_forward_to() is allowed on the stream of mutations to
   move the stream to a different partition range.

   Note that this flag is separate from the existing boolean flag
   streamed_mutation::forwarding - that one talks about skipping inside a
   single partition, while the flag we are adding is about switching the
   partition range being read. Most of the functions that previously
   accepted streamed_mutation::forwarding now accept *also* the option
   mutation_reader::forwarding. The exception are functions which are known
   to read only a single partition, and not support fast_forward_to() a
   different partition range.

   We note that if mutation_reader::forwarding::no is requested, and
   fast_forward_to() is forbidden, there is no point in reading anything
   beyond the range's end, so data_consume_rows() is called with last_end as
   the range's end. But if forwarding::yes is requested, we use the end of the
   file as last_end, exactly like the code before this patch did.

Importantly, we note that the repair's partition reading code,
column_family::make_streaming_reader, uses mutation_reader::forwarding::no,
while the other existing reading code will use the default forwarding::yes.

In the future, we can further optimize the amount of bytes read from disk
by replacing forwarding::yes by an actual last partition that may ever be
read, and use its byte position as the last_end passed to data_consume_rows.
But we don't do this yet, and it's not a regression from the existing code,
which also opened the file input stream until the end of the file, and not
until the end of the range query. Moreover, such an improvement will not
improve of anything if the overall range is always very large, in which
case not over-reading at its end will not improve perforance.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20170718110643.8667-1-nyh@scylladb.com>
2017-07-18 16:54:11 +03:00
Avi Kivity
bcd2e6249f dist: tolerate sysctl failures
sysctl may fail in a container environment if /proc is not virtualized
properly.

Fixes #1990
Message-Id: <20170625145930.31619-1-avi@scylladb.com>

(cherry picked from commit 08488a75e0)
2017-07-18 15:47:10 +03:00
Takuya ASADA
4c79add7b0 dist/debian: skip tunables when kernel = 3.13.0-*-generic, to prevent kernel panic bug
There is a kernel panic bug on kernel 3.13.0-*-generic (Ubuntu 14.04), so we have to skip the tunables.

Fixes #1724

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1493196636-25645-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit abf65cb485)
2017-07-18 15:47:03 +03:00
Asias He
00f6ccb75d gossip: Implement the missing fd_max_interval_ms and fd_initial_value_ms option
It is useful for larger clusters with larger gossip message latency. By
default the fd_max_interval_ms is 2 seconds which means the
failure_detector will ignore any gossip message update interval larger
than 2 seconds. However, in larger clusters, the gossip message update
interval can be larger than 2 seconds.

Fixes #2603.

Message-Id: <49b387955fbf439e49f22e109723d3a19d11a1b9.1500278434.git.asias@scylladb.com>
(cherry picked from commit adc5f0bd21)
2017-07-17 13:30:34 +03:00
Avi Kivity
77ac5a63db Update seastar submodule
* seastar fc69677...8d9fd92 (1):
  > rpc: start server's send loop only after protocol negotiation

Fixes #2600.
2017-07-17 10:43:12 +03:00
Pekka Enberg
eb9de1a807 Merge "Repair backport for 1.7 branch" from Asias
"This series backports all the repair-related fixes to the enterprise branch and
 updates the scylla_repair to send the ranges to repair to all the shards in
 parallel, independently.

 With this series, repair can utilize all the CPUs and is much more efficient."

* tag 'asias/repair-backport-branch-1.7.3-v1' of github.com:cloudius-systems/seastar-dev:
  repair: Use selective_token_range_sharder
  tests: Add test_selective_token_range_sharder
  dht: Add selective_token_range_sharder
  repair: further limit parallelism of checksum calculation
  repair: Do not store the failed ranges
  repair: Prefer nodes in local dc when streaming
  repair: Repair on all shards
  repair: Allow one stream plan in flight
2017-07-14 13:02:26 +03:00
Duarte Nunes
643a777067 storage_proxy: Preserve replica order across mutations
In storage_proxy we arrange the mutations sent by the replicas in a
vector of vectors, such that each row corresponds to a partition key
and each column contains the mutation, possibly empty, as sent by a
particular replica.

There is reconciliation-related code that assumes that all the
mutations sent by a particular replica can be found in a single
column, but that isn't guaranteed by the way we initially arrange the
mutations.

This patch fixes this and enforces the expected order.

Fixes #2531
Fixes #2593

Signed-off-by: Gleb Natapov <gleb@scylladb.com>
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170713162014.15343-1-duarte@scylladb.com>
(cherry picked from commit b8235f2e88)
2017-07-14 12:12:09 +03:00
Avi Kivity
6f91939650 Update seastar submodule
* seastar 8e2f629...fc69677 (1):
  > tls: Wrap all IO in semaphore (Fixes #2575)
2017-07-12 10:24:04 +03:00
Gleb Natapov
15da71266d consistency_level: report less live endpoints in Unavailable exception if there are pending nodes
DowngradingConsistencyRetryPolicy uses live replicas count from
Unavailable exception to adjust CL for retry, but when there are pending
nodes CL is increased internally by a coordinator and that may prevent
retried query from succeeding. Adjust the live replica count when pending
nodes are present so that the retried query will be able to proceed.

Fixes #2535

Message-Id: <20170710085238.GY2324@scylladb.com>
(cherry picked from commit 739dd878e3)
2017-07-11 17:16:58 +03:00
Botond Dénes
9cd36ade00 Fix crash in the out-of order restrictions error msg composition
Use the name of the existing preceding column with a restriction
(last_column) instead of assuming that the column right after the
current column already has restrictions.
This will yield an error message that is different from that of
Cassandra, albeit still a correct one.

Fixes #2421

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <40335768a2c8bd6c911b881c27e9ea55745c442e.1499781685.git.bdenes@scylladb.com>
(cherry picked from commit 33bc62a9cf)
2017-07-11 17:16:01 +03:00
Asias He
6f58a1372e repair: Use selective_token_range_sharder
With this change, we ask all the shards to handle the ranges provided by
the user, and we use selective_token_range_sharder to split the ranges and
ignore those that do not belong to the current shard.

(cherry picked from commit b10e961a64)

 Conflicts:
	repair/repair.cc
2017-07-11 08:40:49 +08:00
Asias He
0a9d26de4a tests: Add test_selective_token_range_sharder
(cherry picked from commit 2a794db61b)
2017-07-11 08:40:49 +08:00
Asias He
35cd63e1f7 dht: Add selective_token_range_sharder
It is like ring_position_range_sharder but it works with
dht::token_range. This sharder will return the ranges belonging to a
selected shard.

(cherry picked from commit d835cf2748)
2017-07-11 08:40:49 +08:00
Nadav Har'El
2ada799e07 repair: further limit parallelism of checksum calculation
Repair today has a semaphore limiting the number of ongoing checksum
comparisons running in parallel (on one shard) to 100. We needed this
number to be fairly high, because a "checksum comparison" can involve
high latency operations - namely, sending an RPC request to another node
in a remote DC and waiting for it to calculate a checksum there, and while
waiting for a response we need to proceed calculating checksums in parallel.

But as a consequence, in the current code, we can end up with as many as
100 fibers all at the same stage of reading partitions to checksum from
sstables. This requires tons of memory, to hold at least 128K of buffer
(even more with read-ahead) for each of these fibers, plus partition data
for each. But doing 100 reads in parallel is pointless - one (or very few)
should be enough.

So this patch adds another semaphore to limit the number of checksum
*calculations* (including the read and checksum calculation) on each shard
to just 2. There may still be 100 ongoing checksum *comparisons*, in
other stages of the comparisons (sending the checksum requests to other nodes
and waiting for them to return), but only 2 will ever be in the stage of
reading from disk and checksumming them.

The limit of 2 checksum calculations (per shard) applies to the repair
slave, not just the master: the slave may receive many checksum
requests in parallel, but will only actually work on 2 at a time.

Because the parallelism=100 now rate-limits operations which use very little
memory, in the future we can safely increase it even more, to support
situations where the disk is very fast but the link between nodes has
very high latency.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20170703151329.25716-1-nyh@scylladb.com>
(cherry picked from commit d177ec05cb)
2017-07-11 08:40:49 +08:00
Asias He
b71037ac55 repair: Do not store the failed ranges
The number of failed ranges can be large so it can consume a lot of memory.
We already logged the failed ranges in the log; no need to store them
in memory.

Message-Id: <7a70c4732667c5c3a69211785e8efff0c222fc28.1498809367.git.asias@scylladb.com>
(cherry picked from commit b2a2fbcf73)

 Conflicts:
	repair/repair.cc
2017-07-11 08:40:49 +08:00
Asias He
8639f32efd repair: Prefer nodes in local dc when streaming
When peer nodes have the same partition data, i.e., with the same
checksum, we currently choose to stream from any of them randomly.
To improve streaming performance, select a peer within the same DC.
This patch should improve repair performance with multiple DCs.

Message-Id: <c6a345b6e8ed2b59f485e53c865241e463b44507.1498490831.git.asias@scylladb.com>
(cherry picked from commit cc02a62756)
2017-07-11 08:40:48 +08:00
Asias He
a0dce7c922 repair: Repair on all shards
Currently, shard zero is the coordinator of the repair. All the work of
checksumming the local node's data and sending the repair checksum RPC
verb is done on shard zero only. This leaves the other shards
underutilized.

With this patch, we split the ranges that need to be repaired into at least
smp::count ranges, so sizeof(ranges) / smp::count ranges will be assigned to
each shard. For example, if we have 8 shards and 256 ranges, each shard
will repair 32 ranges, sequentially. There will be at most 8 (smp::count)
ranges being repaired in parallel.

(cherry picked from commit 47345078ec)

Conflicts:
	repair/repair.cc
2017-07-11 08:40:48 +08:00
Asias He
d39ff4f2ac repair: Allow one stream plan in flight
In "repair: Use more stream_plan" (commit 2043ffc064), we
switched to streaming while doing checksums instead of streaming only
after the checksum phase is completed. We take a parallelism_semaphore
before we do checksum, if there are more than sub_ranges_to_stream
(1024) ranges, we start a stream_plan and wait for the streaming to
complete (still under the parallelism_semaphore). So at most
parallelism_semaphore (100) stream_plans can be in parallel.

The parallelism_semaphore limits the parallelism of both checksum and the
streaming plan. However, it is not necessary to have the same
parallelism for both checksum and streaming, because 1) a streaming
operation itself runs in parallel (handling ranges on all shards in
parallel, sending mutations in parallel), and 2) more stream plans
(in the worst case 100) mean we can write to 100 memtables at the same
time and flush 100 memtables to disk at the same time, which can take a
lot of memory.

With this patch, we only allow one stream plan in flight.

(cherry picked from commit 54831a344c)
2017-07-11 08:40:48 +08:00
Avi Kivity
7cbfe0711f dist: redirect stdout/stderr to the journal on systemd systems
Fixes #2408.

Message-Id: <20170524080729.10085-1-avi@scylladb.com>
(cherry picked from commit 15af6acc8b)
2017-07-10 19:31:14 +03:00
Glauber Costa
139a2d14a1 disable defragment-memory-on-idle-by-default
It's been linked with various performance issues, either by causing
them or making them worse. One example is #1634; recently I have also
investigated continuous performance degradation that was likewise
linked to defrag-on-idle activity.

Until we can figure out how to reduce its impact, we should disable it.

Signed-off-by: Glauber Costa <glauber@glauber.scylladb>
Message-Id: <20170627201109.10775-1-glauber@scylladb.com>
(cherry picked from commit f3742d1e38)
2017-07-10 19:25:12 +03:00
Asias He
6fff331698 gossip: Use vector for _live_endpoints
To speed up random access in get_random_node, switch from a set to a
vector.

(cherry picked from commit e31d4a3940)
Message-Id: <fea90eaa5273fac50d0013b3778d9a4f2562e0b7.1499394330.git.asias@scylladb.com>
2017-07-10 14:42:26 +03:00
Asias He
43ae64cd47 gossip: Talk to more live nodes in each gossip round
In large clusters with multi-DC deployments, it is observed that
gossip updates take a long time to disseminate through the cluster.

To speed up, talk to more live nodes in each gossip round.

Fixes #2528

(cherry picked from commit 437899909d)
Message-Id: <9bcdaf1fb5637d14a7fda9188ba76ced8f1afaaf.1499394330.git.asias@scylladb.com>
2017-07-10 14:40:40 +03:00
Tomasz Grabiec
f306b47a88 tests: commitlog: Check there are no segments left on disk after clean shutdown
Reproduces #2550.

Message-Id: <1499358825-17855-2-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 72e01b7fe8)
2017-07-10 12:41:33 +03:00
Tomasz Grabiec
47b1e39410 commitlog: Discard active but unused segments on shutdown
So that they are not left on disk even though we did a clean shutdown.

The first part of the fix is to ensure that closed segments are recognized
as not allocating (_closed flag). Not doing this prevents them from
being collected by discard_unused_segments(). The second part is to
actually call discard_unused_segments() on shutdown after all segments
were shut down, so that those whose positions are cleared can be
removed.

Fixes #2550.

Message-Id: <1499358825-17855-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 6555a2f50b)
2017-07-10 12:40:43 +03:00
Botond Dénes
0f4d5cde8e cql3: Add K_FROZEN and K_TUPLE to basic_unreserved_keyword
To allow the non-reserved keywords "frozen" and "tuple" to be used as
column names without double-quotes.

Fixes #2507

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <9ae17390662aca90c14ae695c9b4a39531c6cde6.1499329781.git.bdenes@scylladb.com>
(cherry picked from commit c4277d6774)
2017-07-06 18:19:59 +03:00
Avi Kivity
a24dcf1a19 Update seastar submodule
* seastar 18a82e2...8e2f629 (1):
  > future-utils: fix do_for_each exception reporting

Fixes a bug during a failed repair.
2017-07-06 17:32:37 +03:00
Raphael S. Carvalho
611c25234e database: fix potential use-after-free in sstable cleanup
When do_for_each is in its last iteration and with_semaphore defers
because there's an ongoing cleanup, the sstable object will be used after
being freed, because it was taken by reference and the container it lives
in was destroyed prematurely.

Let's fix it with a do_with, which also makes the code nicer.

Fixes #2537.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170630035324.19881-1-raphaelsc@scylladb.com>
(cherry picked from commit b9d0645199)
2017-07-03 12:49:34 +03:00
Amos Kong
f64e3e24d4 common/scripts: fix node_exporter url
Commit ff3d83bc2f updated node_exporter
from 0.12.0 to 0.14.0, and introduced a bug in downloading the install file.

node_exporter started adding a 'v' prefix to release tags[1] from 0.13.0,
so we need to fix the URL.

[1] https://github.com/prometheus/node_exporter/tags

Fixes #2509

Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <42b0a7612539a34034896d404d63a0a31ce79e10.1497919368.git.amos@scylladb.com>
(cherry picked from commit 92731eff4f)
2017-06-22 08:51:35 +03:00
Shlomi Livne
f6034c717d release: prepare 1.7.2
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-06-21 22:09:31 +03:00
Amos Kong
b6f4df3cc8 scylla_setup: fix infinite loop on invalid option input
example: # scylla_setup --invalid-opt

Fixes #2305

Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <9a4f631b126d8eaaae479fa99137db7a61a7c869.1493135357.git.amos@scylladb.com>
(cherry picked from commit f655639e5a)
2017-06-19 22:32:38 +03:00
Amnon Heiman
af028360d7 node_exporter_install script update version to 0.14
Fixes #2097

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <20170612125724.7287-1-amnon@scylladb.com>
(cherry picked from commit ff3d83bc2f)
2017-06-18 12:28:19 +03:00
Duarte Nunes
60af7eab10 udt: Don't check a type is unused after applying the schema mutations
This patch is based on 6c8b5fc. It moves the check of whether a dropped
type is still used by other types or tables from schema_tables to
the drop_type_statement, as delaying this check until after applying the
mutations can leave the keyspace in a broken state.

Fixes #2490

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1497466736-28841-1-git-send-email-duarte@scylladb.com>
2017-06-15 10:35:01 +03:00
Calle Wilund
665d14584c database: Fix assert in truncate to handle empty memtables+sstables
If we do two truncates in a row, the second will have neither memtable
nor sstable data. Thus we will not write/remove sstables, and thus
get no resulting truncation replay position.

Fixes #2489

Message-Id: <1497378469-6063-1-git-send-email-calle@scylladb.com>

(cherry picked from commit 525730e135)
2017-06-14 16:25:57 +03:00
Gleb Natapov
bb56e7682c Fix use after free in nonwrapping_range::intersection
end_bound() returns temporary object (end_bound_ref), so it cannot be
taken by reference here and used later. Copy instead.

Message-Id: <20170612132328.GJ21915@scylladb.com>

(cherry picked from commit 21197981a)

Fixes #2482
2017-06-14 12:08:06 +01:00
Avi Kivity
a4bd56ce40 tests: fix partitioner_test build on gcc 5 2017-06-13 21:56:02 +03:00
Calle Wilund
6340fe61af commitlog_test: Fix test_commitlog_delete_when_over_disk_limit
The test should
a.) Wait for the flush semaphore
b.) Only compare segment sets between start and end, not start,
    end and in between. I.e. the test sort of assumed we started
    with < 2 (or so) segments, which is not always the case (timing).

Message-Id: <1496828317-14375-1-git-send-email-calle@scylladb.com>
(cherry picked from commit 0c598e5645)
2017-06-13 19:53:13 +03:00
Asias He
f2317a6f3f repair: Fix range use after free
Capture it by value.

scylla:  [shard 0] repair - repair's stream failed: streaming::stream_exception (Stream failed)
scylla:  [shard 0] repair - Failed sync of range ==<runtime_exception
(runtime error: Invalid token. Should have size 8, has size 0#012)>: streaming::stream_exception (Stream failed)

Message-Id: <7fda4432e54365f64b556e7e4c26e36d3a9bb1b7.1497238229.git.asias@scylladb.com>
(cherry picked from commit 2bcb368a13)
2017-06-13 11:03:14 +03:00
Paweł Dziepak
7bb41b50f9 commitlog: avoid copying column_mapping
It is safe to copy column_mapping across shards. Such a guarantee comes at
the cost of performance.

This patch makes commitlog_entry_writer use IDL generated writer to
serialise commitlog_entry so that column_mapping is not copied. This
also simplifies commitlog_entry itself.

Performance difference tested with:
perf_simple_query -c4 --write --duration 60
(medians)
          before       after      diff
write   79434.35    89247.54    +12.3%

(cherry picked from commit 374c8a56ac)

Also: Fixes #2468.
2017-06-11 15:44:20 +03:00
Paweł Dziepak
57d602fdd6 idl: fix generated writers when member functions are used
When using a member name in an identifier of a generated class or method,
the idl compiler should strip the trailing '()'.

(cherry picked from commit 4df4994b71)

(part of #2468)
2017-06-11 15:43:53 +03:00
Paweł Dziepak
cd14b83192 idl: add start_frame() overload for seastar::simple_output_stream
(cherry picked from commit 018d16d315)

(part of #2468)
2017-06-11 15:43:11 +03:00
Avi Kivity
a85b70d846 Merge "repair memory usage fix" from Asias
"This series switches repair to use more stream plans to stream the mismatched
sub ranges and use a range generator to produce sub ranges.

Tests show that repair no longer uses huge amounts of memory with a large
data set.

In addition, we now have a progress reporter in the log showing how many
ranges have been processed.

   Jun 06 14:18:22  [shard 0] repair - Repair 512 out of 529 ranges, id=1, keyspace=myks, cf=mytable, range=(8526136029525195375, 8549482295083869942]
   Jun 06 14:19:55  [shard 0] repair - Repair 513 out of 529 ranges, id=1, keyspace=myks, cf=mytable, range=(8526136029525195375, 8549482295083869942]

Fixes #2430."

* tag 'asias/fix-repair-2430-branch-master-v1' of github.com:cloudius-systems/seastar-dev:
  repair: Remove unused sub_ranges_max
  repair: Reduce parallelism in repair_ranges
  repair: Tweak the log a bit
  repair: Use more stream_plan
  repair: iterator over subranges instead of list

(cherry picked from commit 419ad9d6cb)
2017-06-08 14:52:28 +03:00
Avi Kivity
f44ea5335b Update seastar submodule
* seastar 812e232...18a82e2 (1):
  > scripts: posix_net_conf.sh: fix bash syntax causing a failure during bonding iface configuration

Fixes #2269
2017-06-07 18:23:02 +03:00
Pekka Enberg
a95c045b48 Merge "Fixes to thrift/server" from Duarte
"This series fixes some issues with the thrift_server, namely
ensuring that streams and sockets are properly closed.

Fixes #499
Fixes #2437"

* 'thrift-server-fixes/v1' of github.com:duarten/scylla:
  thrift/server: Close connections when stopping server
  thrift/server: Move connection class to header
  thrift/server: Shutdown connection
  thrift/server: Close output_stream when connection is done

(cherry picked from commit a6dc21615b)
2017-06-07 16:08:28 +03:00
Avi Kivity
eb396d2795 Update seastar submodule
* seastar 328fdbc...812e232 (1):
  > rpc: handle messages larger than memory limit

Fixes #2453.
2017-06-07 12:29:59 +03:00
Takuya ASADA
dbbf99d7fa dist/debian: install gdebi when it does not exist
Since we started using gdebi to install the build-dep metapackage generated
by mk-build-deps, we need to install gdebi in build_deb.sh too.

Fixes #2451

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1496819209-30318-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 7fe63c539a)
2017-06-07 10:25:02 +03:00
Raphael S. Carvalho
f7a143e7be sstables: fix report of disk space used by bloom filter
After a change in boot, read_filter is called by the distributed loader,
so its update to _filter_file_size is lost. It is the load variant
which receives foreign components that must do it. We were also
not updating it for newly created sstables.

Fixes #2449.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170606151129.5477-1-raphaelsc@scylladb.com>
(cherry picked from commit 0ca1e5cca3)
2017-06-06 19:00:00 +03:00
Takuya ASADA
562102cc76 dist/debian: use gdebi instead of mk-build-deps -i
At least on Debian 8, mk-build-deps -i silently finishes with return code 0
even when it fails to install dependencies.
To prevent this, we should manually install the metapackage generated by
mk-build-deps using gdebi.

Fixes #2445

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1496737502-10737-2-git-send-email-syuu@scylladb.com>
(cherry picked from commit a4c392c113)
2017-06-06 14:18:14 +03:00
Takuya ASADA
d4b444418a dist/debian/dep: install texlive from jessie-backports to prevent gdb build fail on jessie
Installing openjdk-8-jre-headless from jessie-backports breaks texlive from
the jessie main repo.
This causes an 'Unmet build dependencies' error when building the gdb package.
To prevent this, force installing texlive from jessie-backports before
starting to build gdb.

Fixes #2444

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1496737502-10737-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 5608842e96)
2017-06-06 14:18:08 +03:00
Raphael S. Carvalho
befd4c9819 db: fix computation of live disk usage stat after compaction
sstable::data_size() is used by rebuild_statistics(), but it only
returns the uncompressed data size, while the function called by it
expects the actual disk space used by all components.
Boot uses add_sstable(), which correctly updates the stat with
sstable::bytes_on_disk(). That's what needs to be used by
r__s() too.

Fixes #1592

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170525210055.6391-1-raphaelsc@scylladb.com>
(cherry picked from commit 3b5ad23532)
2017-05-28 10:39:14 +03:00
Avi Kivity
eb2fe0fbd3 Merge "reduce memory requirement for loading sstables" from Rapahel
"fixes a problem in which memory requirement for loading in-memory
components of sstables is very high due to unlimited parallelism."

* 'mem_requirement_sstable_load_v2_2' of github.com:raphaelsc/scylla:
  database: fix indentation of distributed_loader::open_sstable
  database: reduce memory requirement to load sstables
  sstables: loads components for a sstable in parallel
  sstables: enable read ahead for read of in-memory components
  sstables: make random_access_reader work with read ahead

(cherry picked from commit ef428d008c)
2017-05-25 12:59:55 +03:00
Raphael S. Carvalho
eb6b0b1267 db: remove partial sstable created by memtable flush which failed
Partial sstable files aren't being removed after each failed attempt
to flush a memtable, which happens periodically. If the cause of the
failure is ENOSPC, the memtable flush will be attempted forever, and
as a result, the column family may be left with a huge number of partial
files which will overwhelm a subsequent boot when removing temporary
TOCs. In the past, this led to OOM because removal of temporary TOCs
took place in parallel.

Fixes #2407.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170525015455.23776-1-raphaelsc@scylladb.com>
(cherry picked from commit b7e1575ad4)
2017-05-25 11:50:17 +03:00
Asias He
7836600ded streaming: Do not abort session too early in idle detection
Streaming usually takes a long time to complete. Aborting it on a false
positive idle detection can be very wasteful.

Increase the abort timeout from 10 minutes to a very large timeout, 300
minutes. A truly idle session will eventually be aborted if other
mechanisms (e.g., the streaming manager's gossip callbacks for the on_remove
and on_restart events) do not abort the session first.

Fixes #2197

Message-Id: <57f81bfebfdc6f42164de5a84733097c001b394e.1494552921.git.asias@scylladb.com>
(cherry picked from commit f792c78c96)
2017-05-24 12:30:47 +03:00
Shlomi Livne
230c33da49 release: prepare for 1.7.1
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-05-23 22:42:52 +03:00
Raphael S. Carvalho
17d8a0c727 compaction: do not write expired cell as dead cell if it can be purged right away
When compacting a fully expired sstable, we're not allowing that sstable
to be purged, because an expired cell is *unconditionally* converted into a
dead cell. Why not check whether the expired cell can be purged instead,
using gc_before and the max purgeable timestamp?

Currently, we need two compactions to get rid of a fully expired sstable
whose cells could have always been purged.

Look at this sstable with an expired cell:
  {
    "partition" : {
      "key" : [ "2" ],
      "position" : 0
    },
    "rows" : [
      {
        "type" : "row",
        "position" : 120,
        "liveness_info" : { "tstamp" : "2017-04-09T17:07:12.702597Z",
"ttl" : 20, "expires_at" : "2017-04-09T17:07:32Z", "expired" : true },
        "cells" : [
          { "name" : "country", "value" : "1" },
        ]

now this sstable data after first compaction:
[shard 0] compaction - Compacted 1 sstables to [...]. 120 bytes to 79
(~65% of original) in 229ms = 0.000328997MB/s.

  {
    ...
    "rows" : [
      {
        "type" : "row",
        "position" : 79,
        "cells" : [
          { "name" : "country", "deletion_info" :
{ "local_delete_time" : "2017-04-09T17:07:12Z" },
            "tstamp" : "2017-04-09T17:07:12.702597Z"
          },
        ]

now another compaction will actually get rid of data:
compaction - Compacted 1 sstables to []. 79 bytes to 0 (~0% of original)
in 1ms = 0MB/s. ~2 total partitions merged to 0

NOTE:
It's a waste of time to wait for a second compaction, because the expired
cell could have been purged at the first compaction, as it satisfied
gc_before and the max purgeable timestamp.

Fixes #2249, #2253

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170413001049.9663-1-raphaelsc@scylladb.com>
(cherry picked from commit a6f8f4fe24)
2017-05-23 20:57:54 +03:00
Tomasz Grabiec
064de6f8de row_cache: Fix undefined behavior in read_wide()
_underlying is created with _range, which is captured by
reference. But range_and_underlying_reader is moved after being
constructed by do_with(), so the _range reference is invalidated.

Fixes #2377.
Message-Id: <1494492025-18091-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 0351ab8bc6)
2017-05-21 19:09:03 +03:00
Gleb Natapov
df56c108b7 database: remove temporary sstables sequentially
The code that removes each sstable runs in a thread. Removing a lot of
sstables in parallel may start a lot of threads, each of which
takes 128k for its stack. There is not much benefit in running
deletion in parallel anyway, so fix it by deleting sstables sequentially.

Fixes #2384

Message-Id: <20170516103018.GQ3874@scylladb.com>
(cherry picked from commit c7ad3b9959)
2017-05-21 18:56:22 +03:00
Tomasz Grabiec
25607ab9df range: Fix SFINAE rule for picking the best do_lower_bound()/do_upper_bound() overload
mutation_partition has a slicing constructor which is supposed to copy
only the rows from the query range. The rows are located using
nonwrapping_range::lower_bound() and
nonwrapping_range::upper_bound(). Those two have two different
implementations chosen with SFINAE. One is using std::lower_bound(),
and one is using the container's built-in lower_bound(), should it
exist. We're using an intrusive tree in mutation_partition, so the
container's lower_bound() is preferred. It's O(log N) whereas
std::lower_bound() is O(N), because the tree's iterator is not random
access.

However, the current rule for picking container's lower_bound() never
triggers, because lower_bound() has two overloads in the container:

  ./range.hh:618:14: error: decltype cannot resolve address of overloaded function
              typename = decltype(&std::remove_reference<Range>::type::upper_bound)>
              ^~~~~~~~

As a result, the overload which uses std::lower_bound() is used.

Spotted when running perf_fast_forward with the wide-partition limit in
cache lifted. It's so slow that I timed out waiting for the result
(> 16 min).

Fixes #2395.

Message-Id: <1495048614-9913-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 3fc1703ccf)
2017-05-18 17:12:00 +03:00
Avi Kivity
b26bd8bbeb tests: fix partitioner_test for g++ 5
It can't make the leap from dht::ring_position to
stdx::optional<range_bound<dht::ring_position>> for some reason.

(cherry picked from commit ba31619594)
2017-05-18 13:10:48 +03:00
Avi Kivity
1ca7f5458b Update seastar submodule
> tls: make shutdown/close do "clean" handshake shutdown in background
  > tls: Make sink/source (i.e. streams) first class channel owners
  > native-stack: Make sink/source (i.e. streams) first class channel owners

More close() fixes, pointed out by Tomek.
2017-05-17 19:01:44 +03:00
Calle Wilund
50c8a08e91 scylla: fix compilation errors on gcc 5
Message-Id: <1495030581-2138-1-git-send-email-calle@scylladb.com>
(cherry picked from commit 6ca07f16c1)
2017-05-17 18:04:58 +03:00
Avi Kivity
9d1b9084ed Update seastar submodule
* seastar bfa1cb2...774c09c (1):
  > posix-stack: Make sink/source (i.e. streams) first class channel owners
2017-05-17 16:44:34 +03:00
Tomasz Grabiec
e2c75d8532 Merge "Fix performance problems with high shard counts tag" from Avi
From http://github.com/avikivity/scylla exponential-sharder/v3.

The sharder, which takes a range of tokens and splits it among shards, is
slow with large shard count and the default
murmur3_partitioner_ignore_msb_bits.

This patchset fixes excessive iteration in the sstable sharding metadata
writer and nonsingular range scans.

Without this patchset, sealing a memtable takes > 60 ms on a 48-shard
system.  With the patchset, it drops below the latency tracker threshold I
used (5 ms).

Fixes #2392.

(cherry picked from commit 84648f73ef)
2017-05-17 16:19:24 +03:00
Duarte Nunes
59063f4891 tests: Add test case for nonwrapping_range::intersection()
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
(cherry picked from commit f365b7f1f7)
2017-05-17 15:59:06 +03:00
Duarte Nunes
de79792373 nonwrapping_range: Add intersection() function
intersection() returns an optional range with the intersection of the
this range and the other, specified range.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
(cherry picked from commit 1f9359efba)
2017-05-17 15:58:55 +03:00
Avi Kivity
3557b449ac Merge "Adding private repository to housekeeping" from Amnon
"This series adds private repository support to scylla-housekeeping"

* 'amnon/housekeeping_private_repo_v3' of github.com:cloudius-systems/seastar-dev:
  scylla-housekeeping service: Support private repositories
  scylla-housekeeping-upstart: Use repository id, when checking for version
  scylla-housekeeping: support private repositories

(cherry picked from commit eb69fe78a4)
2017-05-17 15:58:29 +03:00
Pekka Enberg
a8e89d624a cql3: Fix variable_specifications class get_partition_key_bind_indexes()
The "_specs" array contains column specifications that have the bind
marker name if there is one. That results in
get_partition_key_bind_indices() not being able to look up a column
definition for such columns. Fix the issue by keeping track of the
actual column specifications passed to add() like Cassandra does.

Fixes #2369

(cherry picked from commit a45e656efb4c6478d80e4dfc18de99b94712eeba)
2017-05-10 10:00:47 +03:00
Pekka Enberg
31cd6914a8 cql3: Move variable_specifications implementation to source file
Move the class implementation to source file to reduce the need to
recompile everything when the implementation changes...

Message-Id: <1494312003-8428-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit 5b931268d4)
2017-05-10 10:00:31 +03:00
Pekka Enberg
a441f889c3 cql3: Fix partition key bind indices for prepared statements
Fix the CQL front-end to populate the partition key bind index array in
result message prepared metadata, which is needed for CQL binary
protocol v4 to function correctly.

Fixes #2355.

(cherry picked from commit ebd76617276e660c590cec0a07e97e82422111df)

Tested-by: Shlomi Livne <shlomi@scylladb.com>
Message-Id: <1494257274-1189-1-git-send-email-penberg@scylladb.com>
2017-05-10 10:00:21 +03:00
Pekka Enberg
91b7cb8576 Merge "gossip mark alive fixes" from Asias
"This series fixes the use-after-free issue in gossip and eliminates the
duplicated / unnecessary mark-alive operations.

Fixes #2341"

* tag 'asias/gossip_fix_mark_alive/v1' of github.com:cloudius-systems/seastar-dev:
  gossip: Ignore callbacks and mark alive operation in shadow round
  gossip: Ignore the duplicated mark alive operation
  gossip: Fix use after free in mark_alive

(cherry picked from commit 1e04731fa0)
2017-05-09 01:57:23 +03:00
Avi Kivity
2b17c4aacf Merge "Fix update of counter in static rows" from Paweł
"The logic responsible for converting counter updates to counter shards was
not covered by unit tests and didn't transform counter cells inside static
rows.

This series fixes the problem and makes sure that the tests cover both
static rows and transformation logic.

Fixes #2334."

* tag 'pdziepak/static-counter-updates-1.7/v1' of github.com:cloudius-systems/seastar-dev:
  tests/counter: test transform_counter_updates_to_shards
  tests/counter: test static columns
  counters: transform static rows from updates to shards
2017-05-06 15:54:20 +03:00
Pekka Enberg
f61d9ac632 release: prepare for 1.7.0 2017-05-04 15:28:28 +03:00
Asias He
fc9db8bb03 repair: Fix partition estimation
We estimate the number of partitions for a given range of a column family
and split the range into sub-ranges containing fewer partitions, each
serving as a checksum unit.

The estimation is wrong, because we need to count the partitions on all
the shards, instead of only counting the local shard.

Fixes #2299

Message-Id: <7876285bd26cfaf65563d6e03ec541626814118a.1493817339.git.asias@scylladb.com>
(cherry picked from commit 66e3b73b9c)
2017-05-03 16:26:01 +03:00
Paweł Dziepak
bd67d23927 tests/counter: test transform_counter_updates_to_shards 2017-05-02 13:49:43 +01:00
Paweł Dziepak
bdeeebbd74 tests/counter: test static columns 2017-05-02 13:49:43 +01:00
Paweł Dziepak
a1cb29e7ec counters: transform static rows from updates to shards 2017-05-02 13:49:43 +01:00
Amnon Heiman
e8369644fd scylla_setup: Fix conditional when checking for newer version
During the changes to the way housekeeping checks for a newer version
and warns about it during installation, the UUID part was removed but kept
in the surrounding if.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <20170426075724.7132-1-amnon@scylladb.com>
(cherry picked from commit b59c95359d)
2017-05-01 12:14:04 +03:00
Glauber Costa
a36cabdb30 reduce kernel scheduler wakeup granularity
We set the scheduler wakeup granularity to 500usec, because that is the
difference in runtime we want to see from a waking task before it
preempts the running task (which will usually be Scylla). Scheduling
other processes less often is usually good for Scylla, but in this case,
one of the "other processes" is also a Scylla thread, the one we have
been using for marking ticks after we have abandoned signals.

However, there is an artifact of the Linux scheduler that causes those
preemptions to be missed if the wakeup granularity is exactly half of
the sched_latency. Our sched_latency is set to 1ms, which
represents the maximum time period in which we will run all runnable
tasks.

We want to keep the sched_latency at 1ms, so we will reduce the wakeup
granularity to something slightly lower than 500usec, to make sure
that this artifact won't affect the scheduler calculations. 499.99usec
would do according to my tests, but we will reduce it to a round
number.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20170427135039.8350-1-glauber@scylladb.com>
(cherry picked from commit 14b9aa2285)
2017-05-01 11:13:51 +03:00
Raphael S. Carvalho
1d26fab73e sstables: add method to export ancestors
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-05-01 11:09:42 +03:00
Shlomi Livne
5f0c635da7 release: prepare for 1.7.rc3
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-05-01 09:53:20 +03:00
Raphael S. Carvalho
82cc3d7aa5 dtcs: do not compact fully expired sstable which ancestor is not deleted yet
Currently, a fully expired sstable[1] is unconditionally chosen for compaction
by DTCS, but that may lead to a compaction loop under certain conditions.

Let's consider that an almost-expired sstable is compacted and not
deleted yet, and that the new sstable becomes expired before its ancestor is
deleted.
Because this new sstable is expired, it will be chosen by DTCS, but it will
not be purged, because 'compacted undeleted' sstables are taken into account
by the calculation of the max purgeable timestamp, which prevents expired
data from being purged. The problem is that this sequence of events can keep happening
forever as reported by issue #2260.
NOTE: This problem was easier to reproduce before improvement on compaction
of expired cells, because fully expired sstable was being converted into a
sstable full of tombstones, which is also considered fully expired.

Fixes #2260.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170428233554.13744-1-raphaelsc@scylladb.com>
(cherry picked from commit 687a4bb0c2)
2017-04-30 19:36:00 +03:00
Paweł Dziepak
98d782cfe1 db: make virtual dirty soft limit configurable
Message-Id: <20170428150005.28454-1-pdziepak@scylladb.com>
(cherry picked from commit 24f4dcf9e4)
2017-04-30 19:17:55 +03:00
Avi Kivity
ea0591ad3d Merge "Fix problems with slicing using sstable's promoted index" from Tomasz
"Fixes #2327.
Fixes #2326."

* 'tgrabiec/fix-promoted-index-parsing-1.7' of github.com:cloudius-systems/seastar-dev:
  sstables: Fix incorrect parsing of cell names in promoted index
  sstables: Fix find_disk_ranges() to not miss relevant range tombstones
2017-04-30 14:48:54 +03:00
Paweł Dziepak
7eedd743bf lsa: introduce upper bound on zone size
Attempting to create huge zones may introduce significant latency. This
patch introduces a maximum allowed zone size so that the time spent
trying to allocate and initialise a zone is bounded.

Fixes #2335.

Message-Id: <20170428145916.28093-1-pdziepak@scylladb.com>
(cherry picked from commit f5cf86484e)
2017-04-30 10:58:34 +03:00
Tomasz Grabiec
8a21961ec9 sstables: Fix incorrect parsing of cell names in promoted index
Range tombstones are serialized to cell names in this place:

  _sst.maybe_flush_pi_block(_out, start, {});

Note that the column set is empty. This is correct. A range tombstone
only has a clustering part. The cell name is deserialized by promoted
index reader using mp_row_consumer::column, like this:

   mp_row_consumer::column col(schema, std::move(col_name),
                               api::max_timestamp);
   return std::move(col.clustering);

The problem is that the column constructor assumes that there is always a
component corresponding to a cell name if the table is not dense, and
will pop it from the set of components (the clustering field):

  , cell(!schema.is_dense() ? pop_back(clustering) : (*(schema.regular_begin())).name())

A promoted index block which starts or ends with a range tombstone will
appear as having incorrect bounds. This may result in an incorrect
value being calculated for the data file range start.

Fixes #2327.
2017-04-27 18:30:00 +02:00
Tomasz Grabiec
08698d9030 sstables: Fix find_disk_ranges() to not miss relevant range tombstones
Suppose the promoted index looks like this:

block0: start=1 end=2
block1: start=4 end=5

start and end are cell names of the first and last cell in the block.

If there is a range tombstone covering [2,3], it will be only in
block0, because it is no longer in effect when block1 starts. However,
slicing the index for [3, +inf], which intersects with the tombstone,
will yield block1. That's because the slicing looks for a block with
an end which is greater than or equal to the start of the slice:

 if (!found_range_start) {
    if (!range_start || cmp(range_start->value(), end_ck) <= 0) {
       range_start_pos = ie.position() + offset;

We should take into account that any given block may actually contain
information for anything up to the start of the next block, so instead
of using end_ck, effectively use the next block's start_ck (exclusive).

Fixes #2326.
2017-04-27 18:30:00 +02:00
Tomasz Grabiec
df5a291c63 sstables: Fix usage of wrong comparator in find_disk_ranges()
This made a difference if clustering restriction bounds were not full
keys but prefixes.

Fixes #2272.

Message-Id: <1493058357-24156-1-git-send-email-tgrabiec@scylladb.com>
2017-04-24 21:56:07 +03:00
Avi Kivity
1a77312aec Merge "Reduce memory reclamation latency" from Tomasz
"Currently eviction is performed until occupancy of the whole region
drops below the 85% threshold. This may take a while if the region had
high occupancy and is large. We could improve the situation by only
evicting until occupancy of the sparsest segment drops below the
threshold, as is done by this change.

I tested this using a c-s read workload in which the condition
triggers in the cache region, with 1G per shard:

 lsa-timing - Reclamation cycle took 12.934 us.
 lsa-timing - Reclamation cycle took 47.771 us.
 lsa-timing - Reclamation cycle took 125.946 us.
 lsa-timing - Reclamation cycle took 144356 us.
 lsa-timing - Reclamation cycle took 655.765 us.
 lsa-timing - Reclamation cycle took 693.418 us.
 lsa-timing - Reclamation cycle took 509.869 us.
 lsa-timing - Reclamation cycle took 1139.15 us.

The 144ms pause is when large eviction is necessary.

Statistics for reclamation pauses for a read workload over
larger-than-memory data set:

Before:

 avg = 865.796362
 stdev = 10253.498038
 min = 93.891000
 max = 264078.000000
 sum = 574022.988000
 samples = 663

After:

 avg = 513.685650
 stdev = 275.270157
 min = 212.286000
 max = 1089.670000
 sum = 340573.586000
 samples = 663

Refs #1634."

* tag 'tgrabiec/lsa-reduce-reclaim-latency-v3' of github.com:cloudius-systems/seastar-dev:
  lsa: Reduce reclamation latency
  tests: Add test for log_histogram
  log_histogram: Allow non-power-of-two minimum values
  lsa: Use regular compaction threshold in on-idle compaction
  tests: row_cache_test: Induce update failure more reliably
  lsa: Add getter for region's eviction function

(cherry picked from commit fccbf2c51f)

[avi: adjustments for 1.7's heap vs. master's log_histogram]
2017-04-21 22:12:52 +03:00
Duarte Nunes
ea684c9a3e alter_type_statement: Fix signed to unsigned conversion
This could allow us to alter a non-existing field of a UDT.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170419114254.5582-1-duarte@scylladb.com>
(cherry picked from commit e06bafdc6c)
2017-04-19 14:48:27 +03:00
Raphael S. Carvalho
2df7c80c66 compaction_manager: fix crash when dropping a resharding column family
The problem is that the column family field of the task wasn't being set for
resharding, so the column family wasn't being properly removed from the
compaction manager. In addition to fixing this issue, we'll also interrupt
ongoing compactions when dropping a column family, exactly like we do on shutdown.

Fixes #2291.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170418125807.7712-1-raphaelsc@scylladb.com>
(cherry picked from commit e78db43b79)
2017-04-18 17:40:09 +03:00
Raphael S. Carvalho
193b5d1782 partitioned_sstable_set: fix quadratic space complexity
streaming generates lots of small sstables with large token range,
which triggers O(N^2) in space in interval map.
level 0 sstables will now be stored in a structure that has O(N)
in space complexity and which will be included for every read.

Fixes #2287.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170417185509.6633-1-raphaelsc@scylladb.com>
(cherry picked from commit 11b74050a1)
2017-04-18 13:05:00 +03:00
Asias He
6609c9accb gossip: Fix possible use-after-free of entry in endpoint_state_map
We take a reference to an endpoint_state entry in endpoint_state_map. We
access it again after code which defers; the reference can be invalid
after the defer if someone deletes the entry in the meantime.

Fix this by taking the reference again after the deferring code.

I also audited the code to remove unsafe references to endpoint_state_map
entries as much as possible.

Fixes the following SIGSEGV:

Core was generated by `/usr/bin/scylla --log-to-syslog 1 --log-to-stdout
0 --default-log-level info --'.
Program terminated with signal SIGSEGV, Segmentation fault.
(this=<optimized out>) at /usr/include/c++/5/bits/stl_pair.h:127
127     in /usr/include/c++/5/bits/stl_pair.h
[Current thread is 1 (Thread 0x7f1448f39bc0 (LWP 107308))]

Fixes #2271

Message-Id: <529ec8ede6da884e844bc81d408b93044610afd2.1491960061.git.asias@scylladb.com>
(cherry picked from commit d27b47595b)
2017-04-13 13:18:41 +03:00
Pekka Enberg
2f107d3f61 Update seastar submodule
* seastar 211ab4a...bfa1cb2 (1):
  > resource: reduce default_reserve_memory size to fit low memory environment

Fixes #2186
2017-04-12 08:41:40 +03:00
Takuya ASADA
dd9afa4c93 dist/debian/debian/scylla-server.upstart: export SCYLLA_CONF, SCYLLA_HOME
We source the sysconfig file in the upstart script, but forgot to export
its variables into the environment.
So export them.

Fixes #2236

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1491209505-32293-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit b087616a6c)
2017-04-04 11:00:33 +03:00
Pekka Enberg
4021e2befb Update seastar submodule
* seastar f391f9e...211ab4a (1):
  > http: catch and count errors in read and respond

Fixes #2242
2017-04-03 12:02:43 +03:00
Calle Wilund
9b26a57288 commitlog/replayer: Bugfix: minimum rp broken, and cl reader offset too
The previous fix removed the additional insertion of "min rp" per source
shard based on whether we had processed existing CF:s or not (i.e. if
a CF does not exist as an sstable at all, we must tag it as zero-rp, and
make the whole shard for it start at that same zero).

This is bad in itself, because it can cause data loss. It does not cause
crashing, however. But it did uncover another old, lingering bug,
namely the commitlog reader initiating its stream wrongly when reading
from an actual offset (i.e. not processing the whole file).
We opened the file stream from the file offset, then tried
to read the file header and magic number from there -> boom, error.

Also, the rp-to-file mapping was potentially suboptimal due to using a
bucket iterator instead of the actual range.

I.e. three fixes:
* Reinstate min position guarding for unencountered CF:s
* Fix stream creation in the CL reader
* Fix segment map iterator use.

v2:
* Fix typo
Message-Id: <1490611637-12220-1-git-send-email-calle@scylladb.com>

(cherry picked from commit b12b65db92)
2017-03-28 10:35:04 +02:00
Pekka Enberg
31b5ef13c2 release: prepare for 1.7.rc2 2017-03-23 13:22:59 +02:00
Takuya ASADA
4bbee01288 dist/common/scripts/scylla_raid_setup: don't discard blocks at mkfs time
Discarding blocks on a large RAID volume takes too much time; the user may
suspect the script isn't working correctly. It's better to skip it at mkfs
time and discard directly on each volume instead.

Fixes #1896

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1489533460-30127-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit b65d58e90e)
2017-03-23 09:42:51 +02:00
Calle Wilund
3cc03f88fd commitlog_replayer: Do proper const-lookup of min positions for shards
Fixes #2173

Per-shard min positions can be unset if we never collected any
sstable/truncation info for it, yet replay segments of that id.

Wrap the lookups to handle "missing data -> default", which should have been
there in the first place.

Message-Id: <1490185101-12482-1-git-send-email-calle@scylladb.com>
(cherry picked from commit c3a510a08d)
2017-03-22 17:57:30 +02:00
Vlad Zolotarov
4179d8f7c4 Don't report a Tracing session ID unless the current query had a Tracing bit in its flags
Although the current master's behaviour is legal, it's suboptimal, and some
clients are sensitive to that. Let's fix that.

Fixes #2179

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1490115157-4657-1-git-send-email-vladz@scylladb.com>
2017-03-22 14:55:39 +02:00
Pekka Enberg
c20ddaf5af dist/docker: Use Scylla 1.7 RPM repository 2017-03-21 15:07:27 +02:00
Pekka Enberg
29dd48621b dist/docker: Expose Prometheus port by default
This patch exposes Scylla's Prometheus port by default. You can now use
the Scylla Monitoring project with the Docker image:

  https://github.com/scylladb/scylla-grafana-monitoring

To configure the IP addresses, use the 'docker inspect' command to
determine Scylla's IP address (assuming your running container is called
'some-scylla'):

  docker inspect --format='{{ .NetworkSettings.IPAddress }}' some-scylla

and then use that IP address in the prometheus/scylla_servers.yml
configuration file.

Fixes #1827

Message-Id: <1490008357-19627-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit 85a127bc78)
2017-03-20 15:30:15 +02:00
Amos Kong
87de77a5ea scylla_setup: match '-p' option of lsblk with strict pattern
On Ubuntu 14.04, lsblk doesn't have the '-p' option, but
`scylla_setup` tries to get the block list with `lsblk -pnr` and
triggers an error.

The current simple pattern matches anywhere in the help content, so it
might match the wrong options.
  scylla-test@amos-ubuntu-1404:~$ lsblk --help | grep -e -p
   -m, --perms          output info about permissions
   -P, --pairs          use key="value" output format

Let's use strict pattern to only match option at the head. Example:
  scylla-test@amos-ubuntu-1404:~$ lsblk --help | grep -e '^\s*-D'
   -D, --discard        print discard capabilities

Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <4f0f318353a43664e27da8a66855f5831457f061.1489712867.git.amos@scylladb.com>
(cherry picked from commit 468df7dd5f)
2017-03-20 08:11:57 +02:00
Raphael S. Carvalho
66c4dcba8e database: serialize sstable cleanup
We're cleaning up sstables in parallel. That means cleanup may need
almost twice the disk space used by all sstables being cleaned up,
if almost all sstables need cleanup and every one will discard an
insignificant portion of its whole data.
Given that cleanup is frequently issued when a node is running out of
disk space, we should serialize cleanups in every shard to decrease
the disk space requirement.

Fixes #192.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170317022911.10306-1-raphaelsc@scylladb.com>
(cherry picked from commit 7deeffc953)
2017-03-19 17:16:33 +02:00
Pekka Enberg
7cfdc08af9 cql3: Wire up functions for floating-point types
Fixes #2168
Message-Id: <1489661748-13924-1-git-send-email-penberg@scylladb.com>

(cherry picked from commit 3afd7f39b5)
2017-03-17 11:14:51 +02:00
Pekka Enberg
fdbe5caf41 Update scylla-ami submodule
* dist/ami/files/scylla-ami eedd12f...407e8f3 (1):
  > scylla_create_devices: check block device is exists

Fixes #2171
2017-03-17 11:14:17 +02:00
Tomasz Grabiec
522e62089b lsa: Fix debug-mode compilation error
By moving definitions of setters out of #ifdef

(cherry picked from commit 3609665b19)
2017-03-16 18:24:27 +01:00
Avi Kivity
699648d5a1 Merge "tests: Use allocating_section in lsa_async_eviction_test" from Tomasz
"The test allocates objects in batches (allocation is always under a reclaim
lock) of ~3MiB and assumes that it will always succeed because if we cross the
low water mark for free memory (20MiB) in seastar, reclamation will be
performed between the batches, asynchronously.

Unfortunately that's prevented by can_allocate_more_memory(), which fails
segment allocation when we're below the low water mark. LSA currently doesn't
allow allocating below the low water mark.

The solution which is employed across the code base is to use allocating_section,
so use it here as well.

Exposed by recent consistent failures on branch-1.7."

* 'tgrabiec/fix-lsa-async-eviction-test' of github.com:cloudius-systems/seastar-dev:
  tests: lsa_async_eviction_test: Allocate objects under allocating section
  lsa: Allow adjusting reserves in allocating_section

(cherry picked from commit 434a4fee28)
2017-03-16 12:44:54 +02:00
Calle Wilund
698a4e62d9 commitlog_replayer: Make replay parallel per shard
Fixes #2098

Replay previously did all segments in parallel on shard 0, which
caused heavy memory load. To reduce this and spread footprint
across shards, instead do X segments per shard, sequential per shard.

v2:
* Fixed whitespace errors

Message-Id: <1489503382-830-1-git-send-email-calle@scylladb.com>
(cherry picked from commit 078589c508)
2017-03-15 13:07:45 +02:00
Amnon Heiman
63bec22d28 database: requests_blocked_memory metric should be unique
Metric names should be unique per type.

requests_blocked_memory was registered twice, one as a gauge and one as
derived.

This is not allowed.

Fixes #2165

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <20170314162826.25521-1-amnon@scylladb.com>
(cherry picked from commit 0a2eba1b94)
2017-03-15 12:43:01 +02:00
Amnon Heiman
3d14e6e802 storage_proxy: metrics should have unique name
Metrics should have their own unique name. This patch renames the
queue-length metric throttled_writes to current_throttled_writes.

Without it, metrics will be reported twice under the same name, which
may cause errors in the prometheus server.

This could be related to scylladb/seastar#250

Fixes #2163.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <20170314081456.6392-1-amnon@scylladb.com>
(cherry picked from commit 295a981c61)
2017-03-15 12:43:01 +02:00
Glauber Costa
ea4a2dad96 raid script: improve test for mounted filesystem
The current test for whether or not the filesystem is mounted is weak
and will fail if multiple pieces of the hierarchy are mounted.

util-linux ships with a mountpoint command that does exactly that,
so we'll use that instead.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <1488742801-4907-1-git-send-email-glauber@scylladb.com>
(cherry picked from commit 2d620a25fb)
2017-03-13 17:04:58 +02:00
Glauber Costa
655e6197cb setup: support mount points in raid script
By default behavior is kept the same. There are deployments in which we
would like to mount data and commitlog to different places - as much as
we have avoided this up until this moment.

One example is EC2, where users may want to have the commitlog mounted
in the SSD drives for faster writes but keep the data in larger, less
expensive and durable EBS volumes.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <1488258215-2592-1-git-send-email-glauber@scylladb.com>
(cherry picked from commit 9e61a73654)
2017-03-13 16:51:15 +02:00
Asias He
1a1370d33e repair: Fix midpoint is not contained in the split range assertion in split_and_add
We have:

  auto halves = range.split(midpoint, dht::token_comparator());

We saw a case where midpoint == range.start; as a result, range.split
asserts because range.start is marked non-inclusive, so the
midpoint doesn't appear to be contain()ed in the range, hence the
assertion failure.

Fixes #2148

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Signed-off-by: Asias He <asias@scylladb.com>
Message-Id: <93af2697637c28fbca261ddfb8375a790824df65.1489023933.git.asias@scylladb.com>
(cherry picked from commit 39d2e59e7e)
2017-03-09 09:16:57 +01:00
Paweł Dziepak
7f17424a4e Merge "Avoid losing changes to keyspace parameters of system_auth and tracing keyspaces" from Tomek
"If a node is bootstrapped with auto_boostrap disabled, it will not
wait for schema sync before creating global keyspaces for auth and
tracing. When such schema changes are then reconciled with schema on
other nodes, they may overwrite changes made by the user before the
node was started, because they will have higher timestamp.

To prevent that, let's use minimum timestamp so that default schema
always looses with manual modifications. This is what Cassandra does.

Fixes #2129."

* tag 'tgrabiec/prevent-keyspace-metadata-loss-v1' of github.com:scylladb/seastar-dev:
  db: Create default auth and tracing keyspaces using lowest timestamp
  migration_manager: Append actual keyspace mutations with schema notifications

(cherry picked from commit 6db6d25f66)
2017-03-08 16:31:41 +02:00
Nadav Har'El
dd56f1bec7 sstable decompression: fix skip() to end of file
The skip() implementation for the compressed file input stream incorrectly
handled the case of skipping to the end of file: In that case we just need
to update the file pointer, but not skip anywhere in the compressed disk
file; In particular, we must NOT call locate() to find the relevant on-disk
compressed chunk, because there is none - locate() can only be called on
actual positions of bytes, not on the one-past-end-of-file position.

Fixes #2143

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20170308100057.23316-1-nyh@scylladb.com>
(cherry picked from commit 506e074ba4)
2017-03-08 12:35:39 +02:00
Pekka Enberg
5df61797d6 release: prepare for 1.7.rc1 2017-03-08 12:25:34 +02:00
Paweł Dziepak
b6db9e3d51 db: make do_apply_counter_update() propagate timeout to db_apply()
db_apply() expects to be given a time point at which the request will
time out. Originally, do_apply_counter_update() passed 0, which meant
that all requests were timed out if do_apply() needed to wait. The
caller of do_apply_counter_update() is already given a correct timeout
time point so the only thing needed to fix this problem it to propagate
it properly inside do_apply_counter_update() to the call to do_apply().

Fixes #2119.
Message-Id: <20170307104405.5843-1-pdziepak@scylladb.com>
2017-03-07 12:44:11 +01:00
Gleb Natapov
f2595bea85 memtable: do not open code logalloc::reclaim_lock use
logalloc::reclaim_lock prevents reclaim from running which may cause
regular allocation to fail although there is enough of free memory.
To solve that there is an allocation_section which acquire reclaim_lock
and if allocation fails it run reclaimer outside of a lock and retries
the allocation. The patch make use of allocation_section instead of
direct use of reclaim_lock in memtable code.

Fixes #2138.

Message-Id: <20170306160050.GC5902@scylladb.com>
(cherry picked from commit d7bdf16a16)
2017-03-07 11:16:15 +02:00
Gleb Natapov
e930ef0ee0 memtable: do not yield while holding reclaim_lock
Holding reclaim_lock while yielding may cause memory allocations to
fail.

Fixes #2139

Message-Id: <20170306153151.GA5902@scylladb.com>
(cherry picked from commit 5c4158daac)
2017-03-06 18:35:46 +02:00
Takuya ASADA
4cf0f88724 dist/redhat: enables discard on CentOS/RHEL RAID0
Since the CentOS/RHEL RAID module disables discard by default, we need to
enable it again in order to use it.

Fixes #2033

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1488407037-4795-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 6602221442)
2017-03-06 12:22:17 +02:00
Avi Kivity
372f07b06e Update scylla-ami submodule
* dist/ami/files/scylla-ami d5a4397...eedd12f (3):
  > Rewrite disk discovery to handle EBS and NVMEs.
  > add --developer-mode option
  > trivial cleanup: replace tab in indent
2017-03-04 13:31:08 +02:00
Tomasz Grabiec
0ccc6630a8 db: Fix overflow of gc_clock time point
If query_time is time_point::min(), which is used by
to_data_query_result(), the result of subtraction of
gc_grace_seconds() from query_time will overflow.

I don't think this bug would currently have user-perceivable
effects. This affects which tombstones are dropped, but in case of
to_data_query_result() uses, tombstones are not present in the final
data query result, and mutation_partition::do_compact() takes
tombstones into consideration while compacting before expiring them.

Fixes the following UBSAN report:

  /usr/include/c++/5.3.1/chrono:399:55: runtime error: signed integer overflow: -2147483648 - 604800 cannot be represented in type 'int'

Message-Id: <1488385429-14276-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 4b6e77e97e)
2017-03-01 18:50:19 +02:00
Takuya ASADA
b95a2338be dist/debian/dep: fix broken link of gcc-5, update it to 5.4.1-5
Since gcc-5/stretch=5.4.1-2 was removed from the apt repository, we are no
longer able to build gcc-5.

To avoid dead link, use launchpad.net archives instead of using apt-get source.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1488189378-5607-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit ba323e2074)
2017-03-01 17:13:42 +02:00
Tomasz Grabiec
f2d0ac9994 query: Fix invalid initialization of _memory_tracker by moving-from-self
Fixes the following UBSAN warning:

  core/semaphore.hh:293:74: runtime error: reference binding to misaligned address 0x0000006c55d7 for type 'struct basic_semaphore', which requires 8 byte alignment

Since the field was not initialized properly, this probably also fixes some
user-visible bug.
Message-Id: <1488368222-32009-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 0c84f00b16)
2017-03-01 11:56:49 +00:00
Gleb Natapov
56725de0db sstable: close sstable_writer's file if writing of sstable fails.
Failing to close a file properly before destroying the file object causes
crashes.

[tgrabiec: fixed typo]

Fixes #2122.

Message-Id: <20170221144858.GG11471@scylladb.com>
(cherry picked from commit 0977f4fdf8)
2017-02-28 11:04:26 +02:00
Avi Kivity
6f479c8999 Update seastar submodule
* seastar b14373b...f391f9e (1):
  > fix append_challenged_posix_file_impl::process_queue() to handle recursion

Fixes #2121.
2017-02-28 10:55:54 +02:00
Calle Wilund
8c0488bce9 messaging_service: Move log printout to actual listen start
Fixes #1845
The log printout happened before we had actually evaluated which endpoint
to create, and thus never included SSL info.
Message-Id: <1487766738-27797-1-git-send-email-calle@scylladb.com>

(cherry picked from commit d5f57bd047)
2017-02-23 13:18:33 +02:00
Avi Kivity
68dd11e275 config: enable new sharding algorithm for new deployments
Set murmur3_partitioner_ignore_msb_bits to 12 (enabling the new sharding
algorithm), but do this in scylla.yaml rather than the built-in defaults.
This avoids changing the configuration for existing clusters, as their
scylla.yaml file will not be updated during the upgrade.
Message-Id: <20170214123253.3933-1-avi@scylladb.com>

(cherry picked from commit 9b113ffd3e)
2017-02-22 11:23:46 +01:00
Tomasz Grabiec
a64c53d05f Update seastar submodule
* seastar fc27cec...b14373b (1):
  > reactor utilization should return the utilization in 0-1 range
2017-02-22 09:38:17 +01:00
Paweł Dziepak
42e7a59cca tests/cql_test_env: wait for storage service initialization
Message-Id: <20170221121130.14064-1-pdziepak@scylladb.com>
(cherry picked from commit 274bcd415a)
2017-02-21 17:06:10 +02:00
Avi Kivity
2cd019ee47 Merge "Fixes for counter cell locking" from Paweł
"This series contains some fixes and a unit test for the logic responsible
for locking counter cells."

* 'pdziepak/cell-locking-fixes/v1' of github.com:cloudius-systems/seastar-dev:
  tests: add test for counter cell locker
  cell_locking: fix schema upgrades
  cell_locker: make locker non-movable
  cell_locking: allow to be included by anyone

(cherry picked from commit b8c4b35b57)
2017-02-15 17:37:38 +02:00
Takuya ASADA
bc8b553bec dist/redhat: stop backporting ninja-build from Fedora, install it from EPEL instead
ninja-build-1.6.0-2.fc23.src.rpm was deleted from the Fedora web site for
some reason, but ninja-build-1.7.2-2 is available on EPEL, so we no longer
need to backport from Fedora.

Fixes #2087

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1487155729-13257-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 9c8515eeed)
2017-02-15 12:58:44 +02:00
Avi Kivity
0ba98be899 Update seastar submodule
* seastar bff963a...fc27cec (1):
  > collectd: send double correctly for gauge
2017-02-14 16:09:22 +02:00
Avi Kivity
d6899134a7 Update seastar submodule
* seastar f07f8ed...bff963a (1):
  > prometheus: send one MetricFamily per unique metric name
2017-02-13 11:50:43 +02:00
Avi Kivity
5253031110 seastar: point submodule at scylla-seastar.git
Allows backporting seastar patches independently of master.
2017-02-13 11:49:54 +02:00
Avi Kivity
a203c87f0d Merge "Disallow mixed schemas" from Paweł
"This series makes sure that schemas containing both counter and non-counter
regular or static columns are not allowed."

* 'pdziepak/disallow-mixed-schemas/v1' of github.com:cloudius-systems/seastar-dev:
  schema: verify that there are no both counter and non-counter columns
  test/mutation_source: specify whether to generate counter mutations
  tests/canonical_mutation: don't try to upgrade incompatible schemas

(cherry picked from commit 9e4ae0763d)
2017-02-07 18:04:24 +02:00
Gleb Natapov
37fc0e6840 storage_proxy: use storage_proxy clock instead of explicit lowres_clock
Merge commit 45b6070832 used a butchered version of the storage_proxy
patch to adjust to the rpc timer change instead of the one I sent. This
patch fixes the differences.

Message-Id: <20170206095237.GA7691@scylladb.com>
(cherry picked from commit 3c372525ed)
2017-02-06 12:51:52 +02:00
Avi Kivity
0429e5d8ea cell_locking: workaround for missing boost::container::small_vector
small_vector doesn't exist in Ubuntu 14.04's boost; use std::vector
instead.

(cherry picked from commit 6e9e28d5a3)
2017-02-05 20:49:43 +02:00
Avi Kivity
3c147437ac dist: add build dependency on automake
Needed by seastar's c-ares.

(cherry picked from commit 2510b756fc)
2017-02-05 20:17:27 +02:00
Takuya ASADA
e4b3f02286 dist/common/systemd: introduce scylla-housekeeping restart mode
scylla-housekeeping needs to run in 'restart mode' to check the version
during scylla-server restart; this wasn't invoked from a systemd timer, so
add it.

The existing scylla-housekeeping.timer is renamed to
scylla-housekeeping-daily.timer, since it runs in 'daily mode'.

Fixes #1953

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1486180031-18093-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit e82932b774)
2017-02-05 11:28:03 +02:00
Avi Kivity
5a8013e155 dist: add libtool build dependency for seastar/c-ares
(cherry picked from commit 4175f40da1)
2017-02-05 11:27:38 +02:00
Pekka Enberg
fdba5b8eac release: prepare for 1.7.rc0 2017-02-04 11:04:32 +02:00
Paweł Dziepak
558a52802a cell_locking: fix partition_entry::equal_compare
The comparator constructor took the schema by value instead of by const
l-value reference and, consequently, later tried to access an object that
had been destroyed long ago.
Message-Id: <20170202135853.8190-1-pdziepak@scylladb.com>

(cherry picked from commit 37b0c71f1d)
2017-02-03 21:28:42 +02:00
Avi Kivity
4f416c7272 Merge "Avoid avalanche of tasks after memtable flush" from Tomasz
"Before, the logic for releasing writes blocked on dirty worked like this:

  1) When region group size changes and it is not under pressure and there
     are some requests blocked, then schedule request releasing task

  2) request releasing task, if no pressure, runs one request and if there are
     still blocked requests, schedules next request releasing task

If requests don't change the size of the region group, then either some request
executes or there is a request releasing task scheduled. The amount of scheduled
tasks is at most 1, there is a single releasing thread.

However, if requests themselves would change the size of the group, then each
such change would schedule yet another request releasing thread, growing the task
queue size by one.

The group size can also change when memory is reclaimed from the groups (e.g.
when a group contains sparse segments). Compaction may start many request
releasing threads due to group size updates.

Such behavior is detrimental for performance and stability if there are a lot
of blocked requests. This can happen on 1.5 even with modest concurrency
because timed out requests stay in the queue. This is less likely on 1.6 where
they are dropped from the queue.

The releasing of tasks may start to dominate over other processes in the
system. When the amount of scheduled tasks reaches 1000, polling stops and
server becomes unresponsive until all of the released requests are done, which
is either when they start to block on dirty memory again or run out of blocked
requests. It may take a while to reach pressure condition after memtable flush
if it brings virtual dirty much below the threshold, which is currently the
case for workloads with overwrites producing sparse regions.

I saw this happening in a write workload from issue #2021 where the number of
request releasing threads grew into thousands.

Fix by ensuring there is at most one request releasing thread at a time. There
will be one releasing fiber per region group which is woken up when pressure is
lifted. It executes blocked requests until pressure occurs."

* tag 'tgrabiec/lsa-single-threaded-releasing-v2' of github.com:cloudius-systems/seastar-dev:
  tests: lsa: Add test for reclaimer starting and stopping
  tests: lsa: Add request releasing stress test
  lsa: Avoid avalanche releasing of requests
  lsa: Move definitions to .cc
  lsa: Simplify hard pressure notification management
  lsa: Do not start or stop reclaiming on hard pressure
  tests: lsa: Adjust to take into account that reclaimers are run synchronously
  lsa: Document and annotate reclaimer notification callbacks
  tests: lsa: Use with_timeout() in quiesce()

(cherry picked from commit 7a00dd6985)
2017-02-03 09:47:50 +01:00
149 changed files with 4173 additions and 1465 deletions

.gitmodules vendored
View File

@@ -1,6 +1,6 @@
[submodule "seastar"]
path = seastar
url = ../seastar
url = ../scylla-seastar
ignore = dirty
[submodule "swagger-ui"]
path = swagger-ui

View File

@@ -1,6 +1,6 @@
#!/bin/sh
VERSION=666.development
VERSION=1.7.5
if test -f version
then

View File

@@ -246,7 +246,8 @@ future<> auth::auth::setup() {
std::map<sstring, sstring> opts;
opts["replication_factor"] = "1";
auto ksm = keyspace_metadata::new_keyspace(AUTH_KS, "org.apache.cassandra.locator.SimpleStrategy", opts, true);
f = service::get_local_migration_manager().announce_new_keyspace(ksm, false);
// We use min_timestamp so that default keyspace metadata will loose with any manual adjustments. See issue #2129.
f = service::get_local_migration_manager().announce_new_keyspace(ksm, api::min_timestamp, false);
}
return f.then([] {

View File

@@ -22,13 +22,28 @@
#pragma once
#include <boost/intrusive/unordered_set.hpp>
#if __has_include(<boost/container/small_vector.hpp>)
#include <boost/container/small_vector.hpp>
template <typename T, size_t N>
using small_vector = boost::container::small_vector<T, N>;
#else
#include <vector>
template <typename T, size_t N>
using small_vector = std::vector<T>;
#endif
#include "fnv1a_hasher.hh"
#include "streamed_mutation.hh"
#include "mutation_partition.hh"
class cells_range {
using ids_vector_type = boost::container::small_vector<column_id, 5>;
using ids_vector_type = small_vector<column_id, 5>;
position_in_partition_view _position;
ids_vector_type _ids;
@@ -147,7 +162,7 @@ class cell_locker {
// temporarily removed from its parent partition_entry.
// Returns true if the cell_entry still exist in the new schema and
// should be reinserted.
bool upgrade(const schema& from, const schema& to, column_kind kind) {
bool upgrade(const schema& from, const schema& to, column_kind kind) noexcept {
auto& old_column_mapping = from.get_column_mapping();
auto& column = old_column_mapping.column_at(kind, _address.id);
auto cdef = to.get_column_definition(column.name());
@@ -170,7 +185,9 @@ class cell_locker {
}
~cell_entry() {
assert(is_linked());
if (!is_linked()) {
return;
}
unlink();
if (!--_parent._cell_count) {
delete &_parent;
@@ -286,10 +303,9 @@ class cell_locker {
};
class equal_compare {
schema_ptr _schema;
dht::decorated_key_equals_comparator _cmp;
public:
explicit equal_compare(const schema s) : _cmp(s) { }
explicit equal_compare(const schema& s) : _cmp(s) { }
bool operator()(const dht::decorated_key& dk, const partition_entry& pe) {
return _cmp(dk, pe._key);
}
@@ -386,22 +402,19 @@ struct cell_locker::locker {
partition_cells_range _range;
partition_cells_range::iterator _current_ck;
cells_range _cells_range;
cells_range::const_iterator _current_cell;
std::vector<locked_cell> _locks;
private:
void update_ck() {
if (!is_done()) {
_cells_range = *_current_ck;
_current_cell = _cells_range.begin();
_current_cell = _current_ck->begin();
}
}
future<> lock_next();
bool is_done() const { return _current_ck == _range.end(); }
std::vector<locked_cell> get() && { return std::move(_locks); }
public:
explicit locker(const ::schema& s, partition_entry& pe, partition_cells_range&& range)
: _hasher(s)
@@ -413,18 +426,22 @@ public:
update_ck();
}
future<std::vector<locked_cell>> lock_all() && {
locker(const locker&) = delete;
locker(locker&&) = delete;
future<> lock_all() {
// Cannot defer before first call to lock_next().
return lock_next().then([this] {
return do_until([this] { return is_done(); }, [this] {
return lock_next();
}).then([&] {
return std::move(*this).get();
});
});
}
std::vector<locked_cell> get() && { return std::move(_locks); }
};
inline
future<std::vector<locked_cell>> cell_locker::lock_cells(const dht::decorated_key& dk, partition_cells_range&& range) {
partition_entry::hasher pe_hash;
partition_entry::equal_compare pe_eq(*_schema);
@@ -460,14 +477,17 @@ future<std::vector<locked_cell>> cell_locker::lock_cells(const dht::decorated_ke
return make_ready_future<std::vector<locked_cell>>(std::move(locks));
}
return do_with(locker(*_schema, *it, std::move(range)), [] (auto& locker) mutable {
return std::move(locker).lock_all();
auto l = std::make_unique<locker>(*_schema, *it, std::move(range));
auto f = l->lock_all();
return f.then([l = std::move(l)] {
return std::move(*l).get();
});
}
inline
future<> cell_locker::locker::lock_next() {
while (!is_done()) {
if (_current_cell == _cells_range.end() || _cells_range.empty()) {
if (_current_cell == _current_ck->end()) {
++_current_ck;
update_ck();
continue;
@@ -475,7 +495,7 @@ future<> cell_locker::locker::lock_next() {
auto cid = *_current_cell++;
cell_address ca { position_in_partition(_cells_range.position()), cid };
cell_address ca { position_in_partition(_current_ck->position()), cid };
auto it = _partition_entry.cells().find(ca, _hasher, _eq_cmp);
if (it != _partition_entry.cells().end()) {
return it->lock().then([this, ce = it->shared_from_this()] () mutable {
@@ -483,27 +503,25 @@ future<> cell_locker::locker::lock_next() {
});
}
auto cell = make_lw_shared<cell_entry>(_partition_entry, position_in_partition(_cells_range.position()), cid);
auto cell = make_lw_shared<cell_entry>(_partition_entry, position_in_partition(_current_ck->position()), cid);
_partition_entry.insert(cell);
_locks.emplace_back(std::move(cell));
}
return make_ready_future<>();
}
inline
bool cell_locker::partition_entry::upgrade(schema_ptr new_schema) {
if (_schema == new_schema) {
return true;
}
auto buckets = std::make_unique<cells_type::bucket_type[]>(initial_bucket_count);
auto buckets = std::make_unique<cells_type::bucket_type[]>(_cells.bucket_count());
auto cells = cells_type(cells_type::bucket_traits(buckets.get(), _cells.bucket_count()),
cell_entry::hasher(*new_schema), cell_entry::equal_compare(*new_schema));
while (!_cells.empty()) {
auto it = _cells.begin();
auto& cell = *it;
_cells.erase(it);
_cells.clear_and_dispose([&] (cell_entry* cell_ptr) noexcept {
auto& cell = *cell_ptr;
auto kind = cell.position().is_static_row() ? column_kind::static_column
: column_kind::regular_column;
auto reinsert = cell.upgrade(*_schema, *new_schema, kind);
@@ -512,9 +530,16 @@ bool cell_locker::partition_entry::upgrade(schema_ptr new_schema) {
} else {
_cell_count--;
}
}
});
// bi::unordered_set move assignment is actually a swap.
// The original _buckets cannot be destroyed before the container using them is,
// so we need to explicitly make sure that the original _cells is no more.
_cells = std::move(cells);
auto destroy = [] (auto) { };
destroy(std::move(cells));
_buckets = std::move(buckets);
_schema = new_schema;
return _cell_count;
}


@@ -788,3 +788,23 @@ commitlog_total_space_in_mb: -1
# By default, Scylla binds all interfaces to the prometheus API
# It is possible to restrict the listening address to a specific one
# prometheus_address: 0.0.0.0
# Distribution of data among cores (shards) within a node
#
# Scylla distributes data within a node among shards, using a round-robin
# strategy:
# [shard0] [shard1] ... [shardN-1] [shard0] [shard1] ... [shardN-1] ...
#
# Scylla versions 1.6 and below used just one repetition of the pattern;
# this interfered with data placement among nodes (vnodes).
#
# Scylla versions 1.7 and above use 4096 repetitions of the pattern; this
# provides for better data distribution.
#
# The value below is log (base 2) of the number of repetitions.
#
# Set to 0 to avoid rewriting all data when upgrading from Scylla 1.6 and
# below.
#
# Keep at 12 for new clusters.
murmur3_partitioner_ignore_msb_bits: 12
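The round-robin-with-repetitions scheme described in the comment above can be sketched as follows. This is a hypothetical illustration of the idea, not Scylla's actual shard-selection code; the function name `shard_of` and the widening-multiply mapping are assumptions.

```cpp
#include <cstdint>

// Hypothetical sketch of the scheme described above - not Scylla's actual
// shard-selection code. With B = murmur3_partitioner_ignore_msb_bits, the
// 64-bit token space is cut into nshards << B equal slices, and slices are
// assigned round-robin: [shard0][shard1]...[shardN-1], repeated 2^B times.
// (Requires a compiler with __int128 support, e.g. GCC or Clang.)
static unsigned shard_of(uint64_t token, unsigned nshards, unsigned bits) {
    uint64_t slices = uint64_t(nshards) << bits;    // total slice count
    // Widening multiply maps the token to its slice index without overflow.
    unsigned __int128 slice = (unsigned __int128)token * slices >> 64;
    return unsigned(slice % nshards);   // round-robin within a repetition
}
```

With `bits = 0` this degenerates to a single repetition of the pattern (the Scylla 1.6 behavior); with `bits = 12` the pattern repeats 4096 times.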


@@ -230,6 +230,7 @@ scylla_tests = [
'tests/virtual_reader_test',
'tests/view_schema_test',
'tests/counter_test',
'tests/cell_locker_test',
]
apps = [
@@ -408,6 +409,7 @@ scylla_core = (['database.cc',
'cql3/selection/selector.cc',
'cql3/restrictions/statement_restrictions.cc',
'cql3/result_set.cc',
'cql3/variable_specifications.cc',
'db/consistency_level.cc',
'db/system_keyspace.cc',
'db/schema_tables.cc',
@@ -628,7 +630,7 @@ deps['tests/sstable_test'] += ['tests/sstable_datafile_test.cc']
deps['tests/bytes_ostream_test'] = ['tests/bytes_ostream_test.cc']
deps['tests/input_stream_test'] = ['tests/input_stream_test.cc']
deps['tests/UUID_test'] = ['utils/UUID_gen.cc', 'tests/UUID_test.cc']
deps['tests/UUID_test'] = ['utils/UUID_gen.cc', 'tests/UUID_test.cc', 'utils/uuid.cc']
deps['tests/murmur_hash_test'] = ['bytes.cc', 'utils/murmur_hash.cc', 'tests/murmur_hash_test.cc']
deps['tests/allocation_strategy_test'] = ['tests/allocation_strategy_test.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
deps['tests/anchorless_list_test'] = ['tests/anchorless_list_test.cc']


@@ -29,6 +29,15 @@ counter_id counter_id::local()
return counter_id(service::get_local_storage_service().get_local_id());
}
bool counter_id::less_compare_1_7_4::operator()(const counter_id& a, const counter_id& b) const
{
if (a._most_significant != b._most_significant) {
return a._most_significant < b._most_significant;
} else {
return a._least_significant < b._least_significant;
}
}
std::ostream& operator<<(std::ostream& os, const counter_id& id) {
return os << id.to_uuid();
}
@@ -42,6 +51,33 @@ std::ostream& operator<<(std::ostream& os, counter_cell_view ccv) {
return os << "{counter_cell timestamp: " << ccv.timestamp() << " shards: {" << ::join(", ", ccv.shards()) << "}}";
}
void counter_cell_builder::do_sort_and_remove_duplicates()
{
boost::range::sort(_shards, [] (auto& a, auto& b) { return a.id() < b.id(); });
std::vector<counter_shard> new_shards;
new_shards.reserve(_shards.size());
for (auto& cs : _shards) {
if (new_shards.empty() || new_shards.back().id() != cs.id()) {
new_shards.emplace_back(cs);
} else {
new_shards.back().apply(cs);
}
}
_shards = std::move(new_shards);
_sorted = true;
}
std::vector<counter_shard> counter_cell_view::shards_compatible_with_1_7_4() const
{
auto sorted_shards = boost::copy_range<std::vector<counter_shard>>(shards());
counter_id::less_compare_1_7_4 cmp;
boost::range::sort(sorted_shards, [&] (auto& a, auto& b) {
return cmp(a.id(), b.id());
});
return sorted_shards;
}
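The sort-and-merge logic in `do_sort_and_remove_duplicates()` can be illustrated with a minimal, self-contained sketch. The `shard` struct below is a stand-in with a plain `int` id instead of `counter_id`, and its `apply()` mirrors the newest-logical-clock-wins rule of `counter_shard`:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Minimal stand-in for counter_shard, assuming only the fields the merge
// needs; the real class carries a counter_id instead of a plain int.
struct shard {
    int id;
    int64_t value;
    int64_t logical_clock;
    // Mirrors counter_shard::apply(): the shard with the newer logical
    // clock wins; on a tie the existing value is kept.
    void apply(const shard& other) {
        if (logical_clock < other.logical_clock) {
            logical_clock = other.logical_clock;
            value = other.value;
        }
    }
};

// Sketch of do_sort_and_remove_duplicates(): sort by id, then collapse
// runs of equal ids by applying them onto the first occurrence.
static std::vector<shard> sort_and_remove_duplicates(std::vector<shard> shards) {
    std::sort(shards.begin(), shards.end(),
              [] (const shard& a, const shard& b) { return a.id < b.id; });
    std::vector<shard> merged;
    merged.reserve(shards.size());
    for (const auto& s : shards) {
        if (merged.empty() || merged.back().id != s.id) {
            merged.push_back(s);
        } else {
            merged.back().apply(s);
        }
    }
    return merged;
}
```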
bool counter_cell_view::apply_reversibly(atomic_cell_or_collection& dst, atomic_cell_or_collection& src)
{
// TODO: optimise for single shard existing in the other
@@ -139,8 +175,8 @@ stdx::optional<atomic_cell> counter_cell_view::difference(atomic_cell_view a, at
void transform_counter_updates_to_shards(mutation& m, const mutation* current_state, uint64_t clock_offset) {
// FIXME: allow current_state to be frozen_mutation
auto transform_new_row_to_shards = [clock_offset] (auto& cr) {
cr.row().cells().for_each_cell([clock_offset] (auto, atomic_cell_or_collection& ac_o_c) {
auto transform_new_row_to_shards = [clock_offset] (auto& cells) {
cells.for_each_cell([clock_offset] (auto, atomic_cell_or_collection& ac_o_c) {
auto acv = ac_o_c.as_atomic_cell();
if (!acv.is_live()) {
return; // continue -- we are in lambda
@@ -153,32 +189,22 @@ void transform_counter_updates_to_shards(mutation& m, const mutation* current_st
};
if (!current_state) {
transform_new_row_to_shards(m.partition().static_row());
for (auto& cr : m.partition().clustered_rows()) {
transform_new_row_to_shards(cr);
transform_new_row_to_shards(cr.row().cells());
}
return;
}
clustering_key::less_compare cmp(*m.schema());
auto& cstate = current_state->partition();
auto it = cstate.clustered_rows().begin();
auto end = cstate.clustered_rows().end();
for (auto& cr : m.partition().clustered_rows()) {
while (it != end && cmp(it->key(), cr.key())) {
++it;
}
if (it == end || cmp(cr.key(), it->key())) {
transform_new_row_to_shards(cr);
continue;
}
auto transform_row_to_shards = [clock_offset] (auto& transformee, auto& state) {
struct counter_shard_or_tombstone {
stdx::optional<counter_shard> shard;
tombstone tomb;
};
std::deque<std::pair<column_id, counter_shard_or_tombstone>> shards;
it->row().cells().for_each_cell([&] (column_id id, const atomic_cell_or_collection& ac_o_c) {
state.for_each_cell([&] (column_id id, const atomic_cell_or_collection& ac_o_c) {
auto acv = ac_o_c.as_atomic_cell();
if (!acv.is_live()) {
counter_shard_or_tombstone cs_o_t { { },
@@ -194,7 +220,7 @@ void transform_counter_updates_to_shards(mutation& m, const mutation* current_st
shards.emplace_back(std::make_pair(id, counter_shard_or_tombstone { counter_shard(*cs), tombstone() }));
});
cr.row().cells().for_each_cell([&] (column_id id, atomic_cell_or_collection& ac_o_c) {
transformee.for_each_cell([&] (column_id id, atomic_cell_or_collection& ac_o_c) {
auto acv = ac_o_c.as_atomic_cell();
if (!acv.is_live()) {
return; // continue -- we are in lambda
@@ -224,5 +250,22 @@ void transform_counter_updates_to_shards(mutation& m, const mutation* current_st
}
ac_o_c = ccb.build(acv.timestamp());
});
};
transform_row_to_shards(m.partition().static_row(), current_state->partition().static_row());
auto& cstate = current_state->partition();
auto it = cstate.clustered_rows().begin();
auto end = cstate.clustered_rows().end();
for (auto& cr : m.partition().clustered_rows()) {
while (it != end && cmp(it->key(), cr.key())) {
++it;
}
if (it == end || cmp(cr.key(), it->key())) {
transform_new_row_to_shards(cr.row().cells());
continue;
}
transform_row_to_shards(cr.row().cells(), it->row().cells());
}
}


@@ -36,6 +36,10 @@ class counter_id {
int64_t _least_significant;
int64_t _most_significant;
public:
static_assert(std::is_same<decltype(std::declval<utils::UUID>().get_least_significant_bits()), int64_t>::value
&& std::is_same<decltype(std::declval<utils::UUID>().get_most_significant_bits()), int64_t>::value,
"utils::UUID is expected to work with two signed 64-bit integers");
counter_id() = default;
explicit counter_id(utils::UUID uuid) noexcept
: _least_significant(uuid.get_least_significant_bits())
@@ -49,12 +53,20 @@ public:
bool operator<(const counter_id& other) const {
return to_uuid() < other.to_uuid();
}
bool operator>(const counter_id& other) const {
return other.to_uuid() < to_uuid();
}
bool operator==(const counter_id& other) const {
return to_uuid() == other.to_uuid();
}
bool operator!=(const counter_id& other) const {
return !(*this == other);
}
public:
// (Wrong) Counter ID ordering used by Scylla 1.7.4 and earlier.
struct less_compare_1_7_4 {
bool operator()(const counter_id& a, const counter_id& b) const;
};
public:
static counter_id local();
@@ -94,6 +106,14 @@ public:
int64_t value() const { return read<int64_t>(offset::value); }
int64_t logical_clock() const { return read<int64_t>(offset::logical_clock); }
bool operator==(const counter_shard_view& other) const {
return id() == other.id() && value() == other.value()
&& logical_clock() == other.logical_clock();
}
bool operator!=(const counter_shard_view& other) const {
return !(*this == other);
}
struct less_compare_by_id {
bool operator()(const counter_shard_view& x, const counter_shard_view& y) const {
return x.id() < y.id();
@@ -112,6 +132,18 @@ private:
static void write(const T& value, bytes::iterator& out) {
out = std::copy_n(reinterpret_cast<const char*>(&value), sizeof(T), out);
}
private:
// Shared logic for applying counter_shards and counter_shard_views.
// T is either counter_shard or basic_counter_shard_view<U>.
template<typename T>
counter_shard& do_apply(T&& other) noexcept {
auto other_clock = other.logical_clock();
if (_logical_clock < other_clock) {
_logical_clock = other_clock;
_value = other.value();
}
return *this;
}
public:
counter_shard(counter_id id, int64_t value, int64_t logical_clock) noexcept
: _id(id)
@@ -136,12 +168,11 @@ public:
}
counter_shard& apply(counter_shard_view other) noexcept {
auto other_clock = other.logical_clock();
if (_logical_clock < other_clock) {
_logical_clock = other_clock;
_value = other.value();
}
return *this;
return do_apply(other);
}
counter_shard& apply(const counter_shard& other) noexcept {
return do_apply(other);
}
static size_t serialized_size() {
@@ -156,6 +187,9 @@ public:
class counter_cell_builder {
std::vector<counter_shard> _shards;
bool _sorted = true;
private:
void do_sort_and_remove_duplicates();
public:
counter_cell_builder() = default;
counter_cell_builder(size_t shard_count) {
@@ -166,6 +200,21 @@ public:
_shards.emplace_back(cs);
}
void add_maybe_unsorted_shard(const counter_shard& cs) {
add_shard(cs);
if (_sorted && _shards.size() > 1) {
auto current = _shards.rbegin();
auto previous = std::next(current);
_sorted = current->id() > previous->id();
}
}
void sort_and_remove_duplicates() {
if (!_sorted) {
do_sort_and_remove_duplicates();
}
}
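The incremental sortedness check in `add_maybe_unsorted_shard()` can be sketched in isolation; plain `int` ids stand in for `counter_id`. Note the strict `>`: equal adjacent ids also clear the flag, so duplicates get merged by a later call to `sort_and_remove_duplicates()`.

```cpp
#include <vector>

// Sketch of counter_cell_builder's incremental sortedness tracking:
// instead of sorting on every insert, remember whether the sequence is
// still sorted and only sort lazily when it is not.
struct builder {
    std::vector<int> ids;
    bool sorted = true;
    void add_maybe_unsorted(int id) {
        ids.push_back(id);
        if (sorted && ids.size() > 1) {
            // Only the last two elements can change the answer; once the
            // flag is false it stays false (the guard above short-circuits).
            sorted = ids[ids.size() - 1] > ids[ids.size() - 2];
        }
    }
};
```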
size_t serialized_size() const {
return _shards.size() * counter_shard::serialized_size();
}
@@ -287,6 +336,13 @@ public:
return get_shard(counter_id::local());
}
bool operator==(const counter_cell_view& other) const {
return timestamp() == other.timestamp() && boost::equal(shards(), other.shards());
}
// Returns counter shards in an order that is compatible with Scylla 1.7.4.
std::vector<counter_shard> shards_compatible_with_1_7_4() const;
// Reversibly applies two counter cells, at least one of them must be live.
// Returns true iff dst was modified.
static bool apply_reversibly(atomic_cell_or_collection& dst, atomic_cell_or_collection& src);


@@ -1548,6 +1548,8 @@ basic_unreserved_keyword returns [sstring str]
| K_DISTINCT
| K_CONTAINS
| K_STATIC
| K_FROZEN
| K_TUPLE
| K_FUNCTION
| K_AGGREGATE
| K_SFUNC


@@ -67,6 +67,14 @@ functions::init() {
declare(aggregate_fcts::make_max_function<int64_t>());
declare(aggregate_fcts::make_min_function<int64_t>());
declare(aggregate_fcts::make_count_function<float>());
declare(aggregate_fcts::make_max_function<float>());
declare(aggregate_fcts::make_min_function<float>());
declare(aggregate_fcts::make_count_function<double>());
declare(aggregate_fcts::make_max_function<double>());
declare(aggregate_fcts::make_min_function<double>());
//FIXME:
//declare(aggregate_fcts::make_count_function<bytes>());
//declare(aggregate_fcts::make_max_function<bytes>());
@@ -78,15 +86,17 @@ functions::init() {
declare(make_blob_as_varchar_fct());
declare(aggregate_fcts::make_sum_function<int32_t>());
declare(aggregate_fcts::make_sum_function<int64_t>());
declare(aggregate_fcts::make_avg_function<int32_t>());
declare(aggregate_fcts::make_avg_function<int64_t>());
declare(aggregate_fcts::make_sum_function<float>());
declare(aggregate_fcts::make_sum_function<double>());
#if 0
declare(AggregateFcts.sumFunctionForFloat);
declare(AggregateFcts.sumFunctionForDouble);
declare(AggregateFcts.sumFunctionForDecimal);
declare(AggregateFcts.sumFunctionForVarint);
declare(AggregateFcts.avgFunctionForFloat);
declare(AggregateFcts.avgFunctionForDouble);
#endif
declare(aggregate_fcts::make_avg_function<int32_t>());
declare(aggregate_fcts::make_avg_function<int64_t>());
declare(aggregate_fcts::make_avg_function<float>());
declare(aggregate_fcts::make_avg_function<double>());
#if 0
declare(AggregateFcts.avgFunctionForVarint);
declare(AggregateFcts.avgFunctionForDecimal);
#endif


@@ -115,7 +115,7 @@ public:
if (restriction->is_slice()) {
throw exceptions::invalid_request_exception(sprint(
"PRIMARY KEY column \"%s\" cannot be restricted (preceding column \"%s\" is restricted by a non-EQ relation)",
_restrictions->next_column(new_column)->name_as_text(), new_column.name_as_text()));
last_column.name_as_text(), new_column.name_as_text()));
}
}


@@ -43,6 +43,7 @@
#include "schema_builder.hh"
#include "service/migration_manager.hh"
#include "boost/range/adaptor/map.hpp"
#include "stdx.hh"
namespace cql3 {
@@ -86,14 +87,14 @@ const sstring& alter_type_statement::keyspace() const
return _name.get_keyspace();
}
static int32_t get_idx_of_field(user_type type, shared_ptr<column_identifier> field)
static stdx::optional<uint32_t> get_idx_of_field(user_type type, shared_ptr<column_identifier> field)
{
for (uint32_t i = 0; i < type->field_names().size(); ++i) {
if (field->name() == type->field_names()[i]) {
return i;
return {i};
}
}
return -1;
return {};
}
void alter_type_statement::do_announce_migration(database& db, ::keyspace& ks, bool is_local_only)
@@ -168,7 +169,7 @@ alter_type_statement::add_or_alter::add_or_alter(const ut_name& name, bool is_ad
user_type alter_type_statement::add_or_alter::do_add(database& db, user_type to_update) const
{
if (get_idx_of_field(to_update, _field_name) >= 0) {
if (get_idx_of_field(to_update, _field_name)) {
throw exceptions::invalid_request_exception(sprint("Cannot add new field %s to type %s: a field of the same name already exists", _field_name->name(), _name.to_string()));
}
@@ -185,19 +186,19 @@ user_type alter_type_statement::add_or_alter::do_add(database& db, user_type to_
user_type alter_type_statement::add_or_alter::do_alter(database& db, user_type to_update) const
{
uint32_t idx = get_idx_of_field(to_update, _field_name);
if (idx < 0) {
stdx::optional<uint32_t> idx = get_idx_of_field(to_update, _field_name);
if (!idx) {
throw exceptions::invalid_request_exception(sprint("Unknown field %s in type %s", _field_name->name(), _name.to_string()));
}
auto previous = to_update->field_types()[idx];
auto previous = to_update->field_types()[*idx];
auto new_type = _field_type->prepare(db, keyspace())->get_type();
if (!new_type->is_compatible_with(*previous)) {
throw exceptions::invalid_request_exception(sprint("Type %s is incompatible with previous type %s of field %s in user type %s", _field_type->to_string(), previous->as_cql3_type()->to_string(), _field_name->name(), _name.to_string()));
}
std::vector<data_type> new_types(to_update->field_types());
new_types[idx] = new_type;
new_types[*idx] = new_type;
return user_type_impl::get_instance(to_update->_keyspace, to_update->_name, to_update->field_names(), std::move(new_types));
}
@@ -221,11 +222,11 @@ user_type alter_type_statement::renames::make_updated_type(database& db, user_ty
std::vector<bytes> new_names(to_update->field_names());
for (auto&& rename : _renames) {
auto&& from = rename.first;
int32_t idx = get_idx_of_field(to_update, from);
if (idx < 0) {
stdx::optional<uint32_t> idx = get_idx_of_field(to_update, from);
if (!idx) {
throw exceptions::invalid_request_exception(sprint("Unknown field %s in type %s", from->to_string(), _name.to_string()));
}
new_names[idx] = rename.second->name();
new_names[*idx] = rename.second->name();
}
auto&& updated = user_type_impl::get_instance(to_update->_keyspace, to_update->_name, std::move(new_names), to_update->field_types());
create_type_statement::check_for_duplicate_names(updated);


@@ -381,8 +381,18 @@ shared_ptr<prepared_statement>
batch_statement::prepare(database& db, cql_stats& stats) {
auto&& bound_names = get_bound_variables();
stdx::optional<sstring> first_ks;
stdx::optional<sstring> first_cf;
bool have_multiple_cfs = false;
std::vector<shared_ptr<cql3::statements::modification_statement>> statements;
for (auto&& parsed : _parsed_statements) {
if (!first_ks) {
first_ks = parsed->keyspace();
first_cf = parsed->column_family();
} else {
have_multiple_cfs = first_ks.value() != parsed->keyspace() || first_cf.value() != parsed->column_family();
}
statements.push_back(parsed->prepare(db, bound_names, stats));
}
@@ -392,8 +402,13 @@ batch_statement::prepare(database& db, cql_stats& stats) {
cql3::statements::batch_statement batch_statement_(bound_names->size(), _type, std::move(statements), std::move(prep_attrs), stats);
batch_statement_.validate();
std::vector<uint16_t> partition_key_bind_indices;
if (!have_multiple_cfs && batch_statement_.get_statements().size() > 0) {
partition_key_bind_indices = bound_names->get_partition_key_bind_indexes(batch_statement_.get_statements()[0]->s);
}
return ::make_shared<prepared>(make_shared(std::move(batch_statement_)),
bound_names->get_specifications());
bound_names->get_specifications(),
std::move(partition_key_bind_indices));
}
}


@@ -79,6 +79,57 @@ void drop_type_statement::validate(distributed<service::storage_proxy>& proxy, c
throw exceptions::invalid_request_exception(sprint("No user type named %s exists.", _name.to_string()));
}
}
// We don't want to drop a type that is still in use (mainly because
// if someone drops a type and recreates one with the same name but different
// definition with the previous name still in use, things can get messy).
// We have two places to check: 1) other user type that can nest the one
// we drop and 2) existing tables referencing the type (maybe in a nested
// way).
// This code is moved from schema_keyspace (akin to origin) because we cannot
// delay this check until after we've applied the mutations. If a type or
// table references the type we're dropping, we will a) get exceptions when parsing
// (which can be translated to invalid_request, but...) and, more importantly, b)
// we will leave those types/tables in a broken state.
// We managed to get through this before because we neither enforced hard
// cross reference between types when loading them, nor did we in fact
// probably ever run the scenario of dropping a referenced type and then
// actually using the referee.
//
// Now, this has a giant flaw. We are susceptible to race conditions here,
// since we could have a drop at the same time as a create type that references
// the dropped one, but we complete the check before the create is done,
// yet apply the drop mutations after -> inconsistent data!
// This problem is the same in origin, and I see no good way around it
// as long as the atomicity of schema modifications is based on
// the actual apply of mutations, because unlike other drops, this one isn't
// benevolent.
// I guess this is one case where users need to beware, and not mess with types
// concurrently!
auto&& type = old->second;
auto&& keyspace = type->_keyspace;
auto&& name = type->_name;
for (auto&& ut : all_types | boost::adaptors::map_values) {
if (ut->_keyspace == keyspace && ut->_name == name) {
continue;
}
if (ut->references_user_type(keyspace, name)) {
throw exceptions::invalid_request_exception(sprint("Cannot drop user type %s.%s as it is still used by user type %s", keyspace, type->get_name_as_string(), ut->get_name_as_string()));
}
}
for (auto&& cfm : ks.metadata()->cf_meta_data() | boost::adaptors::map_values) {
for (auto&& col : cfm->all_columns()) {
if (col.second->type->references_user_type(keyspace, name)) {
throw exceptions::invalid_request_exception(sprint("Cannot drop user type %s.%s as it is still used by table %s.%s", keyspace, type->get_name_as_string(), cfm->ks_name(), cfm->cf_name()));
}
}
}
} catch (no_such_keyspace& e) {
throw exceptions::invalid_request_exception(sprint("Cannot drop type in unknown keyspace %s", keyspace()));
}


@@ -597,9 +597,11 @@ namespace raw {
::shared_ptr<prepared_statement>
modification_statement::modification_statement::prepare(database& db, cql_stats& stats) {
schema_ptr schema = validation::validate_column_family(db, keyspace(), column_family());
auto bound_names = get_bound_variables();
auto statement = prepare(db, bound_names, stats);
return ::make_shared<prepared>(std::move(statement), *bound_names);
auto partition_key_bind_indices = bound_names->get_partition_key_bind_indexes(schema);
return ::make_shared<prepared>(std::move(statement), *bound_names, std::move(partition_key_bind_indices));
}
::shared_ptr<cql3::statements::modification_statement>


@@ -67,21 +67,22 @@ bool parsed_statement::uses_function(const sstring& ks_name, const sstring& func
}
prepared_statement::prepared_statement(::shared_ptr<cql_statement> statement_, std::vector<::shared_ptr<column_specification>> bound_names_)
prepared_statement::prepared_statement(::shared_ptr<cql_statement> statement_, std::vector<::shared_ptr<column_specification>> bound_names_, std::vector<uint16_t> partition_key_bind_indices)
: statement(std::move(statement_))
, bound_names(std::move(bound_names_))
, partition_key_bind_indices(std::move(partition_key_bind_indices))
{ }
prepared_statement::prepared_statement(::shared_ptr<cql_statement> statement_, const variable_specifications& names)
: prepared_statement(statement_, names.get_specifications())
prepared_statement::prepared_statement(::shared_ptr<cql_statement> statement_, const variable_specifications& names, const std::vector<uint16_t>& partition_key_bind_indices)
: prepared_statement(statement_, names.get_specifications(), partition_key_bind_indices)
{ }
prepared_statement::prepared_statement(::shared_ptr<cql_statement> statement_, variable_specifications&& names)
: prepared_statement(statement_, std::move(names).get_specifications())
prepared_statement::prepared_statement(::shared_ptr<cql_statement> statement_, variable_specifications&& names, std::vector<uint16_t>&& partition_key_bind_indices)
: prepared_statement(statement_, std::move(names).get_specifications(), std::move(partition_key_bind_indices))
{ }
prepared_statement::prepared_statement(::shared_ptr<cql_statement>&& statement_)
: prepared_statement(statement_, std::vector<::shared_ptr<column_specification>>())
: prepared_statement(statement_, std::vector<::shared_ptr<column_specification>>(), std::vector<uint16_t>())
{ }
}


@@ -60,12 +60,13 @@ public:
sstring raw_cql_statement;
const ::shared_ptr<cql_statement> statement;
const std::vector<::shared_ptr<column_specification>> bound_names;
std::vector<uint16_t> partition_key_bind_indices;
prepared_statement(::shared_ptr<cql_statement> statement_, std::vector<::shared_ptr<column_specification>> bound_names_);
prepared_statement(::shared_ptr<cql_statement> statement_, std::vector<::shared_ptr<column_specification>> bound_names_, std::vector<uint16_t> partition_key_bind_indices);
prepared_statement(::shared_ptr<cql_statement> statement_, const variable_specifications& names);
prepared_statement(::shared_ptr<cql_statement> statement_, const variable_specifications& names, const std::vector<uint16_t>& partition_key_bind_indices);
prepared_statement(::shared_ptr<cql_statement> statement_, variable_specifications&& names);
prepared_statement(::shared_ptr<cql_statement> statement_, variable_specifications&& names, std::vector<uint16_t>&& partition_key_bind_indices);
prepared_statement(::shared_ptr<cql_statement>&& statement_);
};


@@ -445,7 +445,9 @@ select_statement::select_statement(::shared_ptr<cf_name> cf_name,
prepare_limit(db, bound_names),
stats);
return ::make_shared<prepared>(std::move(stmt), std::move(*bound_names));
auto partition_key_bind_indices = bound_names->get_partition_key_bind_indexes(schema);
return ::make_shared<prepared>(std::move(stmt), std::move(*bound_names), std::move(partition_key_bind_indices));
}
::shared_ptr<restrictions::statement_restrictions>


@@ -0,0 +1,98 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* Copyright (C) 2015 ScyllaDB
*
* Modified by ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include "cql3/variable_specifications.hh"
namespace cql3 {
variable_specifications::variable_specifications(const std::vector<::shared_ptr<column_identifier>>& variable_names)
: _variable_names{variable_names}
, _specs{variable_names.size()}
, _target_columns{variable_names.size()}
{ }
::shared_ptr<variable_specifications> variable_specifications::empty() {
return ::make_shared<variable_specifications>(std::vector<::shared_ptr<column_identifier>>{});
}
size_t variable_specifications::size() const {
return _variable_names.size();
}
std::vector<::shared_ptr<column_specification>> variable_specifications::get_specifications() const & {
return std::vector<::shared_ptr<column_specification>>(_specs.begin(), _specs.end());
}
std::vector<::shared_ptr<column_specification>> variable_specifications::get_specifications() && {
return std::move(_specs);
}
std::vector<uint16_t> variable_specifications::get_partition_key_bind_indexes(schema_ptr schema) const {
auto count = schema->partition_key_columns().size();
std::vector<uint16_t> partition_key_positions(count, uint16_t(0));
std::vector<bool> set(count, false);
for (size_t i = 0; i < _target_columns.size(); i++) {
auto& target_column = _target_columns[i];
const auto* cdef = schema->get_column_definition(target_column->name->name());
if (cdef && cdef->is_partition_key()) {
partition_key_positions[cdef->position()] = i;
set[cdef->position()] = true;
}
}
for (bool b : set) {
if (!b) {
return {};
}
}
return partition_key_positions;
}
void variable_specifications::add(int32_t bind_index, ::shared_ptr<column_specification> spec) {
_target_columns[bind_index] = spec;
auto name = _variable_names[bind_index];
// Use the user name, if there is one
if (name) {
spec = ::make_shared<column_specification>(spec->ks_name, spec->cf_name, name, spec->type);
}
_specs[bind_index] = spec;
}
}


@@ -53,41 +53,26 @@ class variable_specifications final {
private:
std::vector<shared_ptr<column_identifier>> _variable_names;
std::vector<::shared_ptr<column_specification>> _specs;
std::vector<::shared_ptr<column_specification>> _target_columns;
public:
variable_specifications(const std::vector<::shared_ptr<column_identifier>>& variable_names)
: _variable_names{variable_names}
, _specs{variable_names.size()}
{ }
variable_specifications(const std::vector<::shared_ptr<column_identifier>>& variable_names);
/**
* Returns an empty instance of <code>VariableSpecifications</code>.
* @return an empty instance of <code>VariableSpecifications</code>
*/
static ::shared_ptr<variable_specifications> empty() {
return ::make_shared<variable_specifications>(std::vector<::shared_ptr<column_identifier>>{});
}
static ::shared_ptr<variable_specifications> empty();
size_t size() const {
return _variable_names.size();
}
size_t size() const;
std::vector<::shared_ptr<column_specification>> get_specifications() const & {
return std::vector<::shared_ptr<column_specification>>(_specs.begin(), _specs.end());
}
std::vector<::shared_ptr<column_specification>> get_specifications() const &;
std::vector<::shared_ptr<column_specification>> get_specifications() && {
return std::move(_specs);
}
std::vector<::shared_ptr<column_specification>> get_specifications() &&;
void add(int32_t bind_index, ::shared_ptr<column_specification> spec) {
auto name = _variable_names[bind_index];
// Use the user name, if there is one
if (name) {
spec = ::make_shared<column_specification>(spec->ks_name, spec->cf_name, name, spec->type);
}
_specs[bind_index] = spec;
}
std::vector<uint16_t> get_partition_key_bind_indexes(schema_ptr schema) const;
void add(int32_t bind_index, ::shared_ptr<column_specification> spec);
};
}


@@ -165,8 +165,9 @@ column_family::sstables_as_mutation_source() {
const dht::partition_range& r,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state) {
return make_sstable_reader(std::move(s), r, slice, pc, std::move(trace_state));
tracing::trace_state_ptr trace_state,
mutation_reader::forwarding fwd_mr) {
return make_sstable_reader(std::move(s), r, slice, pc, std::move(trace_state), fwd_mr);
});
}
@@ -364,12 +365,13 @@ class range_sstable_reader final : public combined_mutation_reader {
const io_priority_class& _pc;
tracing::trace_state_ptr _trace_state;
const query::partition_slice& _slice;
mutation_reader::forwarding _fwd_mr;
private:
std::unique_ptr<mutation_reader> create_reader(sstables::shared_sstable sst) {
tracing::trace(_trace_state, "Reading partition range {} from sstable {}", *_pr, seastar::value_of([&sst] { return sst->get_filename(); }));
// FIXME: make sstable::read_range_rows() return ::mutation_reader so that we can drop this wrapper.
mutation_reader reader =
make_mutation_reader<sstable_range_wrapping_reader>(sst, _s, *_pr, _slice, _pc);
make_mutation_reader<sstable_range_wrapping_reader>(sst, _s, *_pr, _slice, _pc, _fwd_mr);
if (sst->is_shared()) {
reader = make_filtering_reader(std::move(reader), belongs_to_current_shard);
}
@@ -381,13 +383,15 @@ public:
const dht::partition_range& pr,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state)
tracing::trace_state_ptr trace_state,
mutation_reader::forwarding fwd_mr)
: _s(s)
, _pr(&pr)
, _sstables(std::move(sstables))
, _pc(pc)
, _trace_state(std::move(trace_state))
, _slice(slice)
, _fwd_mr(fwd_mr)
{
auto ssts = _sstables->select(pr);
std::vector<mutation_reader*> readers;
@@ -506,7 +510,8 @@ column_family::make_sstable_reader(schema_ptr s,
const dht::partition_range& pr,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state) const {
tracing::trace_state_ptr trace_state,
mutation_reader::forwarding fwd_mr) const {
// restricts a reader's concurrency if the configuration specifies it
auto restrict_reader = [&] (mutation_reader&& in) {
auto&& config = [this, &pc] () -> const restricted_mutation_reader_config& {
@@ -522,6 +527,10 @@ column_family::make_sstable_reader(schema_ptr s,
}
};
// CAVEAT: if make_sstable_reader() is called on a single partition
// we want to optimize and read exactly this partition. As a
// consequence, fast_forward_to() will *NOT* work on the result,
// regardless of what the fwd_mr parameter says.
if (pr.is_singular() && pr.start()->value().has_key()) {
const dht::ring_position& pos = pr.start()->value();
if (dht::shard_of(pos.token()) != engine().cpu_id()) {
@@ -531,7 +540,7 @@ column_family::make_sstable_reader(schema_ptr s,
_stats.estimated_sstable_per_read, *pos.key(), slice, pc, std::move(trace_state)));
} else {
// range_sstable_reader is not movable so we need to wrap it
return restrict_reader(make_mutation_reader<range_sstable_reader>(std::move(s), _sstables, pr, slice, pc, std::move(trace_state)));
return restrict_reader(make_mutation_reader<range_sstable_reader>(std::move(s), _sstables, pr, slice, pc, std::move(trace_state), fwd_mr));
}
}
@@ -578,7 +587,8 @@ column_family::make_reader(schema_ptr s,
const dht::partition_range& range,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state) const {
tracing::trace_state_ptr trace_state,
mutation_reader::forwarding fwd_mr) const {
if (_virtual_reader) {
return _virtual_reader(s, range, slice, pc, trace_state);
}
@@ -607,13 +617,13 @@ column_family::make_reader(schema_ptr s,
// https://github.com/scylladb/scylla/issues/185
for (auto&& mt : *_memtables) {
readers.emplace_back(mt->make_reader(s, range, slice, pc));
readers.emplace_back(mt->make_reader(s, range, slice, pc, trace_state, fwd_mr));
}
if (_config.enable_cache) {
readers.emplace_back(_cache.make_reader(s, range, slice, pc, std::move(trace_state)));
readers.emplace_back(_cache.make_reader(s, range, slice, pc, std::move(trace_state), fwd_mr));
} else {
readers.emplace_back(make_sstable_reader(s, range, slice, pc, std::move(trace_state)));
readers.emplace_back(make_sstable_reader(s, range, slice, pc, std::move(trace_state), fwd_mr));
}
return make_combined_reader(std::move(readers));
@@ -629,10 +639,10 @@ column_family::make_streaming_reader(schema_ptr s,
readers.reserve(_memtables->size() + 1);
for (auto&& mt : *_memtables) {
readers.emplace_back(mt->make_reader(s, range, slice, pc));
readers.emplace_back(mt->make_reader(s, range, slice, pc, nullptr, mutation_reader::forwarding::no));
}
readers.emplace_back(make_sstable_reader(s, range, slice, pc, nullptr));
readers.emplace_back(make_sstable_reader(s, range, slice, pc, nullptr, mutation_reader::forwarding::no));
return make_combined_reader(std::move(readers));
}
@@ -644,17 +654,17 @@ column_family::make_streaming_reader(schema_ptr s,
auto& pc = service::get_local_streaming_read_priority();
auto source = mutation_source([this] (schema_ptr s, const dht::partition_range& range, const query::partition_slice& slice,
const io_priority_class& pc, tracing::trace_state_ptr trace_state) {
const io_priority_class& pc, tracing::trace_state_ptr trace_state, mutation_reader::forwarding fwd_mr) {
std::vector<mutation_reader> readers;
readers.reserve(_memtables->size() + 1);
for (auto&& mt : *_memtables) {
readers.emplace_back(mt->make_reader(s, range, slice, pc));
readers.emplace_back(mt->make_reader(s, range, slice, pc, trace_state, fwd_mr));
}
readers.emplace_back(make_sstable_reader(s, range, slice, pc, std::move(trace_state)));
readers.emplace_back(make_sstable_reader(s, range, slice, pc, std::move(trace_state), fwd_mr));
return make_combined_reader(std::move(readers));
});
return make_multi_range_reader(s, std::move(source), ranges, slice, pc, nullptr);
return make_multi_range_reader(s, std::move(source), ranges, slice, pc, nullptr, mutation_reader::forwarding::no);
}
future<std::vector<locked_cell>> column_family::lock_counter_cells(const mutation& m) {
@@ -939,7 +949,8 @@ column_family::seal_active_streaming_memtable_immediate() {
}).then([this, old, newtab] () {
add_sstable(newtab, {engine().cpu_id()});
trigger_compaction();
}).handle_exception([] (auto ep) {
}).handle_exception([newtab] (auto ep) {
newtab->mark_for_deletion();
dblog.error("failed to write streamed sstable: {}", ep);
return make_exception_future<>(ep);
});
@@ -977,7 +988,8 @@ future<> column_family::seal_active_streaming_memtable_big(streaming_memtable_bi
auto&& priority = service::get_local_streaming_write_priority();
return newtab->write_components(*old, incremental_backups_enabled(), priority, true).then([this, newtab, old, &smb] {
smb.sstables.emplace_back(newtab);
}).handle_exception([] (auto ep) {
}).handle_exception([newtab] (auto ep) {
newtab->mark_for_deletion();
dblog.error("failed to write streamed sstable: {}", ep);
return make_exception_future<>(ep);
});
@@ -1082,6 +1094,7 @@ column_family::try_flush_memtable_to_sstable(lw_shared_ptr<memtable> old) {
return make_ready_future<stop_iteration>(stop_iteration::yes);
});
} catch (...) {
newtab->mark_for_deletion();
dblog.error("failed to write sstable {}: {}", newtab->get_filename(), std::current_exception());
// If we failed this write we will try the write again and that will create a new flush reader
// that will decrease dirty memory again. So we need to reset the accounting.
@@ -1250,7 +1263,7 @@ void column_family::rebuild_statistics() {
// making the two ranges compatible when compiling with boost 1.55.
// No one is actually moving anything...
std::move(*_sstables->all()))) {
update_stats_for_new_sstable(tab->data_size(), tab->get_shards_for_this_sstable());
update_stats_for_new_sstable(tab->bytes_on_disk(), tab->get_shards_for_this_sstable());
}
}
@@ -1357,7 +1370,7 @@ column_family::compact_sstables(sstables::compaction_descriptor descriptor, bool
}
static bool needs_cleanup(const lw_shared_ptr<sstables::sstable>& sst,
const lw_shared_ptr<dht::token_range_vector>& owned_ranges,
const dht::token_range_vector& owned_ranges,
schema_ptr s) {
auto first = sst->get_first_partition_key();
auto last = sst->get_last_partition_key();
@@ -1366,7 +1379,7 @@ static bool needs_cleanup(const lw_shared_ptr<sstables::sstable>& sst,
dht::token_range sst_token_range = dht::token_range::make(first_token, last_token);
// return true iff sst partition range isn't fully contained in any of the owned ranges.
for (auto& r : *owned_ranges) {
for (auto& r : owned_ranges) {
if (r.contains(sst_token_range, dht::token_comparator())) {
return false;
}
@@ -1376,17 +1389,24 @@ static bool needs_cleanup(const lw_shared_ptr<sstables::sstable>& sst,
future<> column_family::cleanup_sstables(sstables::compaction_descriptor descriptor) {
dht::token_range_vector r = service::get_local_storage_service().get_local_ranges(_schema->ks_name());
auto owned_ranges = make_lw_shared<dht::token_range_vector>(std::move(r));
auto sstables_to_cleanup = make_lw_shared<std::vector<sstables::shared_sstable>>(std::move(descriptor.sstables));
return parallel_for_each(*sstables_to_cleanup, [this, owned_ranges = std::move(owned_ranges), sstables_to_cleanup] (auto& sst) {
if (!owned_ranges->empty() && !needs_cleanup(sst, owned_ranges, _schema)) {
return do_with(std::move(descriptor.sstables), std::move(r), [this] (auto& sstables, auto& owned_ranges) {
return do_for_each(sstables, [this, &owned_ranges] (auto& sst) {
if (!owned_ranges.empty() && !needs_cleanup(sst, owned_ranges, _schema)) {
return make_ready_future<>();
}
std::vector<sstables::shared_sstable> sstable_to_compact({ sst });
return this->compact_sstables(sstables::compaction_descriptor(std::move(sstable_to_compact), sst->get_sstable_level()), true);
// this semaphore ensures that only one cleanup will run per shard.
// That's to prevent the node from running out of space: when almost all
// sstables need cleanup, cleaning them in parallel could require almost
// twice the disk space used by those sstables.
static thread_local semaphore sem(1);
return with_semaphore(sem, 1, [this, &sst] {
return this->compact_sstables(sstables::compaction_descriptor({ sst }, sst->get_sstable_level()), true);
});
});
});
}
// FIXME: this is just an example, should be changed to something more general
@@ -1525,16 +1545,19 @@ future<> distributed_loader::open_sstable(distributed<database>& db, sstables::e
return db.invoke_on(column_family::calculate_shard_from_sstable_generation(comps.generation),
[&db, comps = std::move(comps), func = std::move(func)] (database& local) {
auto& cf = local.find_column_family(comps.ks, comps.cf);
auto f = sstables::sstable::load_shared_components(cf.schema(), cf._config.datadir, comps.generation, comps.version, comps.format);
return f.then([&db, comps = std::move(comps), func = std::move(func)] (sstables::sstable_open_info info) {
// shared components loaded, now opening sstable in all shards with shared components
return do_with(std::move(info), [&db, comps = std::move(comps), func = std::move(func)] (auto& info) {
return invoke_all_with_ptr(db, std::move(info.components),
[owners = info.owners, data = info.data.dup(), index = info.index.dup(), comps, func] (database& db, auto components) {
auto& cf = db.find_column_family(comps.ks, comps.cf);
return func(cf, sstables::foreign_sstable_open_info{std::move(components), owners, data, index});
return with_semaphore(local.sstable_load_concurrency_sem(), 1, [&db, &local, comps = std::move(comps), func = std::move(func)] {
auto& cf = local.find_column_family(comps.ks, comps.cf);
auto f = sstables::sstable::load_shared_components(cf.schema(), cf._config.datadir, comps.generation, comps.version, comps.format);
return f.then([&db, comps = std::move(comps), func = std::move(func)] (sstables::sstable_open_info info) {
// shared components loaded, now opening sstable in all shards with shared components
return do_with(std::move(info), [&db, comps = std::move(comps), func = std::move(func)] (auto& info) {
return invoke_all_with_ptr(db, std::move(info.components),
[owners = info.owners, data = info.data.dup(), index = info.index.dup(), comps, func] (database& db, auto components) {
auto& cf = db.find_column_family(comps.ks, comps.cf);
return func(cf, sstables::foreign_sstable_open_info{std::move(components), owners, data, index});
});
});
});
});
@@ -1706,7 +1729,7 @@ future<> distributed_loader::populate_column_family(distributed<database>& db, s
return make_ready_future<>();
});
}).then([verifier, sstdir, descriptor, ks = std::move(ks), cf = std::move(cf)] {
return parallel_for_each(*verifier, [sstdir = std::move(sstdir), ks = std::move(ks), cf = std::move(cf), descriptor] (auto v) {
return do_for_each(*verifier, [sstdir = std::move(sstdir), ks = std::move(ks), cf = std::move(cf), descriptor, verifier] (auto v) {
if (v.second == status::has_temporary_toc_file) {
unsigned long gen = v.first;
assert(descriptor->version);
@@ -1745,9 +1768,9 @@ database::database(const db::config& cfg)
: _stats(make_lw_shared<db_stats>())
, _cfg(std::make_unique<db::config>(cfg))
// Allow system tables a pool of 10 MB memory to write, but never block on other regions.
, _system_dirty_memory_manager(*this, 10 << 20)
, _dirty_memory_manager(*this, memory::stats().total_memory() * 0.45)
, _streaming_dirty_memory_manager(*this, memory::stats().total_memory() * 0.10)
, _system_dirty_memory_manager(*this, 10 << 20, cfg.virtual_dirty_soft_limit())
, _dirty_memory_manager(*this, memory::stats().total_memory() * 0.45, cfg.virtual_dirty_soft_limit())
, _streaming_dirty_memory_manager(*this, memory::stats().total_memory() * 0.10, cfg.virtual_dirty_soft_limit())
, _version(empty_version)
, _enable_incremental_backups(cfg.incremental_backups())
{
@@ -1802,7 +1825,7 @@ database::setup_metrics() {
});
_metrics.add_group("database", {
sm::make_gauge("requests_blocked_memory", [this] { return _dirty_memory_manager.region_group().blocked_requests(); },
sm::make_gauge("requests_blocked_memory_current", [this] { return _dirty_memory_manager.region_group().blocked_requests(); },
sm::description(
seastar::format("Holds the current number of requests blocked due to reaching the memory quota ({}B). "
"Non-zero value indicates that our bottleneck is memory and more specifically - the memory quota allocated for the \"database\" component.", _dirty_memory_manager.throttle_threshold()))),
@@ -2535,8 +2558,10 @@ column_family::as_mutation_source(tracing::trace_state_ptr trace_state) const {
return mutation_source([this, trace_state = std::move(trace_state)] (schema_ptr s,
const dht::partition_range& range,
const query::partition_slice& slice,
const io_priority_class& pc) {
return this->make_reader(std::move(s), range, slice, pc, std::move(trace_state));
const io_priority_class& pc,
tracing::trace_state_ptr trace_state,
mutation_reader::forwarding fwd_mr) {
return this->make_reader(std::move(s), range, slice, pc, std::move(trace_state), fwd_mr);
});
}
@@ -2663,7 +2688,7 @@ column_family::apply(const frozen_mutation& m, const schema_ptr& m_schema, const
do_apply(m, m_schema, rp);
}
future<frozen_mutation> database::do_apply_counter_update(column_family& cf, const frozen_mutation& fm, schema_ptr m_schema) {
future<frozen_mutation> database::do_apply_counter_update(column_family& cf, const frozen_mutation& fm, schema_ptr m_schema, timeout_clock::time_point timeout) {
auto m = fm.unfreeze(m_schema);
m.upgrade(cf.schema());
@@ -2689,9 +2714,9 @@ future<frozen_mutation> database::do_apply_counter_update(column_family& cf, con
cql_serialization_format::internal(), query::max_rows);
return do_with(std::move(slice), std::move(m), std::vector<locked_cell>(), stdx::optional<frozen_mutation>(),
[this, &cf] (const query::partition_slice& slice, mutation& m, std::vector<locked_cell>& locks,
[this, &cf, timeout] (const query::partition_slice& slice, mutation& m, std::vector<locked_cell>& locks,
stdx::optional<frozen_mutation>& fm) mutable {
return cf.lock_counter_cells(m).then([&, m_schema = cf.schema(), this] (std::vector<locked_cell> lcs) {
return cf.lock_counter_cells(m).then([&, timeout, m_schema = cf.schema(), this] (std::vector<locked_cell> lcs) {
locks = std::move(lcs);
// Before counter update is applied it needs to be transformed from
@@ -2702,7 +2727,7 @@ future<frozen_mutation> database::do_apply_counter_update(column_family& cf, con
return mutation_query(m_schema, cf.as_mutation_source({}),
dht::partition_range::make_singular(m.decorated_key()),
slice, query::max_rows, query::max_partitions,
gc_clock::now(), { }).then([this, &cf, &m, &fm, m_schema] (auto result) {
gc_clock::now(), { }).then([this, timeout, &cf, &m, &fm, m_schema] (auto result) {
// ...now, that we got existing state of all affected counter
// cells we can look for our shard in each of them, increment
@@ -2714,9 +2739,8 @@ future<frozen_mutation> database::do_apply_counter_update(column_family& cf, con
transform_counter_updates_to_shards(m, mopt ? &*mopt : nullptr, cf.failed_counter_applies_to_memtable());
// FIXME: oh dear, another freeze
// FIXME: timeout
fm = freeze(m);
return this->do_apply(m_schema, *fm, { });
return this->do_apply(m_schema, *fm, timeout);
}).then([&fm] {
return std::move(*fm);
});
@@ -2854,7 +2878,7 @@ future<> dirty_memory_manager::flush_when_needed() {
});
}
void dirty_memory_manager::start_reclaiming() {
void dirty_memory_manager::start_reclaiming() noexcept {
_should_flush.signal();
}
@@ -2876,7 +2900,7 @@ future<frozen_mutation> database::apply_counter_update(schema_ptr s, const froze
}
try {
auto& cf = find_column_family(m.column_family_id());
return do_apply_counter_update(cf, m, s);
return do_apply_counter_update(cf, m, s, timeout);
} catch (no_such_column_family&) {
dblog.error("Attempting to mutate non-existent table {}", m.column_family_id());
throw;
@@ -3103,6 +3127,10 @@ future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_fun
}
return f.then([&cf, truncated_at] {
return cf.discard_sstables(truncated_at).then([&cf, truncated_at](db::replay_position rp) {
// TODO: verify that rp == db::replay_position() is because we have no sstables (and no data flushed)
if (rp == db::replay_position()) {
return make_ready_future();
}
// TODO: indexes.
return db::system_keyspace::save_truncation_record(cf, truncated_at, rp);
});


@@ -149,7 +149,7 @@ class dirty_memory_manager: public logalloc::region_group_reclaimer {
std::unordered_map<const logalloc::region*, flush_permit> _flush_manager;
future<> _waiting_flush;
virtual void start_reclaiming() override;
virtual void start_reclaiming() noexcept override;
bool has_pressure() const {
return over_soft_limit();
@@ -193,8 +193,8 @@ public:
//
// We then set the soft limit to 80 % of the virtual dirty hard limit, which is equal to 40 % of
// the user-supplied threshold.
dirty_memory_manager(database& db, size_t threshold)
: logalloc::region_group_reclaimer(threshold / 2, threshold * 0.40)
dirty_memory_manager(database& db, size_t threshold, double soft_limit)
: logalloc::region_group_reclaimer(threshold / 2, threshold * soft_limit / 2)
, _db(&db)
, _region_group(*this)
, _flush_serializer(1)
@@ -582,7 +582,8 @@ private:
const dht::partition_range& range,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state) const;
tracing::trace_state_ptr trace_state,
mutation_reader::forwarding fwd_mr) const;
mutation_source sstables_as_mutation_source();
partition_presence_checker make_partition_presence_checker(lw_shared_ptr<sstables::sstable_set>);
@@ -624,7 +625,8 @@ public:
const dht::partition_range& range = query::full_partition_range,
const query::partition_slice& slice = query::full_slice,
const io_priority_class& pc = default_priority_class(),
tracing::trace_state_ptr trace_state = nullptr) const;
tracing::trace_state_ptr trace_state = nullptr,
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const;
// The streaming mutation reader differs from the regular mutation reader in that:
// - Reflects all writes accepted by replica prior to creation of the
@@ -1076,6 +1078,7 @@ private:
::cf_stats _cf_stats;
static constexpr size_t max_concurrent_reads() { return 100; }
static constexpr size_t max_system_concurrent_reads() { return 10; }
static constexpr size_t max_concurrent_sstable_loads() { return 3; }
struct db_stats {
uint64_t total_writes = 0;
uint64_t total_writes_failed = 0;
@@ -1101,6 +1104,8 @@ private:
semaphore _system_read_concurrency_sem{max_system_concurrent_reads()};
restricted_mutation_reader_config _system_read_concurrency_config;
semaphore _sstable_load_concurrency_sem{max_concurrent_sstable_loads()};
std::unordered_map<sstring, keyspace> _keyspaces;
std::unordered_map<utils::UUID, lw_shared_ptr<column_family>> _column_families;
std::unordered_map<std::pair<sstring, sstring>, utils::UUID, utils::tuple_hash> _ks_cf_to_uuid;
@@ -1126,7 +1131,7 @@ private:
query::result_memory_limiter _result_memory_limiter;
future<frozen_mutation> do_apply_counter_update(column_family& cf, const frozen_mutation& fm, schema_ptr m_schema);
future<frozen_mutation> do_apply_counter_update(column_family& cf, const frozen_mutation& fm, schema_ptr m_schema, timeout_clock::time_point timeout);
public:
static utils::UUID empty_version;
@@ -1257,6 +1262,9 @@ public:
semaphore& system_keyspace_read_concurrency_sem() {
return _system_read_concurrency_sem;
}
semaphore& sstable_load_concurrency_sem() {
return _sstable_load_concurrency_sem;
}
friend class distributed_loader;
};


@@ -492,6 +492,7 @@ public:
if (shutdown) {
auto me = shared_from_this();
return _gate.close().then([me] {
me->_closed = true;
return me->sync().finally([me] {
// When we get here, nothing should add ops,
// and we should have waited out all pending.
@@ -1281,6 +1282,7 @@ future<> db::commitlog::segment_manager::shutdown() {
return _gate.close().then(std::bind(&segment_manager::sync_all_segments, this, true));
});
}).finally([this] {
discard_unused_segments();
// Now that the gate is closed and requests completed we are sure nobody else will pop()
return clear_reserve_segments().finally([this] {
return std::move(_reserve_replenisher).then_wrapped([this] (auto f) {
@@ -1588,7 +1590,7 @@ db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type
bool failed = false;
work(file f, position_type o = 0)
: f(f), fin(make_file_input_stream(f, o, make_file_input_stream_options())), start_off(o) {
: f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options())), start_off(o) {
}
work(work&&) = default;


@@ -34,48 +34,26 @@
#include "idl/mutation.dist.impl.hh"
#include "idl/commitlog.dist.impl.hh"
commitlog_entry::commitlog_entry(stdx::optional<column_mapping> mapping, frozen_mutation&& mutation)
: _mapping(std::move(mapping))
, _mutation_storage(std::move(mutation))
, _mutation(*_mutation_storage)
{ }
commitlog_entry::commitlog_entry(stdx::optional<column_mapping> mapping, const frozen_mutation& mutation)
: _mapping(std::move(mapping))
, _mutation(mutation)
{ }
commitlog_entry::commitlog_entry(commitlog_entry&& ce)
: _mapping(std::move(ce._mapping))
, _mutation_storage(std::move(ce._mutation_storage))
, _mutation(_mutation_storage ? *_mutation_storage : ce._mutation)
{
}
commitlog_entry& commitlog_entry::operator=(commitlog_entry&& ce)
{
if (this != &ce) {
this->~commitlog_entry();
new (this) commitlog_entry(std::move(ce));
}
return *this;
}
commitlog_entry commitlog_entry_writer::get_entry() const {
if (_with_schema) {
return commitlog_entry(_schema->get_column_mapping(), _mutation);
} else {
return commitlog_entry({}, _mutation);
}
template<typename Output>
void commitlog_entry_writer::serialize(Output& out) const {
[this, wr = ser::writer_of_commitlog_entry<Output>(out)] () mutable {
if (_with_schema) {
return std::move(wr).write_mapping(_schema->get_column_mapping());
} else {
return std::move(wr).skip_mapping();
}
}().write_mutation(_mutation).end_commitlog_entry();
}
void commitlog_entry_writer::compute_size() {
_size = ser::get_sizeof(get_entry());
seastar::measuring_output_stream ms;
serialize(ms);
_size = ms.size();
}
void commitlog_entry_writer::write(data_output& out) const {
seastar::simple_output_stream str(out.reserve(size()), size());
ser::serialize(str, get_entry());
serialize(str);
}
commitlog_entry_reader::commitlog_entry_reader(const temporary_buffer<char>& buffer)


@@ -31,15 +31,10 @@ namespace stdx = std::experimental;
class commitlog_entry {
stdx::optional<column_mapping> _mapping;
stdx::optional<frozen_mutation> _mutation_storage;
const frozen_mutation& _mutation;
frozen_mutation _mutation;
public:
commitlog_entry(stdx::optional<column_mapping> mapping, frozen_mutation&& mutation);
commitlog_entry(stdx::optional<column_mapping> mapping, const frozen_mutation& mutation);
commitlog_entry(commitlog_entry&&);
commitlog_entry(const commitlog_entry&) = delete;
commitlog_entry& operator=(commitlog_entry&&);
commitlog_entry& operator=(const commitlog_entry&) = delete;
commitlog_entry(stdx::optional<column_mapping> mapping, frozen_mutation&& mutation)
: _mapping(std::move(mapping)), _mutation(std::move(mutation)) { }
const stdx::optional<column_mapping>& mapping() const { return _mapping; }
const frozen_mutation& mutation() const { return _mutation; }
};
@@ -50,8 +45,9 @@ class commitlog_entry_writer {
bool _with_schema = true;
size_t _size;
private:
template<typename Output>
void serialize(Output&) const;
void compute_size();
commitlog_entry get_entry() const;
public:
commitlog_entry_writer(schema_ptr s, const frozen_mutation& fm)
: _schema(std::move(s)), _mutation(fm)
@@ -88,4 +84,4 @@ public:
const stdx::optional<column_mapping>& get_column_mapping() const { return _ce.mapping(); }
const frozen_mutation& mutation() const { return _ce.mutation(); }
};
};


@@ -61,13 +61,19 @@
static logging::logger logger("commitlog_replayer");
struct column_mappings {
std::unordered_map<table_schema_version, column_mapping> map;
future<> stop() { return make_ready_future<>(); }
};
class db::commitlog_replayer::impl {
seastar::sharded<column_mappings> _column_mappings;
struct column_mappings {
std::unordered_map<table_schema_version, column_mapping> map;
future<> stop() { return make_ready_future<>(); }
};
// we want the processing methods to be const, since they use
// shard-sharing of data -> read only
// this one is special since it is thread local.
// Should actually make sharded::local a const function (it does
// not modify content), but...
mutable seastar::sharded<column_mappings> _column_mappings;
friend class db::commitlog_replayer;
public:
impl(seastar::sharded<cql3::query_processor>& db);
@@ -94,13 +100,35 @@ public:
}
};
future<> process(stats*, temporary_buffer<char> buf, replay_position rp);
future<stats> recover(sstring file);
// move start/stop of the thread-local bookkeeping to "top level"
// and also make sure to assert on it actually being started.
future<> start() {
return _column_mappings.start();
}
future<> stop() {
return _column_mappings.stop();
}
future<> process(stats*, temporary_buffer<char> buf, replay_position rp) const;
future<stats> recover(sstring file) const;
typedef std::unordered_map<utils::UUID, replay_position> rp_map;
typedef std::unordered_map<unsigned, rp_map> shard_rpm_map;
typedef std::unordered_map<unsigned, replay_position> shard_rp_map;
replay_position min_pos(unsigned shard) const {
auto i = _min_pos.find(shard);
return i != _min_pos.end() ? i->second : replay_position();
}
replay_position cf_min_pos(const utils::UUID& uuid, unsigned shard) const {
auto i = _rpm.find(shard);
if (i == _rpm.end()) {
return replay_position();
}
auto j = i->second.find(uuid);
return j != i->second.end() ? j->second : replay_position();
}
seastar::sharded<cql3::query_processor>&
_qp;
shard_rpm_map
@@ -175,7 +203,6 @@ future<> db::commitlog_replayer::impl::init() {
}
}
}
for (auto&p : _min_pos) {
logger.debug("minimum position for shard {}: {}", p.first, p.second);
}
@@ -188,9 +215,11 @@ future<> db::commitlog_replayer::impl::init() {
}
future<db::commitlog_replayer::impl::stats>
db::commitlog_replayer::impl::recover(sstring file) {
db::commitlog_replayer::impl::recover(sstring file) const {
assert(_column_mappings.local_is_initialized());
replay_position rp{commitlog::descriptor(file)};
auto gp = _min_pos[rp.shard_id()];
auto gp = min_pos(rp.shard_id());
if (rp.id < gp.id) {
logger.debug("skipping replay of fully-flushed {}", file);
@@ -220,7 +249,7 @@ db::commitlog_replayer::impl::recover(sstring file) {
});
}
future<> db::commitlog_replayer::impl::process(stats* s, temporary_buffer<char> buf, replay_position rp) {
future<> db::commitlog_replayer::impl::process(stats* s, temporary_buffer<char> buf, replay_position rp) const {
try {
commitlog_entry_reader cer(buf);
@@ -238,17 +267,16 @@ future<> db::commitlog_replayer::impl::process(stats* s, temporary_buffer<char>
const column_mapping& src_cm = cm_it->second;
auto shard_id = rp.shard_id();
if (rp < _min_pos[shard_id]) {
if (rp < min_pos(shard_id)) {
logger.trace("entry {} is less than global min position. skipping", rp);
s->skipped_mutations++;
return make_ready_future<>();
}
auto uuid = fm.column_family_id();
auto& map = _rpm[shard_id];
auto i = map.find(uuid);
if (i != map.end() && rp <= i->second) {
logger.trace("entry {} at {} is younger than recorded replay position {}. skipping", fm.column_family_id(), rp, i->second);
auto cf_rp = cf_min_pos(uuid, shard_id);
if (rp <= cf_rp) {
logger.trace("entry {} at {} is younger than recorded replay position {}. skipping", fm.column_family_id(), rp, cf_rp);
s->skipped_mutations++;
return make_ready_future<>();
}
@@ -323,42 +351,55 @@ future<db::commitlog_replayer> db::commitlog_replayer::create_replayer(seastar::
}
future<> db::commitlog_replayer::recover(std::vector<sstring> files) {
return _impl->_column_mappings.start().then([this, files = std::move(files)] {
typedef std::unordered_multimap<unsigned, sstring> shard_file_map;
logger.info("Replaying {}", join(", ", files));
return map_reduce(files, [this](auto f) {
logger.debug("Replaying {}", f);
return _impl->recover(f).then([f](impl::stats stats) {
if (stats.corrupt_bytes != 0) {
logger.warn("Corrupted file: {}. {} bytes skipped.", f, stats.corrupt_bytes);
}
logger.debug("Log replay of {} complete, {} replayed mutations ({} invalid, {} skipped)"
, f
, stats.applied_mutations
, stats.invalid_mutations
, stats.skipped_mutations
// pre-compute work per shard already.
auto map = ::make_lw_shared<shard_file_map>();
for (auto& f : files) {
commitlog::descriptor d(f);
replay_position p = d;
map->emplace(p.shard_id() % smp::count, std::move(f));
}
return _impl->start().then([this, map] {
return map_reduce(smp::all_cpus(), [this, map](unsigned id) {
return smp::submit_to(id, [this, id, map]() {
auto total = ::make_lw_shared<impl::stats>();
// TODO: or something. For now, we do this serialized per shard,
// to reduce mutation congestion. We could probably (says avi)
// do 2 segments in parallel or something, but let's use this first.
auto range = map->equal_range(id);
return do_for_each(range.first, range.second, [this, total](const std::pair<unsigned, sstring>& p) {
auto&f = p.second;
logger.debug("Replaying {}", f);
return _impl->recover(f).then([f, total](impl::stats stats) {
if (stats.corrupt_bytes != 0) {
logger.warn("Corrupted file: {}. {} bytes skipped.", f, stats.corrupt_bytes);
}
logger.debug("Log replay of {} complete, {} replayed mutations ({} invalid, {} skipped)"
, f
, stats.applied_mutations
, stats.invalid_mutations
, stats.skipped_mutations
);
*total += stats;
});
}).then([total] {
return make_ready_future<impl::stats>(*total);
});
});
}, impl::stats(), std::plus<impl::stats>()).then([](impl::stats totals) {
logger.info("Log replay complete, {} replayed mutations ({} invalid, {} skipped)"
, totals.applied_mutations
, totals.invalid_mutations
, totals.skipped_mutations
);
return make_ready_future<impl::stats>(stats);
}).handle_exception([f](auto ep) -> future<impl::stats> {
logger.error("Error recovering {}: {}", f, ep);
try {
std::rethrow_exception(ep);
} catch (std::invalid_argument&) {
logger.error("Scylla cannot process {}. Make sure to fully flush all Cassandra commit log files to sstable before migrating.", f);
throw;
} catch (...) {
throw;
}
});
}, impl::stats(), std::plus<impl::stats>()).then([](impl::stats totals) {
logger.info("Log replay complete, {} replayed mutations ({} invalid, {} skipped)"
, totals.applied_mutations
, totals.invalid_mutations
, totals.skipped_mutations
);
}).finally([this] {
return _impl->_column_mappings.stop();
return _impl->stop();
});
});
}
future<> db::commitlog_replayer::recover(sstring f) {

View File

@@ -326,7 +326,7 @@ public:
val(sstable_preemptive_open_interval_in_mb, uint32_t, 50, Unused, \
"When compacting, the replacement opens SSTables before they are completely written and uses them in place of the prior SSTables for any range previously written. This setting helps to smoothly transfer reads between the SSTables by reducing page cache churn and keeps hot rows hot." \
) \
val(defragment_memory_on_idle, bool, true, Used, "Set to true to defragment memory when the cpu is idle. This reduces the amount of work Scylla performs when processing client requests.") \
val(defragment_memory_on_idle, bool, false, Used, "When set to true, will defragment memory when the cpu is idle. This reduces the amount of work Scylla performs when processing client requests.") \
/* Memtable settings */ \
val(memtable_allocation_type, sstring, "heap_buffers", Invalid, \
"Specify the way Cassandra allocates and manages memtable memory. See Off-heap memtables in Cassandra 2.1. Options are:\n" \
@@ -729,6 +729,8 @@ public:
val(replace_address_first_boot, sstring, "", Used, "Like replace_address option, but if the node has been bootstrapped successfully it will be ignored. Same as -Dcassandra.replace_address_first_boot.") \
val(override_decommission, bool, false, Used, "Set true to force a decommissioned node to join the cluster") \
val(ring_delay_ms, uint32_t, 30 * 1000, Used, "Time a node waits to hear from other nodes before joining the ring in milliseconds. Same as -Dcassandra.ring_delay_ms in cassandra.") \
val(fd_max_interval_ms, uint32_t, 2 * 1000, Used, "The maximum failure_detector interval time in milliseconds. Intervals larger than the maximum will be ignored. Larger clusters may need to increase the default.") \
val(fd_initial_value_ms, uint32_t, 2 * 1000, Used, "The initial failure_detector interval time in milliseconds.") \
val(shutdown_announce_in_ms, uint32_t, 2 * 1000, Used, "Time a node waits after sending gossip shutdown message in milliseconds. Same as -Dcassandra.shutdown_announce_in_ms in cassandra.") \
val(developer_mode, bool, false, Used, "Relax environment checks. Setting to true can reduce performance and reliability significantly.") \
val(skip_wait_for_gossip_to_settle, int32_t, -1, Used, "An integer to configure the wait for gossip to settle. -1: wait normally, 0: do not wait at all, n: wait for at most n polls. Same as -Dcassandra.skip_wait_for_gossip_to_settle in cassandra.") \
@@ -739,6 +741,7 @@ public:
val(prometheus_prefix, sstring, "scylla", Used, "Set the prefix of the exported Prometheus metrics. Changing this will break Scylla's dashboard compatibility, do not change unless you know what you are doing.") \
val(abort_on_lsa_bad_alloc, bool, false, Used, "Abort when allocation in LSA region fails") \
val(murmur3_partitioner_ignore_msb_bits, unsigned, 0, Used, "Number of most significant token bits to ignore in murmur3 partitioner; increase for very large clusters") \
val(virtual_dirty_soft_limit, double, 0.6, Used, "Soft limit of virtual dirty memory expressed as a portion of the hard limit") \
/* done! */
#define _make_value_member(name, type, deflt, status, desc, ...) \

View File

@@ -167,6 +167,14 @@ inline void assure_sufficient_live_nodes(
const PendingRange& pending_endpoints = std::array<gms::inet_address, 0>()) {
size_t need = block_for(ks, cl);
auto adjust_live_for_error = [] (size_t live, size_t pending) {
// DowngradingConsistencyRetryPolicy uses alive replicas count from Unavailable
// exception to adjust CL for retry. When pending node is present CL is increased
// by 1 internally, so reported number of live nodes has to be adjusted to take
// this into account
return pending <= live ? live - pending : 0;
};
switch (cl) {
case consistency_level::ANY:
// local hint is acceptable, and local node is always live
@@ -181,7 +189,7 @@ inline void assure_sufficient_live_nodes(
size_t pending = count_local_endpoints(pending_endpoints);
if (local_live < need + pending) {
cl_logger.debug("Local replicas {} are insufficient to satisfy LOCAL_QUORUM requirement of needed {} and pending {}", live_endpoints, local_live, pending);
throw exceptions::unavailable_exception(cl, need, local_live);
throw exceptions::unavailable_exception(cl, need, adjust_live_for_error(local_live, pending));
}
break;
}
@@ -195,7 +203,7 @@ inline void assure_sufficient_live_nodes(
size_t pending = pending_endpoints.size();
if (live < need + pending) {
cl_logger.debug("Live nodes {} do not satisfy ConsistencyLevel ({} required, {} pending)", live, need, pending);
throw exceptions::unavailable_exception(cl, need, live);
throw exceptions::unavailable_exception(cl, need, adjust_live_for_error(live, pending));
}
break;
}

View File

@@ -77,6 +77,15 @@ namespace schema_tables {
logging::logger logger("schema_tables");
struct push_back_and_return {
std::vector<mutation> muts;
std::vector<mutation> operator()(mutation&& m) {
muts.emplace_back(std::move(m));
return std::move(muts);
}
};
struct qualified_name {
sstring keyspace_name;
sstring table_name;
@@ -547,6 +556,14 @@ read_schema_partition_for_table(distributed<service::storage_proxy>& proxy, sche
return query_partition_mutation(proxy.local(), std::move(schema), std::move(cmd), std::move(keyspace_key));
}
future<mutation>
read_keyspace_mutation(distributed<service::storage_proxy>& proxy, const sstring& keyspace_name) {
schema_ptr s = keyspaces();
auto key = partition_key::from_singular(*s, keyspace_name);
auto cmd = make_lw_shared<query::read_command>(s->id(), s->version(), query::full_slice);
return query_partition_mutation(proxy.local(), std::move(s), std::move(cmd), std::move(key));
}
static semaphore the_merge_lock {1};
future<> merge_lock() {
@@ -832,39 +849,6 @@ static inline void collect_types(std::set<sstring>& keys, schema_result& result,
}
}
static inline void ensure_type_is_unused(distributed<service::storage_proxy>& proxy, user_type type)
{
// We don't want to drop a type unless it's not used anymore (mainly because
// if someone drops a type and recreates one with the same name but different
// definition with the previous name still in use, things can get messy).
// We have two places to check: 1) other user type that can nest the one
// we drop and 2) existing tables referencing the type (maybe in a nested
// way).
auto&& keyspace = type->_keyspace;
auto&& name = type->_name;
auto&& db = proxy.local().get_db().local();
auto&& ks = db.find_keyspace(type->_keyspace);
for (auto&& ut : ks.metadata()->user_types()->get_all_types() | boost::adaptors::map_values) {
if (ut->_keyspace == keyspace && ut->_name == name) {
continue;
}
if (ut->references_user_type(keyspace, name)) {
throw exceptions::invalid_request_exception(sprint("Cannot drop user type %s.%s as it is still used by user type %s", keyspace, type->get_name_as_string(), ut->get_name_as_string()));
}
}
for (auto&& cfm : ks.metadata()->cf_meta_data() | boost::adaptors::map_values) {
for (auto&& col : cfm->all_columns() | boost::adaptors::map_values) {
if (col->type->references_user_type(keyspace, name)) {
throw exceptions::invalid_request_exception(sprint("Cannot drop user type %s.%s as it is still used by table %s.%s", keyspace, type->get_name_as_string(), cfm->ks_name(), cfm->cf_name()));
}
}
}
}
// see the comments for merge_keyspaces()
static void merge_types(distributed<service::storage_proxy>& proxy, schema_result&& before, schema_result&& after)
{
@@ -898,10 +882,6 @@ static void merge_types(distributed<service::storage_proxy>& proxy, schema_resul
}
}
for (auto&& ut : dropped) {
ensure_type_is_unused(proxy, ut);
}
proxy.local().get_db().invoke_on_all([&created, &dropped, &altered] (database& db) {
return seastar::async([&] {
for (auto&& type : created) {
@@ -1182,19 +1162,18 @@ void add_type_to_schema_mutation(user_type type, api::timestamp_type timestamp,
mutations.emplace_back(std::move(m));
}
std::vector<mutation> make_create_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp)
future<std::vector<mutation>> make_create_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp)
{
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
std::vector<mutation> mutations;
add_type_to_schema_mutation(type, timestamp, mutations);
return mutations;
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
}
std::vector<mutation> make_drop_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp)
future<std::vector<mutation>> make_drop_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp)
{
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
std::vector<mutation> mutations;
schema_ptr s = usertypes();
auto pkey = partition_key::from_singular(*s, type->_keyspace);
auto ckey = clustering_key::from_singular(*s, type->get_name_as_string());
@@ -1202,19 +1181,21 @@ std::vector<mutation> make_drop_type_mutations(lw_shared_ptr<keyspace_metadata>
m.partition().apply_delete(*s, ckey, tombstone(timestamp, gc_clock::now()));
mutations.emplace_back(std::move(m));
return mutations;
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
}
/*
* Table metadata serialization/deserialization.
*/
std::vector<mutation> make_create_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp)
future<std::vector<mutation>> make_create_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp)
{
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
std::vector<mutation> mutations;
add_table_or_view_to_schema_mutation(table, timestamp, true, mutations);
return mutations;
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
}
static schema_mutations make_table_mutations(schema_ptr table, api::timestamp_type timestamp, bool with_columns_and_triggers)
@@ -1347,15 +1328,13 @@ static void make_update_columns_mutations(schema_ptr old_table,
mutations.emplace_back(std::move(columns_mutation));
}
std::vector<mutation> make_update_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace,
future<std::vector<mutation>> make_update_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace,
schema_ptr old_table,
schema_ptr new_table,
api::timestamp_type timestamp,
bool from_thrift)
{
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
std::vector<mutation> mutations;
add_table_or_view_to_schema_mutation(new_table, timestamp, false, mutations);
make_update_columns_mutations(std::move(old_table), std::move(new_table), timestamp, from_thrift, mutations);
@@ -1373,7 +1352,8 @@ std::vector<mutation> make_update_table_mutations(lw_shared_ptr<keyspace_metadat
addTriggerToSchemaMutation(newTable, trigger, timestamp, mutation);
#endif
return mutations;
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
}
static void make_drop_table_or_view_mutations(schema_ptr schema_table,
@@ -1390,10 +1370,9 @@ static void make_drop_table_or_view_mutations(schema_ptr schema_table,
}
}
std::vector<mutation> make_drop_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp)
future<std::vector<mutation>> make_drop_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp)
{
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
std::vector<mutation> mutations;
make_drop_table_or_view_mutations(columnfamilies(), std::move(table), timestamp, mutations);
#if 0
@@ -1405,7 +1384,8 @@ std::vector<mutation> make_drop_table_mutations(lw_shared_ptr<keyspace_metadata>
for (String indexName : Keyspace.open(keyspace.name).getColumnFamilyStore(table.cfName).getBuiltIndexes())
indexCells.addTombstone(indexCells.getComparator().makeCellName(indexName), ldt, timestamp);
#endif
return mutations;
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
}
static future<schema_mutations> read_table_mutations(distributed<service::storage_proxy>& proxy, const qualified_name& table, schema_ptr s)
@@ -1481,12 +1461,16 @@ future<schema_ptr> create_table_from_table_row(distributed<service::storage_prox
return create_table_from_name(proxy, ks_name, cf_name);
}
void prepare_builder_from_table_row(schema_builder& builder, const query::result_set_row& table_row)
void prepare_builder_from_table_row(schema_builder& builder, const query::result_set_row& table_row, bool is_dense)
{
auto comparator = table_row.get_nonnull<sstring>("comparator");
bool is_compound = cell_comparator::check_compound(comparator);
builder.set_is_compound(is_compound);
if (!is_compound && !is_dense) { // For thrift dynamic tables, the comparator type is encoded in the clustering keys
auto regular_column_name_type = db::marshal::type_parser::parse(comparator);
builder.set_regular_column_name_type(regular_column_name_type);
}
cell_comparator::read_collections(builder, comparator);
if (table_row.has("read_repair_chance")) {
@@ -1602,13 +1586,6 @@ schema_ptr create_table_from_mutations(schema_mutations sm, std::experimental::o
AbstractType<?> fullRawComparator = CFMetaData.makeRawAbstractType(rawComparator, subComparator);
#endif
std::vector<column_definition> column_defs = create_columns_from_column_rows(
query::result_set(sm.columns_mutation()),
ks_name,
cf_name,/*,
fullRawComparator, */
cf == cf_type::super);
bool is_dense;
if (table_row.has("is_dense")) {
is_dense = table_row.get_nonnull<bool>("is_dense");
@@ -1617,6 +1594,16 @@ schema_ptr create_table_from_mutations(schema_mutations sm, std::experimental::o
// is_dense = CFMetaData.calculateIsDense(fullRawComparator, columnDefs);
throw std::runtime_error(sprint("%s not implemented", __PRETTY_FUNCTION__));
}
builder.set_is_dense(is_dense);
prepare_builder_from_table_row(builder, table_row, is_dense);
std::vector<column_definition> column_defs = create_columns_from_column_rows(
query::result_set(sm.columns_mutation()),
ks_name,
cf_name,
builder.regular_column_name_type(),
cf == cf_type::super);
#if 0
CellNameType comparator = CellNames.fromAbstractType(fullRawComparator, isDense);
@@ -1628,9 +1615,6 @@ schema_ptr create_table_from_mutations(schema_mutations sm, std::experimental::o
CFMetaData cfm = new CFMetaData(ksName, cfName, cfType, comparator, cfId);
#endif
builder.set_is_dense(is_dense);
prepare_builder_from_table_row(builder, table_row);
for (auto&& cdef : column_defs) {
builder.with_column(cdef);
@@ -1662,7 +1646,8 @@ void add_column_to_schema_mutation(schema_ptr table,
api::timestamp_type timestamp,
mutation& m)
{
auto ckey = clustering_key::from_exploded(*m.schema(), {utf8_type->decompose(table->cf_name()), column.name()});
auto ckey = clustering_key::from_exploded(*m.schema(), {utf8_type->decompose(table->cf_name()),
utf8_type->decompose(column.name_as_text())});
m.set_clustered_cell(ckey, "validator", column.type->name(), timestamp);
m.set_clustered_cell(ckey, "type", serialize_kind(column.kind), timestamp);
if (!column.is_on_all_components()) {
@@ -1714,21 +1699,21 @@ void drop_column_from_schema_mutation(schema_ptr table, const column_definition&
std::vector<column_definition> create_columns_from_column_rows(const query::result_set& rows,
const sstring& keyspace,
const sstring& table, /*,
AbstractType<?> rawComparator, */
const sstring& table,
data_type regular_column_name_type,
bool is_super)
{
std::vector<column_definition> columns;
for (auto&& row : rows.rows()) {
columns.emplace_back(std::move(create_column_from_column_row(row, keyspace, table, /*, rawComparator, */ is_super)));
columns.emplace_back(std::move(create_column_from_column_row(row, keyspace, table, regular_column_name_type, is_super)));
}
return columns;
}
column_definition create_column_from_column_row(const query::result_set_row& row,
sstring keyspace,
sstring table, /*,
AbstractType<?> rawComparator, */
sstring table,
data_type regular_column_name_type,
bool is_super)
{
auto kind = deserialize_kind(row.get_nonnull<sstring>("type"));
@@ -1744,13 +1729,8 @@ column_definition create_column_from_column_row(const query::result_set_row& row
componentIndex = 1; // A ColumnDefinition for super columns applies to the column component
#endif
#if 0
// Note: we save the column name as string, but we should not assume that it is an UTF8 name;
// we need to use the comparator fromString method
AbstractType<?> comparator = kind == ColumnDefinition.Kind.REGULAR
? getComponentComparator(rawComparator, componentIndex)
: UTF8Type.instance;
#endif
auto comparator = kind == column_kind::regular_column ? regular_column_name_type : utf8_type;
auto name_opt = row.get<sstring>("column_name");
sstring name = name_opt ? *name_opt : sstring();
@@ -1769,8 +1749,7 @@ column_definition create_column_from_column_row(const query::result_set_row& row
if (row.has("index_name"))
indexName = row.getString("index_name");
#endif
auto c = column_definition{utf8_type->decompose(name), validator, kind, component_index};
return c;
return column_definition{comparator->from_string(name), validator, kind, component_index};
}
/*
@@ -1788,7 +1767,7 @@ view_ptr create_view_from_mutations(schema_mutations sm, std::experimental::opti
schema_builder builder{ks_name, cf_name, id};
prepare_builder_from_table_row(builder, row);
auto column_defs = create_columns_from_column_rows(query::result_set(sm.columns_mutation()), ks_name, cf_name, false);
auto column_defs = create_columns_from_column_rows(query::result_set(sm.columns_mutation()), ks_name, cf_name, utf8_type, false);
for (auto&& cdef : column_defs) {
builder.with_column(cdef);
}
@@ -1899,37 +1878,39 @@ schema_mutations make_schema_mutations(schema_ptr s, api::timestamp_type timesta
return s->is_view() ? make_view_mutations(view_ptr(s), timestamp, with_columns) : make_table_mutations(s, timestamp, with_columns);
}
std::vector<mutation> make_create_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp)
future<std::vector<mutation>> make_create_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp)
{
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
std::vector<mutation> mutations;
// And also the serialized base table.
auto base = keyspace->cf_meta_data().at(view->view_info()->base_name());
add_table_or_view_to_schema_mutation(base, timestamp, true, mutations);
add_table_or_view_to_schema_mutation(view, timestamp, true, mutations);
return mutations;
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
}
std::vector<mutation> make_update_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace,
future<std::vector<mutation>> make_update_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace,
view_ptr old_view,
view_ptr new_view,
api::timestamp_type timestamp)
{
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
std::vector<mutation> mutations;
// And also the serialized base table.
auto base = keyspace->cf_meta_data().at(new_view->view_info()->base_name());
add_table_or_view_to_schema_mutation(base, timestamp, true, mutations);
add_table_or_view_to_schema_mutation(new_view, timestamp, false, mutations);
make_update_columns_mutations(old_view, new_view, timestamp, false, mutations);
return mutations;
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
}
std::vector<mutation> make_drop_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp) {
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
future<std::vector<mutation>> make_drop_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp) {
std::vector<mutation> mutations;
make_drop_table_or_view_mutations(views(), view, timestamp, mutations);
return mutations;
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
}
#if 0

View File

@@ -80,6 +80,7 @@ future<std::vector<frozen_mutation>> convert_schema_to_mutations(distributed<ser
future<schema_result_value_type>
read_schema_partition_for_keyspace(distributed<service::storage_proxy>& proxy, const sstring& schema_table_name, const sstring& keyspace_name);
future<mutation> read_keyspace_mutation(distributed<service::storage_proxy>&, const sstring& keyspace_name);
future<> merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations);
@@ -95,17 +96,17 @@ std::vector<mutation> make_drop_keyspace_mutations(lw_shared_ptr<keyspace_metada
lw_shared_ptr<keyspace_metadata> create_keyspace_from_schema_partition(const schema_result_value_type& partition);
std::vector<mutation> make_create_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp);
future<std::vector<mutation>> make_create_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp);
std::vector<user_type> create_types_from_schema_partition(const schema_result_value_type& result);
std::vector<mutation> make_drop_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp);
future<std::vector<mutation>> make_drop_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp);
void add_type_to_schema_mutation(user_type type, api::timestamp_type timestamp, std::vector<mutation>& mutations);
std::vector<mutation> make_create_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp);
future<std::vector<mutation>> make_create_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp);
std::vector<mutation> make_update_table_mutations(
future<std::vector<mutation>> make_update_table_mutations(
lw_shared_ptr<keyspace_metadata> keyspace,
schema_ptr old_table,
schema_ptr new_table,
@@ -114,13 +115,13 @@ std::vector<mutation> make_update_table_mutations(
future<std::map<sstring, schema_ptr>> create_tables_from_tables_partition(distributed<service::storage_proxy>& proxy, const schema_result::mapped_type& result);
std::vector<mutation> make_drop_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp);
future<std::vector<mutation>> make_drop_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp);
future<schema_ptr> create_table_from_name(distributed<service::storage_proxy>& proxy, const sstring& keyspace, const sstring& table);
future<schema_ptr> create_table_from_table_row(distributed<service::storage_proxy>& proxy, const query::result_set_row& row);
void prepare_builder_from_table_row(schema_builder& builder, const query::result_set_row& table_row);
void prepare_builder_from_table_row(schema_builder& builder, const query::result_set_row& table_row, bool is_dense = false);
schema_ptr create_table_from_mutations(schema_mutations, std::experimental::optional<table_schema_version> version = {});
@@ -128,14 +129,14 @@ void drop_column_from_schema_mutation(schema_ptr table, const column_definition&
std::vector<column_definition> create_columns_from_column_rows(const query::result_set& rows,
const sstring& keyspace,
const sstring& table,/*,
AbstractType<?> rawComparator, */
const sstring& table,
data_type regular_column_name_type,
bool is_super);
column_definition create_column_from_column_row(const query::result_set_row& row,
sstring keyspace,
sstring table, /*,
AbstractType<?> rawComparator, */
sstring table,
data_type regular_column_name_type,
bool is_super);
@@ -149,11 +150,11 @@ schema_mutations make_schema_mutations(schema_ptr s, api::timestamp_type timesta
void add_table_or_view_to_schema_mutation(schema_ptr view, api::timestamp_type timestamp, bool with_columns, std::vector<mutation>& mutations);
std::vector<mutation> make_create_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp);
future<std::vector<mutation>> make_create_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp);
std::vector<mutation> make_update_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr old_view, view_ptr new_view, api::timestamp_type timestamp);
future<std::vector<mutation>> make_update_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr old_view, view_ptr new_view, api::timestamp_type timestamp);
std::vector<mutation> make_drop_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp);
future<std::vector<mutation>> make_drop_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp);
sstring serialize_kind(column_kind kind);
column_kind deserialize_kind(sstring kind);

View File

@@ -21,6 +21,7 @@
#include "byte_ordered_partitioner.hh"
#include "utils/class_registrator.hh"
#include "utils/div_ceil.hh"
#include <boost/multiprecision/cpp_int.hpp>
#include <boost/multiprecision/cpp_dec_float.hpp>
@@ -162,22 +163,17 @@ byte_ordered_partitioner::shard_of(const token& t) const {
}
token
byte_ordered_partitioner::token_for_next_shard(const token& t) const {
byte_ordered_partitioner::token_for_next_shard(const token& t, shard_id shard, unsigned spans) const {
switch (t._kind) {
case token::kind::before_all_keys:
return token_for_next_shard(token(token::kind::key, managed_bytes{int8_t(0)}));
case token::kind::after_all_keys:
return maximum_token();
case token::kind::before_all_keys:
case token::kind::key:
auto s = shard_of(t) + 1;
if (s == _shard_count) {
auto orig = shard_of(t);
if (shard <= orig || spans != 1) {
return maximum_token();
}
auto e = (s << 8) / _shard_count;
// Division truncates; adjust
while (((e * _shard_count) >> 8) != s) {
++e;
}
auto e = div_ceil(shard << 8, _shard_count);
return token(token::kind::key, managed_bytes({int8_t(e)}));
}
assert(0);

View File

@@ -29,10 +29,9 @@
namespace dht {
class byte_ordered_partitioner final : public i_partitioner {
unsigned _shard_count;
public:
byte_ordered_partitioner(unsigned shard_count = smp::count, unsigned ignore_msb = 0) : _shard_count(shard_count) {}
virtual const sstring name() { return "org.apache.cassandra.dht.ByteOrderedPartitioner"; }
byte_ordered_partitioner(unsigned shard_count = smp::count, unsigned ignore_msb = 0) : i_partitioner(shard_count) {}
virtual const sstring name() const { return "org.apache.cassandra.dht.ByteOrderedPartitioner"; }
virtual token get_token(const schema& s, partition_key_view key) override {
auto&& legacy = key.legacy_form(s);
return token(token::kind::key, bytes(legacy.begin(), legacy.end()));
@@ -75,7 +74,7 @@ public:
}
}
virtual unsigned shard_of(const token& t) const override;
virtual token token_for_next_shard(const token& t) const override;
virtual token token_for_next_shard(const token& t, shard_id shard, unsigned spans) const override;
};
}

View File

@@ -25,6 +25,7 @@
#include "utils/class_registrator.hh"
#include "types.hh"
#include "utils/murmur_hash.hh"
#include "utils/div_ceil.hh"
#include <boost/range/adaptor/map.hpp>
#include <boost/range/irange.hpp>
#include <boost/range/adaptor/transformed.hpp>
@@ -160,7 +161,7 @@ std::ostream& operator<<(std::ostream& out, const decorated_key& dk) {
}
// FIXME: make it per-keyspace
std::unique_ptr<i_partitioner> default_partitioner { new murmur3_partitioner };
std::unique_ptr<i_partitioner> default_partitioner;
void set_global_partitioner(const sstring& class_name, unsigned ignore_msb)
{
@@ -176,6 +177,9 @@ void set_global_partitioner(const sstring& class_name, unsigned ignore_msb)
i_partitioner&
global_partitioner() {
if (!default_partitioner) {
default_partitioner = std::make_unique<murmur3_partitioner>(smp::count, 12);
}
return *default_partitioner;
}
@@ -251,13 +255,35 @@ unsigned shard_of(const token& t) {
return global_partitioner().shard_of(t);
}
stdx::optional<dht::token_range>
selective_token_range_sharder::next() {
if (_done) {
return {};
}
while (_range.overlaps(dht::token_range(_start_boundary, {}), dht::token_comparator())
&& !(_start_boundary && _start_boundary->value() == maximum_token())) {
auto end_token = _partitioner.token_for_next_shard(_start_token, _next_shard);
auto candidate = dht::token_range(std::move(_start_boundary), range_bound<dht::token>(end_token, false));
auto intersection = _range.intersection(std::move(candidate), dht::token_comparator());
_start_token = _partitioner.token_for_next_shard(end_token, _shard);
_start_boundary = range_bound<dht::token>(_start_token);
if (intersection) {
return *intersection;
}
}
_done = true;
return {};
}
stdx::optional<ring_position_range_and_shard>
ring_position_range_sharder::next(const schema& s) {
if (_done) {
return {};
}
auto shard = _range.start() ? shard_of(_range.start()->value().token()) : global_partitioner().shard_of_minimum_token();
auto shard_boundary_token = _partitioner.token_for_next_shard(_range.start() ? _range.start()->value().token() : minimum_token());
auto shard = _range.start() ? _partitioner.shard_of(_range.start()->value().token()) : _partitioner.shard_of_minimum_token();
auto next_shard = shard + 1 < _partitioner.shard_count() ? shard + 1 : 0;
auto shard_boundary_token = _partitioner.token_for_next_shard(_range.start() ? _range.start()->value().token() : minimum_token(), next_shard);
auto shard_boundary = ring_position::starting_at(shard_boundary_token);
if ((!_range.end() || shard_boundary.less_compare(s, _range.end()->value()))
&& shard_boundary_token != maximum_token()) {
@@ -273,6 +299,96 @@ ring_position_range_sharder::next(const schema& s) {
return ring_position_range_and_shard{std::move(_range), shard};
}
ring_position_exponential_sharder::ring_position_exponential_sharder(const i_partitioner& partitioner, partition_range pr)
: _partitioner(partitioner)
, _range(std::move(pr))
, _last_ends(_partitioner.shard_count()) {
if (_range.start()) {
_first_shard = _next_shard = _partitioner.shard_of(_range.start()->value().token());
}
}
ring_position_exponential_sharder::ring_position_exponential_sharder(partition_range pr)
: ring_position_exponential_sharder(global_partitioner(), std::move(pr)) {
}
stdx::optional<ring_position_exponential_sharder_result>
ring_position_exponential_sharder::next(const schema& s) {
auto ret = ring_position_exponential_sharder_result{};
ret.per_shard_ranges.reserve(std::min(_spans_per_iteration, _partitioner.shard_count()));
ret.inorder = _spans_per_iteration <= _partitioner.shard_count();
unsigned spans_to_go = _spans_per_iteration;
auto cmp = ring_position_comparator(s);
auto spans_per_shard = _spans_per_iteration / _partitioner.shard_count();
auto shards_with_extra_span = _spans_per_iteration % _partitioner.shard_count();
auto first_shard = _next_shard;
_next_shard = (_next_shard + _spans_per_iteration) % _partitioner.shard_count();
for (auto i : boost::irange(0u, std::min(_partitioner.shard_count(), _spans_per_iteration))) {
auto shard = (first_shard + i) % _partitioner.shard_count();
if (_last_ends[shard] && *_last_ends[shard] == maximum_token()) {
continue;
}
range_bound<ring_position> this_shard_start = [&] {
if (_last_ends[shard]) {
return range_bound<ring_position>(ring_position::starting_at(*_last_ends[shard]));
} else {
return _range.start().value_or(range_bound<ring_position>(ring_position::starting_at(minimum_token())));
}
}();
// token_for_next_shard() may give us the wrong boundary on the first pass, so add an extra span:
auto extra_span = !_last_ends[shard] && shard != _first_shard;
auto spans = spans_per_shard + unsigned(i < shards_with_extra_span);
auto boundary = _partitioner.token_for_next_shard(this_shard_start.value().token(), shard, spans + extra_span);
auto proposed_range = partition_range(this_shard_start, range_bound<ring_position>(ring_position::starting_at(boundary), false));
auto intersection = _range.intersection(proposed_range, cmp);
if (!intersection) {
continue;
}
spans_to_go -= spans;
auto this_shard_result = ring_position_range_and_shard{std::move(*intersection), shard};
_last_ends[shard] = boundary;
ret.per_shard_ranges.push_back(std::move(this_shard_result));
}
if (ret.per_shard_ranges.empty()) {
return stdx::nullopt;
}
_spans_per_iteration *= 2;
return stdx::make_optional(std::move(ret));
}
ring_position_exponential_vector_sharder::ring_position_exponential_vector_sharder(const std::vector<nonwrapping_range<ring_position>>& ranges)
: _ranges(std::begin(ranges), std::end(ranges)) {
if (!_ranges.empty()) {
_current_sharder.emplace(_ranges.front());
_ranges.pop_front();
++_element;
}
}
stdx::optional<ring_position_exponential_vector_sharder_result>
ring_position_exponential_vector_sharder::next(const schema& s) {
if (!_current_sharder) {
return stdx::nullopt;
}
while (true) { // yuch
auto ret = _current_sharder->next(s);
if (ret) {
auto augmented = ring_position_exponential_vector_sharder_result{std::move(*ret), _element};
return stdx::make_optional(std::move(augmented));
}
if (_ranges.empty()) {
_current_sharder = stdx::nullopt;
return stdx::nullopt;
}
_current_sharder.emplace(_ranges.front());
_ranges.pop_front();
++_element;
}
}
ring_position_range_vector_sharder::ring_position_range_vector_sharder(dht::partition_range_vector ranges)
: _ranges(std::move(ranges))
, _current_range(_ranges.begin()) {
@@ -300,6 +416,33 @@ int ring_position_comparator::operator()(const ring_position& lh, const ring_pos
return lh.tri_compare(s, rh);
}
std::vector<partition_range>
split_range_to_single_shard(const i_partitioner& partitioner, const schema& s, const partition_range& pr, shard_id shard) {
auto cmp = ring_position_comparator(s);
auto ret = std::vector<partition_range>();
auto next_shard = shard + 1 == partitioner.shard_count() ? 0 : shard + 1;
auto start_token = pr.start() ? pr.start()->value().token() : minimum_token();
auto start_shard = partitioner.shard_of(start_token);
auto start_boundary = start_shard == shard ? pr.start() : range_bound<ring_position>(ring_position::starting_at(partitioner.token_for_next_shard(start_token, shard)));
while (pr.overlaps(partition_range(start_boundary, {}), cmp)
&& !(start_boundary && start_boundary->value().token() == maximum_token())) {
auto end_token = partitioner.token_for_next_shard(start_token, next_shard);
auto candidate = partition_range(std::move(start_boundary), range_bound<ring_position>(ring_position::starting_at(end_token), false));
auto intersection = pr.intersection(std::move(candidate), cmp);
if (intersection) {
ret.push_back(std::move(*intersection));
}
start_token = partitioner.token_for_next_shard(end_token, shard);
start_boundary = range_bound<ring_position>(ring_position::starting_at(start_token));
}
return ret;
}
std::vector<partition_range>
split_range_to_single_shard(const schema& s, const partition_range& pr, shard_id shard) {
return split_range_to_single_shard(global_partitioner(), s, pr, shard);
}
int token_comparator::operator()(const token& t1, const token& t2) const {
return tri_compare(t1, t2);
}


@@ -180,7 +180,10 @@ public:
using decorated_key_opt = std::experimental::optional<decorated_key>;
class i_partitioner {
protected:
unsigned _shard_count;
public:
explicit i_partitioner(unsigned shard_count) : _shard_count(shard_count) {}
virtual ~i_partitioner() {}
/**
@@ -272,7 +275,7 @@ public:
/**
* @return name of partitioner.
*/
virtual const sstring name() = 0;
virtual const sstring name() const = 0;
/**
* Calculates the shard that handles a particular token.
@@ -280,9 +283,17 @@ public:
virtual unsigned shard_of(const token& t) const = 0;
/**
* Gets the first token greater than `t` that is not in the same shard as `t`.
* Gets the first token greater than `t` that is in shard `shard`, and is a shard boundary (its first token).
*
* If the `spans` parameter is greater than zero, the result is the same as if the function
* is called `spans` times, each time applied to its return value, but efficiently. This allows
* selecting ranges that include multiple round trips around the 0..smp::count-1 shard span:
*
* token_for_next_shard(t, shard, spans) == token_for_next_shard(token_for_next_shard(t, shard, 1), shard, spans - 1)
*
* On overflow, maximum_token() is returned.
*/
virtual token token_for_next_shard(const token& t) const = 0;
virtual token token_for_next_shard(const token& t, shard_id shard, unsigned spans = 1) const = 0;
/**
* Gets the first shard of the minimum token.
@@ -315,6 +326,13 @@ public:
return tri_compare(t1, t2) < 0;
}
/**
* @return number of shards configured for this partitioner
*/
unsigned shard_count() const {
return _shard_count;
}
friend bool operator==(const token& t1, const token& t2);
friend bool operator<(const token& t1, const token& t2);
friend int tri_compare(const token& t1, const token& t2);
@@ -476,6 +494,44 @@ struct ring_position_range_and_shard_and_element : ring_position_range_and_shard
unsigned element;
};
struct ring_position_exponential_sharder_result {
std::vector<ring_position_range_and_shard> per_shard_ranges;
bool inorder = true;
};
// given a ring_position range, generates exponentially increasing
// sets of per-shard sub-ranges
class ring_position_exponential_sharder {
const i_partitioner& _partitioner;
partition_range _range;
unsigned _spans_per_iteration = 1;
unsigned _first_shard = 0;
unsigned _next_shard = 0;
std::vector<stdx::optional<token>> _last_ends; // index = shard
public:
explicit ring_position_exponential_sharder(partition_range pr);
explicit ring_position_exponential_sharder(const i_partitioner& partitioner, partition_range pr);
stdx::optional<ring_position_exponential_sharder_result> next(const schema& s);
};
struct ring_position_exponential_vector_sharder_result : ring_position_exponential_sharder_result {
ring_position_exponential_vector_sharder_result(ring_position_exponential_sharder_result rpesr, unsigned element)
: ring_position_exponential_sharder_result(std::move(rpesr)), element(element) {}
unsigned element; // range within vector from which this result came
};
// given a vector of sorted, disjoint ring_position ranges, generates exponentially increasing
// sets of per-shard sub-ranges. May be non-exponential when moving from one ring position range to another.
class ring_position_exponential_vector_sharder {
std::deque<nonwrapping_range<ring_position>> _ranges;
stdx::optional<ring_position_exponential_sharder> _current_sharder;
unsigned _element = 0;
public:
explicit ring_position_exponential_vector_sharder(const std::vector<nonwrapping_range<ring_position>>& ranges);
stdx::optional<ring_position_exponential_vector_sharder_result> next(const schema& s);
};
class ring_position_range_vector_sharder {
using vec_type = dht::partition_range_vector;
vec_type _ranges;
@@ -504,6 +560,33 @@ split_range_to_shards(dht::partition_range pr, const schema& s);
std::map<unsigned, dht::partition_range_vector>
split_ranges_to_shards(const dht::token_range_vector& ranges, const schema& s);
// Intersect a partition_range with a shard and return the resulting sub-ranges, in sorted order
std::vector<partition_range> split_range_to_single_shard(const schema& s, const dht::partition_range& pr, shard_id shard);
std::vector<partition_range> split_range_to_single_shard(const i_partitioner& partitioner, const schema& s, const dht::partition_range& pr, shard_id shard);
class selective_token_range_sharder {
const i_partitioner& _partitioner;
dht::token_range _range;
shard_id _shard;
bool _done = false;
shard_id _next_shard;
dht::token _start_token;
stdx::optional<range_bound<dht::token>> _start_boundary;
public:
explicit selective_token_range_sharder(dht::token_range range, shard_id shard)
: selective_token_range_sharder(global_partitioner(), std::move(range), shard) {}
selective_token_range_sharder(const i_partitioner& partitioner, dht::token_range range, shard_id shard)
: _partitioner(partitioner)
, _range(std::move(range))
, _shard(shard)
, _next_shard(_shard + 1 == _partitioner.shard_count() ? 0 : _shard + 1)
, _start_token(_range.start() ? _range.start()->value() : minimum_token())
, _start_boundary(_partitioner.shard_of(_start_token) == shard ?
_range.start() : range_bound<dht::token>(_partitioner.token_for_next_shard(_start_token, shard))) {
}
stdx::optional<dht::token_range> next();
};
} // dht
namespace std {


@@ -24,9 +24,40 @@
#include "sstables/key.hh"
#include "utils/class_registrator.hh"
#include <boost/lexical_cast.hpp>
#include <boost/range/irange.hpp>
namespace dht {
inline
unsigned
murmur3_partitioner::zero_based_shard_of(uint64_t token, unsigned shards, unsigned sharding_ignore_msb_bits) {
// This is the master function, the inverses have to match it wrt. rounding errors.
token <<= sharding_ignore_msb_bits;
// Treat "token" as a fraction in the interval [0, 1); compute:
// shard = floor((0.token) * shards)
return (uint128_t(token) * shards) >> 64;
}
std::vector<uint64_t>
murmur3_partitioner::init_zero_based_shard_start(unsigned shards, unsigned sharding_ignore_msb_bits) {
// computes the inverse of zero_based_shard_of(): ret[s] is the smallest token that belongs to shard s
if (shards == 1) {
// Avoid the while loops below getting confused finding the "edge" between two nonexistent shards
return std::vector<uint64_t>(1, uint64_t(0));
}
auto ret = std::vector<uint64_t>(shards);
for (auto s : boost::irange<unsigned>(0, shards)) {
uint64_t token = (uint128_t(s) << 64) / shards;
token >>= sharding_ignore_msb_bits; // leftmost bits are ignored by zero_based_shard_of
// token is the start of the next shard, and can be slightly before due to rounding errors; adjust
while (zero_based_shard_of(token, shards, sharding_ignore_msb_bits) != s) {
++token;
}
ret[s] = token;
}
return ret;
}
inline
int64_t
murmur3_partitioner::normalize(int64_t in) {
@@ -88,6 +119,16 @@ inline int64_t long_token(const token& t) {
return net::ntoh(*lp);
}
uint64_t
murmur3_partitioner::unbias(const token& t) const {
return uint64_t(long_token(t)) + uint64_t(std::numeric_limits<int64_t>::min());
}
token
murmur3_partitioner::bias(uint64_t n) const {
return get_token(n - uint64_t(std::numeric_limits<int64_t>::min()));
}
sstring murmur3_partitioner::to_sstring(const token& t) const {
return ::to_sstring(long_token(t));
}
@@ -210,46 +251,43 @@ murmur3_partitioner::shard_of(const token& t) const {
case token::kind::after_all_keys:
return _shard_count - 1;
case token::kind::key:
int64_t l = long_token(t);
// treat l as a fraction between 0 and 1 and use 128-bit arithmetic to
// divide that range evenly among shards:
uint64_t adjusted = uint64_t(l) + uint64_t(std::numeric_limits<int64_t>::min());
adjusted <<= _sharding_ignore_msb_bits;
return (__int128(adjusted) * _shard_count) >> 64;
uint64_t adjusted = unbias(t);
return zero_based_shard_of(adjusted, _shard_count, _sharding_ignore_msb_bits);
}
assert(0);
}
token
murmur3_partitioner::token_for_next_shard(const token& t) const {
murmur3_partitioner::token_for_next_shard(const token& t, shard_id shard, unsigned spans) const {
uint64_t n = 0;
switch (t._kind) {
case token::kind::before_all_keys:
return token_for_next_shard(get_token(std::numeric_limits<int64_t>::min() + 1));
break;
case token::kind::after_all_keys:
return maximum_token();
case token::kind::key:
if (long_token(t) == std::numeric_limits<int64_t>::min()) {
return token_for_next_shard(get_token(std::numeric_limits<int64_t>::min() + 1));
}
using uint128 = unsigned __int128;
auto s = shard_of(t) + 1;
s = s < _shard_count ? s : 0;
int64_t l = long_token(t);
// treat l as a fraction between 0 and 1 and use 128-bit arithmetic to
// divide that range evenly among shards:
uint64_t adjusted = uint64_t(l) + uint64_t(std::numeric_limits<int64_t>::min());
auto mul = align_up(uint128(adjusted) * _shard_count + 1, uint128(1) << (64 - _sharding_ignore_msb_bits));
if (mul >> 64 == _shard_count) {
return maximum_token();
}
uint64_t e = mul / _shard_count;
while (((uint128(e << _sharding_ignore_msb_bits) * _shard_count) >> 64) != s) {
// division will round down, so correct for it
++e;
}
return get_token(e + uint64_t(std::numeric_limits<int64_t>::min()));
n = unbias(t);
break;
}
assert(0);
auto s = zero_based_shard_of(n, _shard_count, _sharding_ignore_msb_bits);
if (!_sharding_ignore_msb_bits) {
// This ought to be the same as the else branch, but avoids shifts by 64
n = _shard_start[shard];
if (spans > 1 || shard <= s) {
return maximum_token();
}
} else {
auto left_part = n >> (64 - _sharding_ignore_msb_bits);
left_part += spans - unsigned(shard > s);
if (left_part >= (1u << _sharding_ignore_msb_bits)) {
return maximum_token();
}
left_part <<= (64 - _sharding_ignore_msb_bits);
auto right_part = _shard_start[shard];
n = left_part | right_part;
}
return bias(n);
}


@@ -23,20 +23,21 @@
#include "i_partitioner.hh"
#include "bytes.hh"
#include <vector>
namespace dht {
class murmur3_partitioner final : public i_partitioner {
unsigned _shard_count;
unsigned _sharding_ignore_msb_bits;
std::vector<uint64_t> _shard_start = init_zero_based_shard_start(_shard_count, _sharding_ignore_msb_bits);
public:
murmur3_partitioner(unsigned shard_count = smp::count, unsigned sharding_ignore_msb_bits = 0)
: _shard_count(shard_count)
: i_partitioner(shard_count)
// if one shard, ignore sharding_ignore_msb_bits as they will just cause needless
// range breaks
, _sharding_ignore_msb_bits(shard_count > 1 ? sharding_ignore_msb_bits : 0) {
}
virtual const sstring name() { return "org.apache.cassandra.dht.Murmur3Partitioner"; }
virtual const sstring name() const { return "org.apache.cassandra.dht.Murmur3Partitioner"; }
virtual token get_token(const schema& s, partition_key_view key) override;
virtual token get_token(const sstables::key_view& key) override;
virtual token get_random_token() override;
@@ -50,11 +51,16 @@ public:
virtual dht::token from_bytes(bytes_view bytes) const override;
virtual unsigned shard_of(const token& t) const override;
virtual token token_for_next_shard(const token& t) const override;
virtual token token_for_next_shard(const token& t, shard_id shard, unsigned spans) const override;
private:
using uint128_t = unsigned __int128;
static int64_t normalize(int64_t in);
token get_token(bytes_view key);
token get_token(uint64_t value) const;
token bias(uint64_t value) const; // translate from a zero-based range
uint64_t unbias(const token& t) const; // translate to a zero-based range
static unsigned zero_based_shard_of(uint64_t zero_based_token, unsigned shards, unsigned sharding_ignore_msb_bits);
static std::vector<uint64_t> init_zero_based_shard_start(unsigned shards, unsigned sharding_ignore_msb_bits);
};


@@ -22,6 +22,7 @@
#include "md5_hasher.hh"
#include "random_partitioner.hh"
#include "utils/class_registrator.hh"
#include "utils/div_ceil.hh"
#include <boost/multiprecision/cpp_int.hpp>
namespace dht {
@@ -222,21 +223,20 @@ unsigned random_partitioner::shard_of(const token& t) const {
}
token
random_partitioner::token_for_next_shard(const token& t) const {
random_partitioner::token_for_next_shard(const token& t, shard_id shard, unsigned spans) const {
if (_shard_count == 1) {
return maximum_token();
}
switch (t._kind) {
case token::kind::after_all_keys:
return maximum_token();
case token::kind::before_all_keys:
case token::kind::key:
auto s = shard_of(t) + 1;
if (s == _shard_count) {
auto orig = shard_of(t);
if (shard <= orig || spans != 1) {
return maximum_token();
}
auto t = (boost::multiprecision::uint256_t(s) << 127) / _shard_count;
// division truncates, so adjust
while (((t * _shard_count) >> 127) != s) {
++t;
}
auto t = div_ceil(boost::multiprecision::uint256_t(shard) << 127, _shard_count);
return cppint_to_token(t.convert_to<boost::multiprecision::uint128_t>());
}
assert(0);


@@ -29,10 +29,9 @@
namespace dht {
class random_partitioner final : public i_partitioner {
unsigned _shard_count;
public:
random_partitioner(unsigned shard_count = smp::count, unsigned ignore_msb = 0) : _shard_count(shard_count) {}
virtual const sstring name() { return "org.apache.cassandra.dht.RandomPartitioner"; }
random_partitioner(unsigned shard_count = smp::count, unsigned ignore_msb = 0) : i_partitioner(shard_count) {}
virtual const sstring name() const { return "org.apache.cassandra.dht.RandomPartitioner"; }
virtual token get_token(const schema& s, partition_key_view key) override;
virtual token get_token(const sstables::key_view& key) override;
virtual token get_random_token() override;
@@ -46,7 +45,7 @@ public:
virtual dht::token from_sstring(const sstring& t) const override;
virtual dht::token from_bytes(bytes_view bytes) const override;
virtual unsigned shard_of(const token& t) const override;
virtual token token_for_next_shard(const token& t) const override;
virtual token token_for_next_shard(const token& t, shard_id shard, unsigned spans) const override;
private:
token get_token(bytes data);
};


@@ -0,0 +1 @@
options raid0 devices_discard_performance=Y


@@ -27,11 +27,11 @@ if [ -f /usr/bin/node_exporter ]; then
exit 1
fi
version=0.12.0
version=0.14.0
dir=/usr/lib/scylla/Prometheus/node_exporter
mkdir -p $dir
cd $dir
curl -L https://github.com/prometheus/node_exporter/releases/download/$version/node_exporter-$version.linux-amd64.tar.gz -o $dir/node_exporter-$version.linux-amd64.tar.gz
curl -L https://github.com/prometheus/node_exporter/releases/download/v$version/node_exporter-$version.linux-amd64.tar.gz -o $dir/node_exporter-$version.linux-amd64.tar.gz
tar -xvzf $dir/node_exporter-$version.linux-amd64.tar.gz
rm $dir/node_exporter-$version.linux-amd64.tar.gz
ln -s $dir/node_exporter-$version.linux-amd64/node_exporter /usr/bin


@@ -5,15 +5,20 @@
. /usr/lib/scylla/scylla_lib.sh
print_usage() {
echo "scylla-raid-setup --disks /dev/hda,/dev/hdb... --raiddev /dev/md0 --update-fstab"
echo "scylla-raid-setup --disks /dev/hda,/dev/hdb... --raiddev /dev/md0 --update-fstab --root /var/lib/scylla --volume-role [all|data|commitlog]"
echo " --disks specify disks for RAID"
echo " --raiddev MD device name for RAID"
echo " --update-fstab update /etc/fstab for RAID"
echo " --root specify the root of the tree"
echo " --volume-role specify how will this device be used (data, commitlog, or all)"
exit 1
}
RAID=/dev/md0
FSTAB=0
ROOT=/var/lib/scylla
ROLE="all"
while [ $# -gt 0 ]; do
case "$1" in
"--disks")
@@ -29,12 +34,37 @@ while [ $# -gt 0 ]; do
FSTAB=1
shift 1
;;
"--root")
ROOT="$2"
shift 2
;;
"--volume-role")
ROLE="$2"
shift 2
;;
*)
print_usage
;;
esac
done
ROOT=${ROOT%/}
case "$ROLE" in
"all")
MOUNT_AT=$ROOT
;;
"data")
MOUNT_AT="$ROOT/data"
;;
"commitlog")
MOUNT_AT="$ROOT/commitlog"
;;
*)
echo "Invalid role specified ($ROLE)"
print_usage
;;
esac
if [ "$DISKS" = "" ]; then
print_usage
fi
@@ -51,8 +81,8 @@ if [ -e $RAID ]; then
echo "$RAID is already in use"
exit 1
fi
if [ "`mount|grep /var/lib/scylla`" != "" ]; then
echo "/var/lib/scylla is already mounted"
if mountpoint -q $MOUNT_AT; then
echo "$MOUNT_AT is already mounted"
exit 1
fi
@@ -61,18 +91,32 @@ if is_debian_variant; then
else
yum -y install mdadm xfsprogs
fi
mdadm --create --verbose --force --run $RAID --level=0 -c1024 --raid-devices=$NR_DISK $DISKS
mkfs.xfs $RAID -f
echo "DEVICE $DISKS" > /etc/mdadm.conf
mdadm --detail --scan >> /etc/mdadm.conf
if [ "$ID" = "ubuntu" ] && [ "$VERSION_ID" = "14.04" ]; then
mdadm --create --verbose --force --run $RAID --level=0 -c1024 --raid-devices=$NR_DISK $DISKS
mkfs.xfs $RAID -f
else
for dsk in $DISKS; do
blkdiscard $dsk &
done
wait
mdadm --create --verbose --force --run $RAID --level=0 -c1024 --raid-devices=$NR_DISK $DISKS
mkfs.xfs $RAID -f -K
fi
mdadm --detail --scan > /etc/mdadm.conf
mkdir -p "$MOUNT_AT"
mount -t xfs -o noatime $RAID "$MOUNT_AT"
# create this unconditionally so we are more robust about ordering
# if the script is run multiple times. But must do after mount in case
# we are mounting the root
mkdir -p "$ROOT/data"
mkdir -p "$ROOT/commitlog"
mkdir -p "$ROOT/coredump"
chown scylla:scylla "$ROOT"
chown scylla:scylla "$ROOT"/*
if [ $FSTAB -ne 0 ]; then
UUID=`blkid $RAID | awk '{print $2}'`
echo "$UUID /var/lib/scylla xfs noatime 0 0" >> /etc/fstab
echo "$UUID $MOUNT_AT xfs noatime 0 0" >> /etc/fstab
fi
mount -t xfs -o noatime $RAID /var/lib/scylla
mkdir -p /var/lib/scylla/data
mkdir -p /var/lib/scylla/commitlog
mkdir -p /var/lib/scylla/coredump
chown scylla:scylla /var/lib/scylla/*
chown scylla:scylla /var/lib/scylla/


@@ -81,7 +81,7 @@ verify_package() {
}
list_block_devices() {
if lsblk --help | grep -q -e -p; then
if lsblk --help | grep -q -e '^\s*-p'; then
lsblk -pnr | awk '{ print $1 }'
else
ls -1 /dev/sd* /dev/hd* /dev/xvd* /dev/nvme* /dev/mapper/* 2>/dev/null|grep -v control
@@ -218,6 +218,9 @@ while [ $# -gt 0 ]; do
print_usage
shift 1
;;
*)
echo "Invalid option: $@"
print_usage
esac
done
@@ -267,21 +270,24 @@ if [ $ENABLE_SERVICE -eq 1 ]; then
printf "[housekeeping]\ncheck-version: True\n" > /etc/scylla.d/housekeeping.cfg
fi
if is_systemd; then
systemctl unmask scylla-housekeeping.timer
systemctl unmask scylla-housekeeping-daily.timer
systemctl unmask scylla-housekeeping-restart.timer
fi
else
if [ ! -f /etc/scylla.d/housekeeping.cfg ]; then
printf "[housekeeping]\ncheck-version: False\n" > /etc/scylla.d/housekeeping.cfg
fi
if is_systemd; then
systemctl mask scylla-housekeeping.timer
systemctl stop scylla-housekeeping.timer || true
systemctl mask scylla-housekeeping-daily.timer
systemctl mask scylla-housekeeping-restart.timer
systemctl stop scylla-housekeeping-daily.timer || true
systemctl stop scylla-housekeeping-restart.timer || true
fi
fi
fi
CUR_VERSION=`scylla --version` || true
if [ "$CUR_VERSION" != "" ] && [ "$UUID" != "" ]; then
if [ "$CUR_VERSION" != "" ]; then
NEW_VERSION=`sudo -u scylla /usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid version --version $CUR_VERSION --mode i` || true
if [ "$NEW_VERSION" != "" ]; then
echo $NEW_VERSION


@@ -5,7 +5,7 @@ kernel.sched_tunable_scaling = 0
kernel.sched_min_granularity_ns = 500000
# Don't delay unrelated workloads
kernel.sched_wakeup_granularity_ns = 500000
kernel.sched_wakeup_granularity_ns = 450000
# Schedule all tasks in this period
kernel.sched_latency_ns = 1000000


@@ -1,12 +1,12 @@
[Unit]
Description=Scylla Housekeeping
Description=Scylla Housekeeping daily mode
After=network.target
[Service]
Type=simple
User=scylla
Group=scylla
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg version --mode d
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files @@REPOFILES@@ version --mode d
[Install]
WantedBy=multi-user.target


@@ -0,0 +1,11 @@
[Unit]
Description=Run Scylla Housekeeping daily mode
After=scylla-server.service
BindsTo=scylla-server.service
[Timer]
OnActiveSec=1d
OnUnitActiveSec=1d
[Install]
WantedBy=timers.target


@@ -0,0 +1,12 @@
[Unit]
Description=Scylla Housekeeping restart mode
After=network.target
[Service]
Type=simple
User=scylla
Group=scylla
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files @@REPOFILES@@ version --mode r
[Install]
WantedBy=multi-user.target


@@ -1,12 +1,11 @@
[Unit]
Description=Run Scylla Housekeeping daily
Description=Run Scylla Housekeeping restart mode
After=scylla-server.service
BindsTo=scylla-server.service
[Timer]
# set OnActiveSec to 3 to safely avoid issues/1846
OnActiveSec=3
OnUnitActiveSec=1d
[Install]
WantedBy=timers.target


@@ -2,7 +2,8 @@
Description=Scylla Server
After=network.target
Wants=scylla-jmx.service
Wants=scylla-housekeeping.timer
Wants=scylla-housekeeping-restart.timer
Wants=scylla-housekeeping-daily.timer
[Service]
PermissionsStartOnly=true
@@ -21,6 +22,9 @@ KillMode=process
Restart=on-abnormal
User=scylla
OOMScoreAdjust=-950
StandardOutput=syslog
StandardError=syslog
SyslogLevelPrefix=false
[Install]
WantedBy=multi-user.target


@@ -7,6 +7,14 @@ print_usage() {
echo " --rebuild-dep rebuild dependency packages"
exit 1
}
install_deps() {
echo Y | sudo mk-build-deps
DEB_FILE=`ls *-build-deps*.deb`
sudo gdebi -n $DEB_FILE
sudo rm -f $DEB_FILE
sudo dpkg -P ${DEB_FILE%%_*.deb}
}
REBUILD=0
DIST=0
while [ $# -gt 0 ]; do
@@ -54,6 +62,9 @@ fi
if [ ! -f /usr/bin/lsb_release ]; then
sudo apt-get -y install lsb-release
fi
if [ ! -f /usr/bin/gdebi ]; then
sudo apt-get -y install gdebi-core
fi
DISTRIBUTION=`lsb_release -i|awk '{print $3}'`
CODENAME=`lsb_release -c|awk '{print $2}'`
@@ -84,7 +95,8 @@ if [ "$DISTRIBUTION" = "Debian" ]; then
sed -i -e "s/@@COMPILER@@/g++-5/g" debian/rules
sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++-5, libunwind-dev/g" debian/control
sed -i -e "s#@@INSTALL@@##g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER@@#dist/common/systemd/scylla-housekeeping.timer /lib/systemd/system#g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER_D@@#dist/common/systemd/scylla-housekeeping-daily.timer /lib/systemd/system#g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER_R@@#dist/common/systemd/scylla-housekeeping-restart.timer /lib/systemd/system#g" debian/scylla-server.install
sed -i -e "s#@@SYSCTL@@#dist/debian/sysctl.d/99-scylla.conf etc/sysctl.d#g" debian/scylla-server.install
elif [ "$VERSION_ID" = "14.04" ]; then
sed -i -e "s/@@REVISION@@/0ubuntu1/g" debian/changelog
@@ -92,7 +104,8 @@ elif [ "$VERSION_ID" = "14.04" ]; then
sed -i -e "s/@@COMPILER@@/g++-5/g" debian/rules
sed -i -e "s/@@BUILD_DEPENDS@@/g++-5, libunwind8-dev/g" debian/control
sed -i -e "s#@@INSTALL@@#dist/debian/sudoers.d/scylla etc/sudoers.d#g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER@@##g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER_D@@##g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER_R@@##g" debian/scylla-server.install
sed -i -e "s#@@SYSCTL@@#dist/debian/sysctl.d/99-scylla.conf etc/sysctl.d#g" debian/scylla-server.install
else
sed -i -e "s/@@REVISION@@/0ubuntu1/g" debian/changelog
@@ -100,7 +113,8 @@ else
sed -i -e "s/@@COMPILER@@/g++/g" debian/rules
sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++, libunwind-dev/g" debian/control
sed -i -e "s#@@INSTALL@@##g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER@@#dist/common/systemd/scylla-housekeeping.timer /lib/systemd/system#g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER_D@@#dist/common/systemd/scylla-housekeeping-daily.timer /lib/systemd/system#g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER_R@@#dist/common/systemd/scylla-housekeeping-restart.timer /lib/systemd/system#g" debian/scylla-server.install
sed -i -e "s#@@SYSCTL@@##g" debian/scylla-server.install
fi
if [ $DIST -gt 0 ]; then
@@ -116,7 +130,10 @@ fi
cp dist/common/systemd/scylla-server.service.in debian/scylla-server.service
sed -i -e "s#@@SYSCONFDIR@@#/etc/default#g" debian/scylla-server.service
cp dist/common/systemd/scylla-housekeeping.service debian/scylla-server.scylla-housekeeping.service
cp dist/common/systemd/scylla-housekeeping-daily.service.in debian/scylla-server.scylla-housekeeping-daily.service
sed -i -e "s#@@REPOFILES@@#'/etc/apt/sources.list.d/scylla*.list'#g" debian/scylla-server.scylla-housekeeping-daily.service
cp dist/common/systemd/scylla-housekeeping-restart.service.in debian/scylla-server.scylla-housekeeping-restart.service
sed -i -e "s#@@REPOFILES@@#'/etc/apt/sources.list.d/scylla*.list'#g" debian/scylla-server.scylla-housekeeping-restart.service
cp dist/common/systemd/node-exporter.service debian/scylla-server.node-exporter.service
if [ "$VERSION_ID" = "14.04" ] && [ $REBUILD -eq 0 ]; then
@@ -140,5 +157,5 @@ else
sudo apt-get install g++
fi
echo Y | sudo mk-build-deps -i -r
install_deps
debuild -r fakeroot -us -uc


@@ -4,7 +4,7 @@ Homepage: http://scylladb.com
Section: database
Priority: optional
Standards-Version: 3.9.5
Build-Depends: debhelper (>= 9), libyaml-cpp-dev, liblz4-dev, libsnappy-dev, libcrypto++-dev, libjsoncpp-dev, libaio-dev, libthrift-dev, thrift-compiler, antlr3, antlr3-c++-dev, ragel, ninja-build, git, libboost-program-options1.55-dev | libboost-program-options-dev, libboost-filesystem1.55-dev | libboost-filesystem-dev, libboost-system1.55-dev | libboost-system-dev, libboost-thread1.55-dev | libboost-thread-dev, libboost-test1.55-dev | libboost-test-dev, libgnutls28-dev, libhwloc-dev, libnuma-dev, libpciaccess-dev, xfslibs-dev, python3-pyparsing, libxml2-dev, libsctp-dev, python-urwid, pciutils, libprotobuf-dev, protobuf-compiler, systemtap-sdt-dev, @@BUILD_DEPENDS@@
Build-Depends: debhelper (>= 9), libyaml-cpp-dev, liblz4-dev, libsnappy-dev, libcrypto++-dev, libjsoncpp-dev, libaio-dev, libthrift-dev, thrift-compiler, antlr3, antlr3-c++-dev, ragel, ninja-build, git, libboost-program-options1.55-dev | libboost-program-options-dev, libboost-filesystem1.55-dev | libboost-filesystem-dev, libboost-system1.55-dev | libboost-system-dev, libboost-thread1.55-dev | libboost-thread-dev, libboost-test1.55-dev | libboost-test-dev, libgnutls28-dev, libhwloc-dev, libnuma-dev, libpciaccess-dev, xfslibs-dev, python3-pyparsing, libxml2-dev, libsctp-dev, python-urwid, pciutils, libprotobuf-dev, protobuf-compiler, systemtap-sdt-dev, libtool, automake, @@BUILD_DEPENDS@@
Package: scylla-conf
Architecture: any


@@ -1,7 +1,14 @@
#!/bin/sh
#!/bin/bash
set -e
sysctl -p/etc/sysctl.d/99-scylla-sched.conf
KVER=$(uname -r)
if [[ $KVER =~ 3\.13\.0\-([0-9]+)-generic ]]; then
echo "kernel $KVER detected, skip running sysctl..."
else
# expect failures in virtualized environments
sysctl -p/etc/sysctl.d/99-scylla-sched.conf || :
fi
#DEBHELPER#


@@ -29,10 +29,10 @@ setgid scylla
script
# make sure scylla is up before checking for the version
sleep 5
/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -c /etc/scylla.d/housekeeping.cfg -q version --mode r || true
/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid --repo-files '/etc/apt/sources.list.d/scylla*.list' -c /etc/scylla.d/housekeeping.cfg -q version --mode r || true
while [ 1 ]
do
sleep 1d
/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -c /etc/scylla.d/housekeeping.cfg -q version --mode d || true
/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid --repo-files '/etc/apt/sources.list.d/scylla*.list' -c /etc/scylla.d/housekeeping.cfg -q version --mode d || true
done
end script


@@ -41,6 +41,7 @@ script
fi
. "$i"
done
export SCYLLA_CONF SCYLLA_HOME
exec /usr/bin/scylla $SCYLLA_ARGS $SEASTAR_IO $DEV_MODE $CPUSET
end script


@@ -1,7 +1,25 @@
#!/bin/bash -e
. /etc/os-release
install_deps() {
echo Y | sudo mk-build-deps
DEB_FILE=`ls *-build-deps*.deb`
sudo gdebi -n $DEB_FILE
sudo rm -f $DEB_FILE
sudo dpkg -P ${DEB_FILE%%_*.deb}
}
DISTRIBUTION=`lsb_release -i|awk '{print $3}'`
CODENAME=`lsb_release -c|awk '{print $2}'`
# workaround fix for #2444
if [ "$CODENAME" = "jessie" ]; then
if [ ! -e /etc/apt/sources.list.d/jessie-backports.list ]; then
sudo sh -c 'echo deb "http://httpredir.debian.org/debian jessie-backports main" > /etc/apt/sources.list.d/jessie-backports.list'
fi
sudo apt-get -y update
sudo apt-get install -t jessie-backports -y texlive
fi
sudo apt-get install -y gdebi-core
if [ "$VERSION_ID" = "14.04" ] || [ "$DISTRIBUTION" = "Debian" ]; then
@@ -11,7 +29,7 @@ if [ "$VERSION_ID" = "14.04" ] || [ "$DISTRIBUTION" = "Debian" ]; then
cp -a dist/debian/dep/antlr3-3.5.2/* build/antlr3-3.5.2
cd build/antlr3-3.5.2
wget -nv http://www.antlr3.org/download/antlr-3.5.2-complete-no-st3.jar
echo Y | sudo mk-build-deps -i -r
install_deps
debuild -r fakeroot --no-tgz-check -us -uc
cd -
fi
@@ -39,7 +57,7 @@ if [ "$VERSION_ID" = "14.04" ] || [ "$DISTRIBUTION" = "Debian" ]; then
cd -
cd build/gdb-7.11
patch -p0 < ../../dist/debian/dep/gdb.diff
echo Y | sudo mk-build-deps -i -r
install_deps
debuild -r fakeroot --no-tgz-check -us -uc
cd -
fi
@@ -56,7 +74,7 @@ if [ ! -f build/antlr3-c++-dev_*.deb ]; then
cd -
cp -a dist/debian/dep/antlr3-c++-dev-3.5.2/debian build/antlr3-c++-dev-3.5.2
cd build/antlr3-c++-dev-3.5.2
echo Y | sudo mk-build-deps -i -r
install_deps
debuild -r fakeroot --no-tgz-check -us -uc
cd -
fi
@@ -70,17 +88,18 @@ if [ ! -f build/libthrift0_*.deb ]; then
tar xpf thrift-0.9.3.tar.gz
cd thrift-0.9.3
patch -p0 < ../../dist/debian/dep/thrift.diff
echo Y | sudo mk-build-deps -i -r
install_deps
debuild -r fakeroot --no-tgz-check -us -uc
cd ../..
fi
if [ "$DISTRIBUTION" = "Debian" ] && [ "$VERSION_ID" = "8" ]; then
if [ ! -f build/gcc-5_*.deb ]; then
sudo cp dist/debian/dep/debian-stretch-source.list /etc/apt/sources.list.d/
sudo apt-get update
cd build
apt-get source gcc-5/stretch=5.4.1-2
wget https://launchpad.net/debian/+archive/primary/+files/gcc-5_5.4.1-5.dsc
wget https://launchpad.net/debian/+archive/primary/+files/gcc-5_5.4.1.orig.tar.gz
wget https://launchpad.net/debian/+archive/primary/+files/gcc-5_5.4.1-5.diff.gz
dpkg-source -x gcc-5_5.4.1-5.dsc
cd gcc-5-5.4.1
# resolve build time dependencies manually, since mk-build-deps doesn't work for the gcc package
sudo apt-get install -y g++-multilib libc6-dev-i386 lib32gcc1 libc6-dev-x32 libx32gcc1 libc6-dbg m4 libtool autoconf2.64 autogen gawk zlib1g-dev systemtap-sdt-dev gperf bison flex gdb texinfo locales sharutils libantlr-java libffi-dev gnat-4.9 libisl-dev libmpc-dev libmpfr-dev libgmp-dev dejagnu realpath chrpath quilt doxygen graphviz ghostscript texlive-latex-base xsltproc libxml2-utils docbook-xsl-ns


@@ -1,6 +1,5 @@
diff -Nur debian/rules.conf /home/syuu/gcc-5-5.4.1/debian/rules.conf
--- debian/rules.conf 2016-10-14 04:54:21.000000000 +0000
+++ /home/syuu/gcc-5-5.4.1/debian/rules.conf 2016-10-12 17:28:54.138711378 +0000
--- debian/rules.conf 2017-02-24 19:02:52.000000000 +0000
+++ /home/syuu/gcc-5.5/gcc-5-5.4.1/debian/rules.conf 2017-02-24 18:13:59.000000000 +0000
@@ -206,7 +206,7 @@
ifneq (,$(filter $(distrelease),vivid))
BINUTILSBDV = 2.25-3~
@@ -10,14 +9,16 @@ diff -Nur debian/rules.conf /home/syuu/gcc-5-5.4.1/debian/rules.conf
else ifneq (,$(filter $(distrelease),sid stretch xenial))
BINUTILSBDV = 2.26.1
endif
@@ -387,9 +387,9 @@
@@ -386,10 +386,10 @@
MPFR_BUILD_DEP = libmpfr-dev (>= 3.0.0-9~),
endif
ISL_BUILD_DEP = libisl-dev,
-ifneq (,$(filter $(distrelease),jessie sid experimental))
-ISL_BUILD_DEP = libisl-dev,
-ifneq (,$(filter $(distrelease),jessie stretch sid experimental))
- ISL_BUILD_DEP = libisl-dev (>= 0.14),
-endif
+#ifneq (,$(filter $(distrelease),jessie sid experimental))
+#ISL_BUILD_DEP = libisl-dev,
+#ifneq (,$(filter $(distrelease),jessie stretch sid experimental))
+# ISL_BUILD_DEP = libisl-dev (>= 0.14),
+#endif
@@ -37,9 +38,8 @@ diff -Nur debian/rules.conf /home/syuu/gcc-5-5.4.1/debian/rules.conf
ifneq ($(DEB_CROSS),yes)
# all archs for which to create b-d's
any_archs = alpha amd64 armel armhf arm64 i386 mips mipsel mips64 mips64el powerpc ppc64 ppc64el m68k sh4 sparc64 s390x x32
diff -Nur debian/rules.defs /home/syuu/gcc-5-5.4.1/debian/rules.defs
--- debian/rules.defs 2016-10-14 04:54:21.000000000 +0000
+++ /home/syuu/gcc-5-5.4.1/debian/rules.defs 2016-10-13 10:18:51.647631508 +0000
--- debian/rules.defs 2017-02-24 19:02:52.000000000 +0000
+++ /home/syuu/gcc-5.5/gcc-5-5.4.1/debian/rules.defs 2017-02-24 18:13:59.000000000 +0000
@@ -412,7 +412,7 @@
# gcc versions (fixincludes, libgcj-common) ...
#with_common_pkgs := yes


@@ -1,2 +0,0 @@
deb-src http://httpredir.debian.org/debian stretch main
deb-src http://httpredir.debian.org/debian stretch-updates main


@@ -11,7 +11,8 @@ override_dh_auto_clean:
override_dh_installinit:
dh_installinit --no-start @@DH_INSTALLINIT@@
dh_installinit --no-start --name scylla-housekeeping @@DH_INSTALLINIT@@
dh_installinit --no-start --name scylla-housekeeping-daily @@DH_INSTALLINIT@@
dh_installinit --no-start --name scylla-housekeeping-restart @@DH_INSTALLINIT@@
dh_installinit --no-start --name node-exporter @@DH_INSTALLINIT@@
override_dh_strip:


@@ -15,6 +15,7 @@ build/release/iotune usr/bin
dist/common/bin/scyllatop usr/bin
dist/common/sbin/* usr/sbin
@@ADDHKCFG@@
@@HKDOTTIMER@@
@@HKDOTTIMER_D@@
@@HKDOTTIMER_R@@
@@INSTALL@@
@@SYSCTL@@


@@ -7,7 +7,7 @@ ENV container docker
VOLUME [ "/sys/fs/cgroup" ]
#install scylla
RUN curl http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo -o /etc/yum.repos.d/scylla.repo
RUN curl http://downloads.scylladb.com/rpm/centos/scylla-1.7.repo -o /etc/yum.repos.d/scylla.repo
RUN yum -y install epel-release
RUN yum -y clean expire-cache
RUN yum -y update
@@ -38,6 +38,6 @@ ADD commandlineparser.py /commandlineparser.py
ADD docker-entrypoint.py /docker-entrypoint.py
ENTRYPOINT ["/docker-entrypoint.py"]
EXPOSE 10000 9042 9160 7000 7001
EXPOSE 10000 9042 9160 9180 7000 7001
VOLUME [ "/var/lib/scylla" ]
RUN chown -R scylla.scylla /var/lib/scylla


@@ -74,7 +74,7 @@ if [ "$ID" = "centos" ] || [ "$ID" = "rhel" ]; then
./dist/redhat/centos_dep/build_dependency.sh
else
if [ "$ID" = "centos" ]; then
sudo curl https://s3.amazonaws.com/downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo -o /etc/yum.repos.d/scylla.repo
sudo curl http://downloads.scylladb.com.s3.amazonaws.com/rpm/centos/scylla-1.7.repo -o /etc/yum.repos.d/scylla.repo
else
echo "RHEL requires --rebuild-deps option."
exit 1


@@ -28,10 +28,6 @@ if [ ! -f boost-1.58.0-11.fc23.src.rpm ]; then
wget -nv https://kojipkgs.fedoraproject.org//packages/boost/1.58.0/11.fc23/src/boost-1.58.0-11.fc23.src.rpm
fi
if [ ! -f ninja-build-1.6.0-2.fc23.src.rpm ]; then
wget -nv https://kojipkgs.fedoraproject.org//packages/ninja-build/1.6.0/2.fc23/src/ninja-build-1.6.0-2.fc23.src.rpm
fi
if [ ! -f ragel-6.8-5.fc23.src.rpm ]; then
wget -nv https://kojipkgs.fedoraproject.org//packages/ragel/6.8/5.fc23/src/ragel-6.8-5.fc23.src.rpm
fi
@@ -94,13 +90,6 @@ if [ ! -f $RPMBUILD/RPMS/x86_64/scylla-boost-1.58.0-11.el7*.x86_64.rpm ]; then
fi
do_install scylla-boost*
if [ ! -f $RPMBUILD/RPMS/x86_64/scylla-ninja-build-1.6.0-2.el7*.x86_64.rpm ]; then
rpm --define "_topdir $RPMBUILD" -ivh build/srpms/ninja-build-1.6.0-2.fc23.src.rpm
patch $RPMBUILD/SPECS/ninja-build.spec < dist/redhat/centos_dep/ninja-build.diff
rpmbuild --define "_topdir $RPMBUILD" -ba $RPMBUILD/SPECS/ninja-build.spec
fi
do_install scylla-ninja-build-1.6.0-2.el7*.x86_64.rpm
if [ ! -f $RPMBUILD/RPMS/x86_64/scylla-ragel-6.8-5.el7*.x86_64.rpm ]; then
rpm --define "_topdir $RPMBUILD" -ivh build/srpms/ragel-6.8-5.fc23.src.rpm
patch $RPMBUILD/SPECS/ragel.spec < dist/redhat/centos_dep/ragel.diff


@@ -1,56 +0,0 @@
--- ninja-build.spec.orig 2016-01-20 14:41:16.892802134 +0000
+++ ninja-build.spec 2016-01-20 14:44:42.453227192 +0000
@@ -1,19 +1,18 @@
-Name: ninja-build
+Name: scylla-ninja-build
Version: 1.6.0
Release: 2%{?dist}
Summary: A small build system with a focus on speed
License: ASL 2.0
URL: http://martine.github.com/ninja/
Source0: https://github.com/martine/ninja/archive/v%{version}.tar.gz#/ninja-%{version}.tar.gz
-Source1: ninja.vim
# Rename mentions of the executable name to be ninja-build.
Patch1000: ninja-1.6.0-binary-rename.patch
+Requires: scylla-env
BuildRequires: asciidoc
BuildRequires: gtest-devel
BuildRequires: python2-devel
-BuildRequires: re2c >= 0.11.3
-Requires: emacs-filesystem
-Requires: vim-filesystem
+#BuildRequires: scylla-re2c >= 0.11.3
+%define _prefix /opt/scylladb
%description
Ninja is a small build system with a focus on speed. It differs from other
@@ -32,15 +31,8 @@
./ninja -v ninja_test
%install
-# TODO: Install ninja_syntax.py?
-mkdir -p %{buildroot}/{%{_bindir},%{_datadir}/bash-completion/completions,%{_datadir}/emacs/site-lisp,%{_datadir}/vim/vimfiles/syntax,%{_datadir}/vim/vimfiles/ftdetect,%{_datadir}/zsh/site-functions}
-
+mkdir -p %{buildroot}/opt/scylladb/bin
install -pm755 ninja %{buildroot}%{_bindir}/ninja-build
-install -pm644 misc/bash-completion %{buildroot}%{_datadir}/bash-completion/completions/ninja-bash-completion
-install -pm644 misc/ninja-mode.el %{buildroot}%{_datadir}/emacs/site-lisp/ninja-mode.el
-install -pm644 misc/ninja.vim %{buildroot}%{_datadir}/vim/vimfiles/syntax/ninja.vim
-install -pm644 %{SOURCE1} %{buildroot}%{_datadir}/vim/vimfiles/ftdetect/ninja.vim
-install -pm644 misc/zsh-completion %{buildroot}%{_datadir}/zsh/site-functions/_ninja
%check
# workaround possible too low default limits
@@ -50,12 +42,6 @@
%files
%doc COPYING HACKING.md README doc/manual.html
%{_bindir}/ninja-build
-%{_datadir}/bash-completion/completions/ninja-bash-completion
-%{_datadir}/emacs/site-lisp/ninja-mode.el
-%{_datadir}/vim/vimfiles/syntax/ninja.vim
-%{_datadir}/vim/vimfiles/ftdetect/ninja.vim
-# zsh does not have a -filesystem package
-%{_datadir}/zsh/
%changelog
* Mon Nov 16 2015 Ben Boeckel <mathstuf@gmail.com> - 1.6.0-2


@@ -7,7 +7,7 @@ Group: Applications/Databases
License: AGPLv3
URL: http://www.scylladb.com/
Source0: %{name}-@@VERSION@@-@@RELEASE@@.tar
Requires: scylla-server scylla-jmx scylla-tools scylla-kernel-conf
Requires: scylla-server = @@VERSION@@ scylla-jmx = @@VERSION@@ scylla-tools = @@VERSION@@ scylla-kernel-conf = @@VERSION@@
Obsoletes: scylla-server < 1.1
%description
@@ -27,9 +27,9 @@ Group: Applications/Databases
Summary: The Scylla database server
License: AGPLv3
URL: http://www.scylladb.com/
BuildRequires: libaio-devel libstdc++-devel cryptopp-devel hwloc-devel numactl-devel libpciaccess-devel libxml2-devel zlib-devel thrift-devel yaml-cpp-devel lz4-devel snappy-devel jsoncpp-devel systemd-devel xz-devel pcre-devel elfutils-libelf-devel bzip2-devel keyutils-libs-devel xfsprogs-devel make gnutls-devel systemd-devel lksctp-tools-devel protobuf-devel protobuf-compiler libunwind-devel systemtap-sdt-devel
%{?fedora:BuildRequires: boost-devel ninja-build ragel antlr3-tool antlr3-C++-devel python3 gcc-c++ libasan libubsan python3-pyparsing dnf-yum}
%{?rhel:BuildRequires: scylla-libstdc++-static scylla-boost-devel scylla-boost-static scylla-ninja-build scylla-ragel scylla-antlr3-tool scylla-antlr3-C++-devel python34 scylla-gcc-c++ >= 5.1.1, python34-pyparsing}
BuildRequires: libaio-devel libstdc++-devel cryptopp-devel hwloc-devel numactl-devel libpciaccess-devel libxml2-devel zlib-devel thrift-devel yaml-cpp-devel lz4-devel snappy-devel jsoncpp-devel systemd-devel xz-devel pcre-devel elfutils-libelf-devel bzip2-devel keyutils-libs-devel xfsprogs-devel make gnutls-devel systemd-devel lksctp-tools-devel protobuf-devel protobuf-compiler libunwind-devel systemtap-sdt-devel libtool automake ninja-build
%{?fedora:BuildRequires: boost-devel ragel antlr3-tool antlr3-C++-devel python3 gcc-c++ libasan libubsan python3-pyparsing dnf-yum}
%{?rhel:BuildRequires: scylla-libstdc++-static scylla-boost-devel scylla-boost-static scylla-ragel scylla-antlr3-tool scylla-antlr3-C++-devel python34 scylla-gcc-c++ >= 5.1.1, python34-pyparsing}
Requires: scylla-conf systemd-libs hwloc collectd PyYAML python-urwid pciutils pyparsing python-requests curl util-linux python-setuptools pciutils
%{?rhel:Requires: python34 python34-PyYAML}
Conflicts: abrt
@@ -53,6 +53,10 @@ python3.4 ./configure.py --disable-xen --enable-dpdk --mode=release --static-std
ninja-build %{?_smp_mflags} build/release/scylla build/release/iotune
cp dist/common/systemd/scylla-server.service.in build/scylla-server.service
sed -i -e "s#@@SYSCONFDIR@@#/etc/sysconfig#g" build/scylla-server.service
cp dist/common/systemd/scylla-housekeeping-restart.service.in build/scylla-housekeeping-restart.service
sed -i -e "s#@@REPOFILES@@#'/etc/yum.repos.d/scylla*.repo'#g" build/scylla-housekeeping-restart.service
cp dist/common/systemd/scylla-housekeeping-daily.service.in build/scylla-housekeeping-daily.service
sed -i -e "s#@@REPOFILES@@#'/etc/yum.repos.d/scylla*.repo'#g" build/scylla-housekeeping-daily.service
%install
rm -rf $RPM_BUILD_ROOT
@@ -63,6 +67,9 @@ mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/security/limits.d/
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/collectd.d/
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/scylla.d/
%if 0%{?rhel}
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/
%endif
mkdir -p $RPM_BUILD_ROOT%{_sysctldir}/
mkdir -p $RPM_BUILD_ROOT%{_docdir}/scylla/
mkdir -p $RPM_BUILD_ROOT%{_unitdir}
@@ -73,6 +80,9 @@ install -m644 dist/common/limits.d/scylla.conf $RPM_BUILD_ROOT%{_sysconfdir}/sec
install -m644 dist/common/collectd.d/scylla.conf $RPM_BUILD_ROOT%{_sysconfdir}/collectd.d/
install -m644 dist/common/scylla.d/*.conf $RPM_BUILD_ROOT%{_sysconfdir}/scylla.d/
install -m644 dist/common/sysctl.d/*.conf $RPM_BUILD_ROOT%{_sysctldir}/
%if 0%{?rhel}
install -m644 dist/common/modprobe.d/*.conf $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/
%endif
install -d -m755 $RPM_BUILD_ROOT%{_sysconfdir}/scylla
install -m644 conf/scylla.yaml $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
install -m644 conf/cassandra-rackdc.properties $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
@@ -151,10 +161,8 @@ rm -rf $RPM_BUILD_ROOT
%{_docdir}/scylla/NOTICE.txt
%{_docdir}/scylla/ORIGIN
%{_docdir}/scylla/licenses/
%{_unitdir}/scylla-server.service
%{_unitdir}/scylla-housekeeping.service
%{_unitdir}/scylla-housekeeping.timer
%{_unitdir}/node-exporter.service
%{_unitdir}/*.service
%{_unitdir}/*.timer
%{_bindir}/scylla
%{_bindir}/iotune
%{_bindir}/scyllatop
@@ -228,6 +236,7 @@ Group: Applications/Databases
Summary: Scylla configuration package for the Linux kernel
License: AGPLv3
URL: http://www.scylladb.com/
Requires: kmod
%description kernel-conf
This package contains Linux kernel configuration changes for the Scylla database. Install this package
@@ -237,9 +246,18 @@ if Scylla is the main application on your server and you wish to optimize its la
# We cannot use the sysctl_apply rpm macro because it is not present in 7.0
# following is a "manual" expansion
/usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
# Write modprobe.d params when module already loaded
%if 0%{?rhel}
if [ -e /sys/module/raid0/parameters/devices_discard_performance ]; then
echo Y > /sys/module/raid0/parameters/devices_discard_performance
fi
%endif
%files kernel-conf
%defattr(-,root,root)
%if 0%{?rhel}
%config(noreplace) %{_sysconfdir}/modprobe.d/*.conf
%endif
%{_sysctldir}/*.conf
%changelog


@@ -50,6 +50,12 @@ public:
// for real time waits.
};
// Returns a time point which is earlier than t by d, or the minimum time point if the result cannot be represented.
template<typename Clock, typename Duration, typename Rep, typename Period>
inline
auto saturating_subtract(std::chrono::time_point<Clock, Duration> t, std::chrono::duration<Rep, Period> d) -> decltype(t) {
return std::max(t, decltype(t)::min() + d) - d;
}
using expiry_opt = std::experimental::optional<gc_clock::time_point>;
using ttl_opt = std::experimental::optional<gc_clock::duration>;


@@ -43,6 +43,7 @@
#include "gms/endpoint_state.hh"
#include "gms/application_state.hh"
#include "gms/inet_address.hh"
#include "service/storage_service.hh"
#include "log.hh"
#include <iostream>
#include <chrono>
@@ -56,37 +57,13 @@ constexpr std::chrono::milliseconds failure_detector::DEFAULT_MAX_PAUSE;
using clk = arrival_window::clk;
static clk::duration get_initial_value() {
#if 0
String newvalue = System.getProperty("cassandra.fd_initial_value_ms");
if (newvalue == null)
{
return Gossiper.intervalInMillis * 2;
}
else
{
logger.info("Overriding FD INITIAL_VALUE to {}ms", newvalue);
return Integer.parseInt(newvalue);
}
#endif
warn(unimplemented::cause::GOSSIP);
return std::chrono::seconds(2);
auto& cfg = service::get_local_storage_service().db().local().get_config();
return std::chrono::milliseconds(cfg.fd_initial_value_ms());
}
clk::duration arrival_window::get_max_interval() {
#if 0
sstring newvalue = System.getProperty("cassandra.fd_max_interval_ms");
if (newvalue == null)
{
return failure_detector.INITIAL_VALUE_NANOS;
}
else
{
logger.info("Overriding FD MAX_INTERVAL to {}ms", newvalue);
return TimeUnit.NANOSECONDS.convert(Integer.parseInt(newvalue), TimeUnit.MILLISECONDS);
}
#endif
warn(unimplemented::cause::GOSSIP);
return get_initial_value();
auto& cfg = service::get_local_storage_service().db().local().get_config();
return std::chrono::milliseconds(cfg.fd_max_interval_ms());
}
void arrival_window::add(clk::time_point value, const gms::inet_address& ep) {
@@ -95,7 +72,7 @@ void arrival_window::add(clk::time_point value, const gms::inet_address& ep) {
if (inter_arrival_time <= get_max_interval()) {
_arrival_intervals.add(inter_arrival_time.count());
} else {
logger.debug("failure_detector: Ignoring interval time of {} for {}", inter_arrival_time.count(), ep);
logger.debug("failure_detector: Ignoring interval time of {} for {}, mean={}, size={}", inter_arrival_time.count(), ep, mean(), size());
}
} else {
// We use a very large initial interval since the "right" average depends on the cluster size

View File

@@ -87,6 +87,8 @@ public:
// see CASSANDRA-2597 for an explanation of the math at work here.
double phi(clk::time_point tnow);
size_t size() { return _arrival_intervals.size(); }
friend std::ostream& operator<<(std::ostream& os, const arrival_window& w);
};


@@ -480,7 +480,7 @@ void gossiper::remove_endpoint(inet_address endpoint) {
logger.info("removed {} from _seeds, updated _seeds list = {}", endpoint, _seeds);
}
_live_endpoints.erase(endpoint);
_live_endpoints.erase(std::remove(_live_endpoints.begin(), _live_endpoints.end(), endpoint), _live_endpoints.end());
_live_endpoints_just_added.remove(endpoint);
_unreachable_endpoints.erase(endpoint);
quarantine_endpoint(endpoint);
@@ -567,10 +567,36 @@ void gossiper::run() {
_gossiped_to_seed = false;
/* Gossip to some random live member */
do_gossip_to_live_member(message).handle_exception([] (auto ep) {
logger.trace("Faill to do_gossip_to_live_member: {}", ep);
});
auto get_random_node = [this] (const std::vector<inet_address>& nodes) {
std::uniform_int_distribution<int> dist(0, nodes.size() - 1);
int index = dist(this->_random);
return nodes[index];
};
/* Gossip to some random live members */
// TODO: For now, we choose a tenth of all the nodes in the cluster.
auto nr_live_nodes = std::max(size_t(1), endpoint_state_map.size() / 10);
nr_live_nodes = std::min(nr_live_nodes, _live_endpoints.size());
std::unordered_set<gms::inet_address> live_nodes;
logger.debug("nr_live_nodes={}, endpoint_state_map.size()={}, live_endpoints.size={}",
nr_live_nodes, endpoint_state_map.size(), _live_endpoints.size());
while (live_nodes.size() < nr_live_nodes && nr_live_nodes <= _live_endpoints.size()) {
if (!_live_endpoints_just_added.empty()) {
auto ep = _live_endpoints_just_added.front();
_live_endpoints_just_added.pop_front();
logger.info("Favor newly added node {}", ep);
live_nodes.insert(ep);
} else {
// Get a random live node
live_nodes.insert(get_random_node(_live_endpoints));
}
}
logger.debug("Talk to {} live nodes: {}", nr_live_nodes, live_nodes);
for (auto& ep: live_nodes) {
do_gossip_to_live_member(message, ep).handle_exception([] (auto ep) {
logger.trace("Failed to do_gossip_to_live_member: {}", ep);
});
}
/* Gossip to some unreachable member with some probability to check if he is back up */
do_gossip_to_unreachable_member(message).handle_exception([] (auto ep) {
@@ -695,7 +721,7 @@ void gossiper::unregister_(shared_ptr<i_endpoint_state_change_subscriber> subscr
}
std::set<inet_address> gossiper::get_live_members() {
std::set<inet_address> live_members(_live_endpoints);
std::set<inet_address> live_members(_live_endpoints.begin(), _live_endpoints.end());
if (!live_members.count(get_broadcast_address())) {
live_members.insert(get_broadcast_address());
}
@@ -952,19 +978,8 @@ future<int> gossiper::get_current_heart_beat_version(inet_address endpoint) {
});
}
future<> gossiper::do_gossip_to_live_member(gossip_digest_syn message) {
size_t size = _live_endpoints.size();
if (size == 0) {
return make_ready_future<>();
}
logger.trace("do_gossip_to_live_member: live_endpoint nr={}", _live_endpoints.size());
if (!_live_endpoints_just_added.empty()) {
auto ep = _live_endpoints_just_added.front();
_live_endpoints_just_added.pop_front();
logger.info("do_gossip_to_live_member: Favor newly added node {}", ep);
return send_gossip(message, std::set<inet_address>{ep});
}
return send_gossip(message, _live_endpoints);
future<> gossiper::do_gossip_to_live_member(gossip_digest_syn message, gms::inet_address ep) {
return send_gossip(message, {ep});
}
future<> gossiper::do_gossip_to_unreachable_member(gossip_digest_syn message) {
@@ -1135,6 +1150,15 @@ void gossiper::mark_alive(inet_address addr, endpoint_state& local_state) {
// real_mark_alive(addr, local_state);
// return;
// }
auto inserted = _pending_mark_alive_endpoints.insert(addr).second;
if (inserted) {
// The node is not in the _pending_mark_alive_endpoints
logger.debug("Mark Node {} alive with EchoMessage", addr);
} else {
// We are in the process of marking this node alive
logger.debug("Node {} is being marked as up, ignoring duplicated mark alive operation", addr);
return;
}
local_state.mark_dead();
msg_addr id = get_msg_addr(addr);
@@ -1143,10 +1167,22 @@ void gossiper::mark_alive(inet_address addr, endpoint_state& local_state) {
ms().send_gossip_echo(id).get();
logger.trace("Got EchoMessage Reply");
set_last_processed_message_at();
real_mark_alive(id.addr, local_state);
// After sending the echo message, the node might no longer be in
// endpoint_state_map; using the earlier local_state reference
// could cause a use-after-free
auto it = endpoint_state_map.find(addr);
if (it == endpoint_state_map.end()) {
logger.info("Node {} is not in endpoint_state_map anymore", addr);
} else {
endpoint_state& state = it->second;
logger.debug("Mark Node {} alive after EchoMessage", addr);
real_mark_alive(addr, state);
}
} catch(...) {
logger.warn("Failed to send EchoMessage to {}: {}", id, std::current_exception());
}
_pending_mark_alive_endpoints.erase(addr);
}
// Runs inside seastar::async context
@@ -1154,7 +1190,10 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
logger.trace("marking as alive {}", addr);
local_state.mark_alive();
local_state.update_timestamp(); // prevents do_status_check from racing us and evicting if it was down > A_VERY_LONG_TIME
_live_endpoints.insert(addr);
auto it_ = std::find(_live_endpoints.begin(), _live_endpoints.end(), addr);
if (it_ == _live_endpoints.end()) {
_live_endpoints.push_back(addr);
}
auto it = std::find(_live_endpoints_just_added.begin(), _live_endpoints_just_added.end(), addr);
if (it == _live_endpoints_just_added.end()) {
_live_endpoints_just_added.push_back(addr);
@@ -1176,7 +1215,7 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
void gossiper::mark_dead(inet_address addr, endpoint_state& local_state) {
logger.trace("marking as down {}", addr);
local_state.mark_dead();
_live_endpoints.erase(addr);
_live_endpoints.erase(std::remove(_live_endpoints.begin(), _live_endpoints.end(), addr), _live_endpoints.end());
_live_endpoints_just_added.remove(addr);
_unreachable_endpoints[addr] = now();
logger.info("InetAddress {} is now DOWN, status = {}", addr, get_gossip_status(local_state));
@@ -1188,10 +1227,7 @@ void gossiper::mark_dead(inet_address addr, endpoint_state& local_state) {
// Runs inside seastar::async context
void gossiper::handle_major_state_change(inet_address ep, const endpoint_state& eps) {
std::experimental::optional<endpoint_state> local_ep_state;
if (endpoint_state_map.count(ep) > 0) {
local_ep_state = endpoint_state_map.at(ep);
}
auto eps_old = get_endpoint_state_for_endpoint(ep);
if (!is_dead_state(eps) && !_in_shadow_round) {
if (endpoint_state_map.count(ep)) {
logger.debug("Node {} has restarted, now UP, status = {}", ep, get_gossip_status(eps));
@@ -1202,24 +1238,37 @@ void gossiper::handle_major_state_change(inet_address ep, const endpoint_state&
logger.trace("Adding endpoint state for {}, status = {}", ep, get_gossip_status(eps));
endpoint_state_map[ep] = eps;
auto& ep_state = endpoint_state_map.at(ep);
if (_in_shadow_round) {
// In shadow round, we are only interested in the peer's endpoint_state,
// e.g., gossip features, host_id, tokens. No need to call the
// on_restart or on_join callbacks or to go through the mark alive
// procedure with EchoMessage gossip message. We will do them during
// normal gossip runs anyway.
logger.debug("In shadow round addr={}, eps={}", ep, eps);
return;
}
if (local_ep_state) {
if (eps_old) {
// the node restarted: it is up to the subscriber to take whatever action is necessary
_subscribers.for_each([ep, local_ep_state] (auto& subscriber) {
subscriber->on_restart(ep, *local_ep_state);
_subscribers.for_each([ep, eps_old] (auto& subscriber) {
subscriber->on_restart(ep, *eps_old);
});
}
auto& ep_state = endpoint_state_map.at(ep);
if (!is_dead_state(ep_state)) {
mark_alive(ep, ep_state);
} else {
logger.debug("Not marking {} alive due to dead state {}", ep, get_gossip_status(eps));
mark_dead(ep, ep_state);
}
_subscribers.for_each([ep, ep_state] (auto& subscriber) {
subscriber->on_join(ep, ep_state);
});
auto eps_new = get_endpoint_state_for_endpoint(ep);
if (eps_new) {
_subscribers.for_each([ep, eps_new] (auto& subscriber) {
subscriber->on_join(ep, *eps_new);
});
}
// check this at the end so nodes will learn about the endpoint
if (is_shutdown(ep)) {
mark_as_shutdown(ep);
@@ -1240,6 +1289,10 @@ bool gossiper::is_shutdown(const inet_address& endpoint) const {
return get_gossip_status(endpoint) == sstring(versioned_value::SHUTDOWN);
}
bool gossiper::is_normal(const inet_address& endpoint) const {
return get_gossip_status(endpoint) == sstring(versioned_value::STATUS_NORMAL);
}
bool gossiper::is_silent_shutdown_state(const endpoint_state& ep_state) const{
sstring state = get_gossip_status(ep_state);
for (auto& deadstate : SILENT_SHUTDOWN_STATES) {
@@ -1394,9 +1447,11 @@ future<> gossiper::start_gossiping(int generation_nbr, std::map<application_stat
local_state.add_application_state(entry.first, entry.second);
}
auto generation = local_state.get_heart_beat_state().get_generation();
//notify snitches that Gossiper is about to start
return locator::i_endpoint_snitch::get_local_snitch_ptr()->gossiper_starting().then([this, &local_state] {
logger.trace("gossip started with generation {}", local_state.get_heart_beat_state().get_generation());
return locator::i_endpoint_snitch::get_local_snitch_ptr()->gossiper_starting().then([this, generation] {
logger.trace("gossip started with generation {}", generation);
_enabled = true;
_nr_run = 0;
_scheduled_gossip_task.arm(INTERVAL);
@@ -1493,16 +1548,19 @@ future<> gossiper::add_local_application_state(application_state state, versione
logger.error(err.c_str());
throw std::runtime_error(err);
}
endpoint_state& ep_state = gossiper.endpoint_state_map.at(ep_addr);
endpoint_state ep_state_before = gossiper.endpoint_state_map.at(ep_addr);
// Fire "before change" notifications:
gossiper.do_before_change_notifications(ep_addr, ep_state, state, value);
gossiper.do_before_change_notifications(ep_addr, ep_state_before, state, value);
// Notifications may have taken some time, so preventively raise the version
// of the new value, otherwise it could be ignored by the remote node
// if another value with a newer version was received in the meantime:
value = storage_service_value_factory().clone_with_higher_version(value);
// Add to local application state and fire "on change" notifications:
ep_state.add_application_state(state, value);
gossiper.do_on_change_notifications(ep_addr, state, value);
if (gossiper.endpoint_state_map.count(ep_addr)) {
auto& ep_state = gossiper.endpoint_state_map.at(ep_addr);
ep_state.add_application_state(state, value);
gossiper.do_on_change_notifications(ep_addr, state, value);
}
}).handle_exception([] (auto ep) {
logger.warn("Failed to apply application_state: {}", ep);
});


@@ -184,9 +184,12 @@ private:
} _subscribers;
/* live member set */
std::set<inet_address> _live_endpoints;
std::vector<inet_address> _live_endpoints;
std::list<inet_address> _live_endpoints_just_added;
/* nodes are being marked as alive */
std::unordered_set<inet_address> _pending_mark_alive_endpoints;
/* unreachable member set */
std::map<inet_address, clk::time_point> _unreachable_endpoints;
@@ -206,7 +209,7 @@ private:
clk::time_point _last_processed_message_at = now();
std::map<inet_address, clk::time_point> _shadow_unreachable_endpoints;
std::set<inet_address> _shadow_live_endpoints;
std::vector<inet_address> _shadow_live_endpoints;
void run();
public:
@@ -366,8 +369,8 @@ private:
*/
future<> send_gossip(gossip_digest_syn message, std::set<inet_address> epset);
/* Sends a Gossip message to a live member and returns true if the recipient was a seed */
future<> do_gossip_to_live_member(gossip_digest_syn message);
/* Sends a Gossip message to a live member */
future<> do_gossip_to_live_member(gossip_digest_syn message, inet_address ep);
/* Sends a Gossip message to an unreachable member */
future<> do_gossip_to_unreachable_member(gossip_digest_syn message);
@@ -501,6 +504,7 @@ public:
void debug_show();
public:
bool is_shutdown(const inet_address& endpoint) const;
bool is_normal(const inet_address& endpoint) const;
bool is_silent_shutdown_state(const endpoint_state& ep_state) const;
void mark_as_shutdown(const inet_address& endpoint);
void force_newer_generation();

View File

@@ -277,6 +277,9 @@ def is_optional(lst):
created_writers = set()
def get_member_name(name):
return name if not name.endswith('()') else name[:-2]
def get_members(cls):
return [p for p in cls["members"] if not is_class(p) and not is_enum(p)]
@@ -456,18 +459,19 @@ def add_param_writer_object(name, base_state, typ, var_type = "", var_index = No
def add_param_write(current, base_state, vector = False, root_node = False):
typ = current["type"]
res = ""
name = get_member_name(current["name"])
if is_basic_type(typ):
res = res + add_param_writer_basic_type(current["name"], base_state, typ)
res = res + add_param_writer_basic_type(name, base_state, typ)
elif is_optional(typ):
res = res + Template(reindent(4, """
after_${basestate}__$name<Output> skip_$name() && {
serialize(_out, false);
return { _out, std::move(_state) };
}""")).substitute({'type': param_type(typ), 'name': current["name"], 'basestate' : base_state})
}""")).substitute({'type': param_type(typ), 'name': name, 'basestate' : base_state})
if is_basic_type(typ[1][0]):
res = res + add_param_writer_basic_type(current["name"], base_state, typ[1][0], "", "true")
res = res + add_param_writer_basic_type(name, base_state, typ[1][0], "", "true")
elif is_local_type(typ[1][0]):
res = res + add_param_writer_object(current["name"], base_state[0][1], typ, "", "true")
res = res + add_param_writer_object(name, base_state[0][1], typ, "", "true")
else:
print("non supported optional type ", type[0][1])
elif is_vector(typ):
@@ -482,18 +486,18 @@ def add_param_write(current, base_state, vector = False, root_node = False):
$set
return { _out, std::move(_state) };
}
""").substitute({'type': param_type(typ), 'name': current["name"], 'basestate' : base_state, 'set' : set_size})
""").substitute({'type': param_type(typ), 'name': name, 'basestate' : base_state, 'set' : set_size})
elif is_local_type(typ):
res = res + add_param_writer_object(current["name"], base_state, typ)
res = res + add_param_writer_object(name, base_state, typ)
elif is_variant(typ):
for idx, p in enumerate(typ[1]):
if is_basic_type(p):
varient_type = param_type(p)
res = res + add_param_writer_basic_type(current["name"], base_state, varient_type,"_" + varient_type, idx, root_node)
res = res + add_param_writer_basic_type(name, base_state, varient_type,"_" + varient_type, idx, root_node)
elif is_variant(p):
res = res + add_param_writer_object(current["name"], base_state, p, '_' + "variant", idx, root_node)
res = res + add_param_writer_object(name, base_state, p, '_' + "variant", idx, root_node)
elif is_local_type(p):
res = res + add_param_writer_object(current["name"], base_state, p, '_' + param_type(p), idx, root_node)
res = res + add_param_writer_object(name, base_state, p, '_' + param_type(p), idx, root_node)
else:
print ("something is wrong with type", typ)
return res;
@@ -658,7 +662,7 @@ def handle_visitors_nodes(info, hout, variant_node = False, clases = []):
if not members:
add_node(hout, base_state_name, None, base_state_name, prefix, parents, add_end_method(parents, current_name, variant_node, clases), False, is_final(cls))
return
add_node(hout, base_state_name + "__" + members[-1]["name"], members[-1]["type"], base_state_name, "after_", base_state_name, add_end_method(parents, current_name, variant_node, clases))
add_node(hout, base_state_name + "__" + get_member_name(members[-1]["name"]), members[-1]["type"], base_state_name, "after_", base_state_name, add_end_method(parents, current_name, variant_node, clases))
# Create writer and reader for include class
if not variant_node:
for member in get_dependency(cls):
@@ -666,9 +670,9 @@ def handle_visitors_nodes(info, hout, variant_node = False, clases = []):
for ind in reversed(range(1, len(members))):
member = members[ind]
add_nodes_when_needed(hout, info, member, base_state_name, parents, member_classes)
variant_state = base_state_name + "__" + member["name"] if is_variant(member["type"]) else base_state_name
variant_state = base_state_name + "__" + get_member_name(member["name"]) if is_variant(member["type"]) else base_state_name
is_param_vector = is_vector(member["type"]) and is_basic_type(member["type"][1][0])
add_node(hout, base_state_name + "__" + members[ind - 1]["name"], member["type"], variant_state, "after_", base_state_name, add_param_write(member, base_state_name), False)
add_node(hout, base_state_name + "__" + get_member_name(members[ind - 1]["name"]), member["type"], variant_state, "after_", base_state_name, add_param_write(member, base_state_name), False)
member = members[0]
is_param_vector = is_vector(member["type"]) and is_basic_type(member["type"][1][0])
add_nodes_when_needed(hout, info, member, base_state_name, parents, member_classes)
@@ -790,7 +794,7 @@ def add_view(hout, info):
return deserialize(in, boost::type<$type>());
});
}
""")).substitute({'name' : m["name"], 'type' : full_type, 'skip' : skip}))
""")).substitute({'name' : get_member_name(m["name"]), 'type' : full_type, 'skip' : skip}))
skip = skip + Template("\n ser::skip(in, boost::type<${type}>());").substitute({'type': full_type})

View File

@@ -19,7 +19,7 @@
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
class commitlog_entry {
class commitlog_entry [[writable]] {
std::experimental::optional<column_mapping> mapping();
frozen_mutation mutation();
};
};

View File

@@ -65,17 +65,15 @@ future<> memtable::clear_gently() noexcept {
auto t = std::make_unique<seastar::thread>(attr, [this] {
auto& alloc = allocator();
// entries can no longer be moved after unlink_leftmost_without_rebalance()
// so need to disable compaction.
logalloc::reclaim_lock rl(*this);
auto p = std::move(partitions);
while (!p.empty()) {
auto batch_size = std::min<size_t>(p.size(), 32);
auto dirty_before = dirty_size();
with_allocator(alloc, [&] () noexcept {
while (batch_size--) {
alloc.destroy(p.unlink_leftmost_without_rebalance());
p.erase_and_dispose(p.begin(), [&] (auto e) {
alloc.destroy(e);
});
}
});
remove_flushed_memory(dirty_before - dirty_size());
@@ -205,19 +203,23 @@ protected:
, _range(&range)
{ }
memtable_entry* fetch_next_entry() {
memtable_entry* fetch_entry() {
update_iterators();
if (_i == _end) {
return nullptr;
} else {
memtable_entry& e = *_i;
++_i;
_last = e.key();
_memtable->upgrade_entry(e);
return &e;
}
}
void advance() {
memtable_entry& e = *_i;
_last = e.key();
++_i;
}
logalloc::allocating_section& read_section() {
return _memtable->_read_section;
}
@@ -244,9 +246,10 @@ protected:
mutation_reader delegate_reader(const dht::partition_range& delegate,
const query::partition_slice& slice,
const io_priority_class& pc) {
const io_priority_class& pc,
mutation_reader::forwarding fwd_mr) {
auto ret = make_mutation_reader<sstable_range_wrapping_reader>(
_memtable->_sstable, _schema, delegate, slice, pc);
_memtable->_sstable, _schema, delegate, slice, pc, fwd_mr);
_memtable = {};
_last = {};
return ret;
@@ -264,15 +267,18 @@ class scanning_reader final: public iterator_reader {
mutation_reader _delegate;
const io_priority_class& _pc;
const query::partition_slice& _slice;
mutation_reader::forwarding _fwd_mr;
public:
scanning_reader(schema_ptr s,
lw_shared_ptr<memtable> m,
const dht::partition_range& range,
const query::partition_slice& slice,
const io_priority_class& pc)
const io_priority_class& pc,
mutation_reader::forwarding fwd_mr)
: iterator_reader(std::move(s), std::move(m), range)
, _pc(pc)
, _slice(slice)
, _fwd_mr(fwd_mr)
{ }
virtual future<streamed_mutation_opt> operator()() override {
@@ -283,18 +289,22 @@ public:
// FIXME: Use cache. See column_family::make_reader().
_delegate_range = get_delegate_range();
if (_delegate_range) {
_delegate = delegate_reader(*_delegate_range, _slice, _pc);
_delegate = delegate_reader(*_delegate_range, _slice, _pc, _fwd_mr);
return _delegate();
}
logalloc::reclaim_lock _(region());
managed_bytes::linearization_context_guard lcg;
memtable_entry* e = fetch_next_entry();
if (!e) {
return make_ready_future<streamed_mutation_opt>(stdx::nullopt);
} else {
return make_ready_future<streamed_mutation_opt>(e->read(mtbl(), schema(), _slice));
}
return read_section()(region(), [&] {
return with_linearized_managed_bytes([&] {
memtable_entry* e = fetch_entry();
if (!e) {
return make_ready_future<streamed_mutation_opt>(stdx::nullopt);
} else {
auto ret = make_ready_future<streamed_mutation_opt>(e->read(mtbl(), schema(), _slice));
advance();
return ret;
}
});
});
}
};
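The fetch_entry()/advance() split above separates looking at the current entry from committing the step past it; since read_section() may retry its lambda, advancing only after a successful read keeps a retry from skipping entries (my reading of the change — the names below are hypothetical). A minimal sketch of the pattern:

```cpp
#include <vector>

// Sketch of fetch-then-advance: peek() returns the current element without
// consuming it, so a failed or retried read sees the same element again;
// advance() commits the step only once the read has succeeded.
struct cursor {
    const std::vector<int>* v;
    std::size_t i = 0;
    const int* peek() const { return i < v->size() ? &(*v)[i] : nullptr; }
    void advance() { ++i; }
};

int read_one(cursor& c) {
    const int* e = c.peek();
    if (!e) {
        return -1;                 // end of data
    }
    int result = *e;               // may throw in the real code; not yet advanced
    c.advance();                   // commit only after the read succeeded
    return result;
}
```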
@@ -391,19 +401,24 @@ public:
flush_reader& operator=(const flush_reader&) = delete;
virtual future<streamed_mutation_opt> operator()() override {
logalloc::reclaim_lock _(region());
managed_bytes::linearization_context_guard lcg;
memtable_entry* e = fetch_next_entry();
if (!e) {
return make_ready_future<streamed_mutation_opt>(stdx::nullopt);
} else {
auto cr = query::clustering_key_filter_ranges::get_ranges(*schema(), query::full_slice, e->key().key());
auto snp = e->partition().read(schema());
auto mpsr = make_partition_snapshot_reader<partition_snapshot_accounter>(schema(), e->key(), std::move(cr), snp, region(), read_section(), mtbl(), _flushed_memory);
_flushed_memory.account_component(*e);
_flushed_memory.account_component(*snp);
return make_ready_future<streamed_mutation_opt>(std::move(mpsr));
}
return read_section()(region(), [&] {
return with_linearized_managed_bytes([&] {
memtable_entry* e = fetch_entry();
if (!e) {
return make_ready_future<streamed_mutation_opt>(stdx::nullopt);
} else {
auto cr = query::clustering_key_filter_ranges::get_ranges(*schema(), query::full_slice, e->key().key());
auto snp = e->partition().read(schema());
auto mpsr = make_partition_snapshot_reader<partition_snapshot_accounter>(schema(), e->key(), std::move(cr),
snp, region(), read_section(), mtbl(), _flushed_memory);
_flushed_memory.account_component(*e);
_flushed_memory.account_component(*snp);
auto ret = make_ready_future<streamed_mutation_opt>(std::move(mpsr));
advance();
return ret;
}
});
});
}
};
@@ -411,7 +426,9 @@ mutation_reader
memtable::make_reader(schema_ptr s,
const dht::partition_range& range,
const query::partition_slice& slice,
const io_priority_class& pc) {
const io_priority_class& pc,
tracing::trace_state_ptr trace_state_ptr,
mutation_reader::forwarding fwd_mr) {
if (query::is_single_partition(range)) {
const query::ring_position& pos = range.start()->value();
return _read_section(*this, [&] {
@@ -425,7 +442,7 @@ memtable::make_reader(schema_ptr s,
}
});
} else {
return make_mutation_reader<scanning_reader>(std::move(s), shared_from_this(), range, slice, pc);
return make_mutation_reader<scanning_reader>(std::move(s), shared_from_this(), range, slice, pc, fwd_mr);
}
}
@@ -434,7 +451,7 @@ memtable::make_flush_reader(schema_ptr s, const io_priority_class& pc) {
if (group()) {
return make_mutation_reader<flush_reader>(std::move(s), shared_from_this());
} else {
return make_mutation_reader<scanning_reader>(std::move(s), shared_from_this(), query::full_partition_range, query::full_slice, pc);
return make_mutation_reader<scanning_reader>(std::move(s), shared_from_this(), query::full_partition_range, query::full_slice, pc, mutation_reader::forwarding::no);
}
}
@@ -486,8 +503,13 @@ logalloc::occupancy_stats memtable::occupancy() const {
}
mutation_source memtable::as_data_source() {
return mutation_source([mt = shared_from_this()] (schema_ptr s, const dht::partition_range& range) {
return mt->make_reader(std::move(s), range);
return mutation_source([mt = shared_from_this()] (schema_ptr s,
const dht::partition_range& range,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state,
mutation_reader::forwarding fwd_mr) {
return mt->make_reader(std::move(s), range, slice, pc, std::move(trace_state), fwd_mr);
});
}

View File

@@ -173,7 +173,9 @@ public:
mutation_reader make_reader(schema_ptr,
const dht::partition_range& range = query::full_partition_range,
const query::partition_slice& slice = query::full_slice,
const io_priority_class& pc = default_priority_class());
const io_priority_class& pc = default_priority_class(),
tracing::trace_state_ptr trace_state_ptr = nullptr,
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::no);
mutation_reader make_flush_reader(schema_ptr, const io_priority_class& pc);

View File

@@ -274,7 +274,13 @@ void messaging_service::start_listen() {
if (listen_to_bc) {
_server_tls[1] = listen(utils::fb_utilities::get_broadcast_address());
}
}
// Do this on just cpu 0, to avoid duplicate logs.
if (engine().cpu_id() == 0) {
if (_server_tls[0]) {
logger.info("Starting Encrypted Messaging Service on SSL port {}", _ssl_port);
}
logger.info("Starting Messaging Service on port {}", _port);
}
}
@@ -308,14 +314,6 @@ messaging_service::messaging_service(gms::inet_address ip
if (listen_now) {
start_listen();
}
// Do this on just cpu 0, to avoid duplicate logs.
if (engine().cpu_id() == 0) {
if (_server_tls[0]) {
logger.info("Starting Encrypted Messaging Service on SSL port {}", _ssl_port);
}
logger.info("Starting Messaging Service on port {}", _port);
}
}
msg_addr messaging_service::get_source(const rpc::client_info& cinfo) {

View File

@@ -123,7 +123,7 @@ public:
uint32_t partition_limit, CompactedMutationsConsumer consumer)
: _schema(s)
, _query_time(query_time)
, _gc_before(query_time - s.gc_grace_seconds())
, _gc_before(saturating_subtract(query_time, s.gc_grace_seconds()))
, _can_gc(always_gc)
, _slice(slice)
, _row_limit(limit)
@@ -139,7 +139,7 @@ public:
std::function<api::timestamp_type(const dht::decorated_key&)> get_max_purgeable)
: _schema(s)
, _query_time(compaction_time)
, _gc_before(_query_time - s.gc_grace_seconds())
, _gc_before(saturating_subtract(_query_time, s.gc_grace_seconds()))
, _get_max_purgeable(std::move(get_max_purgeable))
, _can_gc([this] (tombstone t) { return can_gc(t); })
, _slice(query::full_slice)

View File
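Both constructors now compute gc_before with saturating_subtract() instead of plain subtraction, presumably because query_time - gc_grace_seconds() can underflow for very small time points. A minimal sketch of the idea over 64-bit integers (the helper name mirrors the diff; this implementation is an assumption):

```cpp
#include <cstdint>
#include <limits>

// Clamp at the minimum representable value instead of wrapping on underflow.
// Assumes y >= 0, as a gc grace period is non-negative.
int64_t saturating_subtract(int64_t x, int64_t y) {
    if (x < std::numeric_limits<int64_t>::min() + y) {
        return std::numeric_limits<int64_t>::min();
    }
    return x - y;
}
```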

@@ -1183,7 +1183,7 @@ uint32_t mutation_partition::do_compact(const schema& s,
{
assert(row_limit > 0);
auto gc_before = query_time - s.gc_grace_seconds();
auto gc_before = saturating_subtract(query_time, s.gc_grace_seconds());
auto should_purge_tombstone = [&] (const tombstone& t) {
return t.deletion_time < gc_before && can_gc(t);
@@ -1526,12 +1526,19 @@ bool row::compact_and_expire(const schema& s, column_kind kind, tombstone tomb,
const column_definition& def = s.column_at(kind, id);
if (def.is_atomic()) {
atomic_cell_view cell = c.as_atomic_cell();
auto can_erase_cell = [&] {
return cell.deletion_time() < gc_before && can_gc(tombstone(cell.timestamp(), cell.deletion_time()));
};
if (cell.is_covered_by(tomb, def.is_counter())) {
erase = true;
} else if (cell.has_expired(query_time)) {
c = atomic_cell::make_dead(cell.timestamp(), cell.deletion_time());
erase = can_erase_cell();
if (!erase) {
c = atomic_cell::make_dead(cell.timestamp(), cell.deletion_time());
}
} else if (!cell.is_live()) {
erase = cell.deletion_time() < gc_before && can_gc(tombstone(cell.timestamp(), cell.deletion_time()));
erase = can_erase_cell();
} else {
any_live |= true;
}

View File

@@ -63,8 +63,14 @@ auto write_counter_cell(Writer&& writer, const atomic_cell& c)
counter_cell_view ccv(c);
auto shards = std::move(value).start_value_counter_cell_full()
.start_shards();
for (auto csv : ccv.shards()) {
shards.add_shards(counter_shard(csv));
if (service::get_local_storage_service().cluster_supports_correct_counter_order()) {
for (auto csv : ccv.shards()) {
shards.add_shards(counter_shard(csv));
}
} else {
for (auto& cs : ccv.shards_compatible_with_1_7_4()) {
shards.add_shards(cs);
}
}
return std::move(shards).end_shards().end_counter_cell_full();
}

View File

@@ -73,8 +73,9 @@ atomic_cell read_atomic_cell(atomic_cell_variant cv)
// TODO: a lot of copying for something called view
counter_cell_builder ccb; // we know the final number of shards
for (auto csv : ccv.shards()) {
ccb.add_shard(counter_shard(csv));
ccb.add_maybe_unsorted_shard(counter_shard(csv));
}
ccb.sort_and_remove_duplicates();
return ccb.build(_created_at);
}
atomic_cell operator()(ser::counter_cell_update_view& ccv) const {

View File
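The reader side now feeds shards through add_maybe_unsorted_shard() and a final sort_and_remove_duplicates(). A plausible reduction of that normalization step, assuming shards are keyed by id and the newest logical clock wins for duplicates (both assumptions, not taken from the diff):

```cpp
#include <algorithm>
#include <vector>

struct shard { int id; long clock; };

// Accept shards in any order, then sort by id; within an id, sort by clock
// descending so std::unique (which keeps the first of each run) retains the
// shard with the newest clock.
std::vector<shard> normalize(std::vector<shard> v) {
    std::sort(v.begin(), v.end(), [](const shard& a, const shard& b) {
        return a.id != b.id ? a.id < b.id : a.clock > b.clock;
    });
    v.erase(std::unique(v.begin(), v.end(), [](const shard& a, const shard& b) {
        return a.id == b.id;
    }), v.end());
    return v;
}
```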

@@ -282,10 +282,12 @@ private:
public:
multi_range_mutation_reader(schema_ptr s, mutation_source source, const ranges_vector& ranges,
const query::partition_slice& slice, const io_priority_class& pc,
tracing::trace_state_ptr trace_state)
tracing::trace_state_ptr trace_state,
mutation_reader::forwarding fwd_mr)
: _ranges(ranges)
, _current_range(_ranges.begin())
, _reader(source(s, *_current_range, slice, pc, trace_state))
, _reader(source(s, *_current_range, slice, pc, trace_state,
_ranges.size() > 1 ? mutation_reader::forwarding::yes : fwd_mr))
{
}
@@ -317,8 +319,9 @@ public:
mutation_reader
make_multi_range_reader(schema_ptr s, mutation_source source, const dht::partition_range_vector& ranges,
const query::partition_slice& slice, const io_priority_class& pc,
tracing::trace_state_ptr trace_state)
tracing::trace_state_ptr trace_state,
mutation_reader::forwarding fwd_mr)
{
return make_mutation_reader<multi_range_mutation_reader>(std::move(s), std::move(source), ranges,
slice, pc, std::move(trace_state));
slice, pc, std::move(trace_state), fwd_mr);
}

View File

@@ -50,6 +50,20 @@
// not be the optimal object to use here.
class mutation_reader final {
public:
// mutation_reader::forwarding determines whether fast_forward_to() may
// be used on the mutation reader to change the partition range being
// read. Enabling forwarding also changes read policy: forwarding::no
// means we will stop reading from disk at the end of the given range,
// but with forwarding::yes we may read ahead, anticipating the user to
// make a small skip with fast_forward_to() and continuing to read.
//
// Note that mutation_reader::forwarding is similarly name but different
// from streamed_mutation::forwarding - the former is about skipping to
// a different partition range, while the latter is about skipping
// inside a large partition.
class forwarding_tag;
using forwarding = bool_class<forwarding_tag>;
class impl {
public:
virtual ~impl() {}
@@ -253,34 +267,45 @@ future<> consume(mutation_reader& reader, Consumer consumer) {
class mutation_source {
using partition_range = const dht::partition_range&;
using io_priority = const io_priority_class&;
std::function<mutation_reader(schema_ptr, partition_range, const query::partition_slice&, io_priority, tracing::trace_state_ptr)> _fn;
using func_type = std::function<mutation_reader(schema_ptr,
partition_range,
const query::partition_slice&,
io_priority,
tracing::trace_state_ptr,
mutation_reader::forwarding
)>;
// We could have our own version of std::function<> that is nothrow
// move constructible and save some indirection and allocation.
// Probably not worth the effort though.
std::unique_ptr<func_type> _fn;
private:
mutation_source() = default;
explicit operator bool() const { return bool(_fn); }
friend class optimized_optional<mutation_source>;
public:
mutation_source(std::function<mutation_reader(schema_ptr, partition_range, const query::partition_slice&, io_priority, tracing::trace_state_ptr)> fn)
: _fn(std::move(fn)) {}
mutation_source(std::function<mutation_reader(schema_ptr, partition_range, const query::partition_slice&, io_priority)> fn)
: _fn([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice& slice, io_priority pc, tracing::trace_state_ptr) {
return fn(s, range, slice, pc);
}) {}
mutation_source(func_type fn) : _fn(std::make_unique<func_type>(std::move(fn))) {}
mutation_source(std::function<mutation_reader(schema_ptr, partition_range, const query::partition_slice&)> fn)
: _fn([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice& slice, io_priority, tracing::trace_state_ptr) {
: _fn(std::make_unique<func_type>([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice& slice, io_priority, tracing::trace_state_ptr, mutation_reader::forwarding) {
return fn(s, range, slice);
}) {}
})) {}
mutation_source(std::function<mutation_reader(schema_ptr, partition_range range)> fn)
: _fn([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice&, io_priority, tracing::trace_state_ptr) {
: _fn(std::make_unique<func_type>([fn = std::move(fn)] (schema_ptr s, partition_range range, const query::partition_slice&, io_priority, tracing::trace_state_ptr, mutation_reader::forwarding) {
return fn(s, range);
}) {}
mutation_reader operator()(schema_ptr s, partition_range range, const query::partition_slice& slice, io_priority pc, tracing::trace_state_ptr trace_state) const {
return _fn(std::move(s), range, slice, pc, std::move(trace_state));
})) {}
mutation_source(const mutation_source& other)
: _fn(std::make_unique<func_type>(*other._fn)) { }
mutation_source& operator=(const mutation_source& other) {
_fn = std::make_unique<func_type>(*other._fn);
return *this;
}
mutation_reader operator()(schema_ptr s, partition_range range, const query::partition_slice& slice, io_priority pc) const {
return _fn(std::move(s), range, slice, pc, nullptr);
}
mutation_reader operator()(schema_ptr s, partition_range range, const query::partition_slice& slice) const {
return _fn(std::move(s), range, slice, default_priority_class(), nullptr);
}
mutation_reader operator()(schema_ptr s, partition_range range) const {
return _fn(std::move(s), range, query::full_slice, default_priority_class(), nullptr);
mutation_source(mutation_source&&) = default;
mutation_source& operator=(mutation_source&&) = default;
mutation_reader operator()(schema_ptr s, partition_range range,
const query::partition_slice& slice = query::full_slice,
io_priority pc = default_priority_class(),
tracing::trace_state_ptr trace_state = nullptr,
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const {
return (*_fn)(std::move(s), range, slice, pc, trace_state, fwd_mr);
}
};
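The comment above notes that std::function is not guaranteed nothrow move constructible (pre-C++20), which is why _fn is now held through a unique_ptr: the wrapper becomes noexcept-movable at the cost of one allocation and a pointer indirection. A standalone sketch of the trade-off (type names hypothetical):

```cpp
#include <functional>
#include <memory>
#include <type_traits>
#include <utility>

// unique_ptr's move constructor is noexcept, so the implicitly generated
// move constructor of fn_source is noexcept regardless of std::function's.
struct fn_source {
    std::unique_ptr<std::function<int(int)>> fn;
    explicit fn_source(std::function<int(int)> f)
        : fn(std::make_unique<std::function<int(int)>>(std::move(f))) {}
    int operator()(int x) const { return (*fn)(x); }
};

static_assert(std::is_nothrow_move_constructible<fn_source>::value,
              "holding the function through unique_ptr makes moves noexcept");
```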
@@ -430,4 +455,5 @@ stable_flattened_mutations_consumer<FlattenedConsumer> make_stable_flattened_mut
mutation_reader
make_multi_range_reader(schema_ptr s, mutation_source source, const dht::partition_range_vector& ranges,
const query::partition_slice& slice, const io_priority_class& pc = default_priority_class(),
tracing::trace_state_ptr trace_state = nullptr);
tracing::trace_state_ptr trace_state = nullptr,
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes);

View File
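mutation_reader::forwarding above is built with seastar's bool_class idiom: a strong typedef over bool, so the two forwarding flags get distinct types and cannot be swapped at a call site. A minimal sketch of the idiom (simplified relative to seastar's util/bool_class.hh):

```cpp
#include <type_traits>

// Each flag gets its own tag type, so two flags with the same bool payload
// are still incompatible types.
template <typename Tag>
class bool_class {
    bool _v;
public:
    constexpr explicit bool_class(bool v) noexcept : _v(v) {}
    constexpr explicit operator bool() const noexcept { return _v; }
    static const bool_class yes;
    static const bool_class no;
};
template <typename Tag> const bool_class<Tag> bool_class<Tag>::yes{true};
template <typename Tag> const bool_class<Tag> bool_class<Tag>::no{false};

struct mr_forwarding_tag;   // partition-range skipping
struct sm_forwarding_tag;   // intra-partition skipping
using mr_forwarding = bool_class<mr_forwarding_tag>;
using sm_forwarding = bool_class<sm_forwarding_tag>;

static_assert(!std::is_same<mr_forwarding, sm_forwarding>::value,
              "same payload, distinct types");
```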

@@ -345,7 +345,7 @@ public:
: _w(std::move(w))
, _row_count(c)
, _short_read(sr)
, _memory_tracker(std::move(_memory_tracker))
, _memory_tracker(std::move(memory_tracker))
, _partition_count(pc)
{
w.reduce_chunk_count();

View File

@@ -601,13 +601,13 @@ private:
struct built_in_ : std_ {};
template<typename Range, typename LessComparator,
typename = decltype(&std::remove_reference<Range>::type::lower_bound)>
typename = decltype(std::declval<Range>().lower_bound(std::declval<T>(), std::declval<LessComparator>()))>
typename std::remove_reference<Range>::type::const_iterator do_lower_bound(const T& value, Range&& r, LessComparator&& cmp, built_in_) const {
return r.lower_bound(value, std::forward<LessComparator>(cmp));
}
template<typename Range, typename LessComparator,
typename = decltype(&std::remove_reference<Range>::type::upper_bound)>
typename = decltype(std::declval<Range>().upper_bound(std::declval<T>(), std::declval<LessComparator>()))>
typename std::remove_reference<Range>::type::const_iterator do_upper_bound(const T& value, Range&& r, LessComparator&& cmp, built_in_) const {
return r.upper_bound(value, std::forward<LessComparator>(cmp));
}
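The reason for switching the SFINAE guard, sketched: decltype(&Range::lower_bound) is ill-formed when lower_bound is overloaded (the member address is ambiguous), so detection silently fails for exactly those containers. Probing the call expression itself, as the fixed code does, only requires that the specific invocation be well-formed. A self-contained illustration (toy types, not from the diff):

```cpp
#include <functional>
#include <utility>
#include <vector>

struct tree {
    int* lower_bound(int v);                           // overload 1
    template <typename Less>
    int* lower_bound(int v, Less) { return nullptr; }  // overload 2
};

// Detect a two-argument lower_bound(value, comparator) via expression SFINAE;
// the ambiguity of &tree::lower_bound never comes into play.
template <typename R, typename Less,
          typename = decltype(std::declval<R&>().lower_bound(0, std::declval<Less>()))>
constexpr bool has_comparator_lower_bound(int) { return true; }
template <typename R, typename Less>
constexpr bool has_comparator_lower_bound(...) { return false; }

static_assert(has_comparator_lower_bound<tree, std::less<int>>(0), "");
static_assert(!has_comparator_lower_bound<std::vector<int>, std::less<int>>(0), "");
```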
@@ -649,6 +649,21 @@ public:
return boost::make_iterator_range(lower_bound(range, cmp), upper_bound(range, cmp));
}
// Returns the intersection between this range and other.
template<typename Comparator>
stdx::optional<nonwrapping_range> intersection(const nonwrapping_range& other, Comparator&& cmp) const {
auto p = std::minmax(_range, other._range, [&cmp] (auto&& a, auto&& b) {
return wrapping_range<T>::less_than(a.start_bound(), b.start_bound(), cmp);
});
if (wrapping_range<T>::greater_than_or_equal(p.first.end_bound(), p.second.start_bound(), cmp)) {
auto end = std::min(p.first.end_bound(), p.second.end_bound(), [&cmp] (auto&& a, auto&& b) {
return !wrapping_range<T>::greater_than_or_equal(a, b, cmp);
});
return nonwrapping_range(p.second.start(), end.b);
}
return {};
}
template<typename U>
friend std::ostream& operator<<(std::ostream& out, const nonwrapping_range<U>& r);
};
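The new intersection() boils down to the classic interval rule: take the later start and the earlier end, and the intersection is empty when they cross. Sketched over closed integer intervals, with nonwrapping_range's comparator and bound machinery elided:

```cpp
#include <algorithm>
#include <optional>
#include <utility>

using interval = std::pair<int, int>;  // closed [first, second]

// Later start, earlier end; empty when start > end.
std::optional<interval> intersection(interval a, interval b) {
    int start = std::max(a.first, b.first);
    int end = std::min(a.second, b.second);
    if (start <= end) {
        return interval{start, end};
    }
    return std::nullopt;
}
```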

repair/range_split.hh (new file, 76 lines)
View File

@@ -0,0 +1,76 @@
/*
* Copyright (C) 2017 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <stack>
#include "dht/i_partitioner.hh"
// range_splitter(r, N, K) is a helper for splitting a given token_range r of
// estimated size N into many small ranges of size K, and later iterating
// over those small ranges once with the has_next() and next() methods.
// This implementation assumes only the availability of a range::midpoint()
// operation, and as a result creates ranges with size between K/2 and K.
// Moreover, it has memory requirement log(N). With more general arithmetic
// support over tokens, we could get exactly K and O(1) memory.
class range_splitter {
std::stack<std::pair<::dht::token_range, float>> _stack;
uint64_t _desired;
public:
range_splitter(::dht::token_range r, uint64_t N, uint64_t K) {
_stack.push({r, N});
_desired = K;
}
bool has_next() const {
return !_stack.empty();
}
::dht::token_range next() {
// If the head range's estimated size is small enough, return it.
// Otherwise split it to two halves, push the second half on the
// stack, and repeat with the first half. May need to do this more
// than once (up to log(N/K) times) until we have one range small
// enough to return.
assert(!_stack.empty());
auto range = _stack.top().first;
auto size = _stack.top().second;
_stack.pop();
while (size > _desired) {
// The use of minimum_token() here twice is not a typo - because wrap-
// around token ranges are supported by midpoint(), the beyond-maximum
// token can also be represented by minimum_token().
auto midpoint = dht::global_partitioner().midpoint(
range.start() ? range.start()->value() : dht::minimum_token(),
range.end() ? range.end()->value() : dht::minimum_token());
// This shouldn't happen, but if the range included just one token, we
// can't split further (split() may actually fail with assertion failure)
if ((range.start() && midpoint == range.start()->value()) ||
(range.end() && midpoint == range.end()->value())) {
return range;
}
auto halves = range.split(midpoint, dht::token_comparator());
_stack.push({halves.second, size / 2.0});
range = halves.first;
size /= 2.0;
}
return range;
}
};

View File

@@ -20,6 +20,7 @@
*/
#include "repair.hh"
#include "range_split.hh"
#include "streaming/stream_plan.hh"
#include "streaming/stream_state.hh"
@@ -40,11 +41,6 @@
static logging::logger logger("repair");
struct failed_range {
sstring cf;
::dht::token_range range;
};
class repair_info {
public:
seastar::sharded<database>& db;
@@ -52,15 +48,25 @@ public:
dht::token_range_vector ranges;
std::vector<sstring> cfs;
int id;
shard_id shard;
std::vector<sstring> data_centers;
std::vector<sstring> hosts;
std::vector<failed_range> failed_ranges;
streaming::stream_plan sp_in;
streaming::stream_plan sp_out;
size_t nr_failed_ranges = 0;
// Map of peer -> <cf, ranges>
std::unordered_map<gms::inet_address, std::unordered_map<sstring, dht::token_range_vector>> ranges_need_repair_in;
std::unordered_map<gms::inet_address, std::unordered_map<sstring, dht::token_range_vector>> ranges_need_repair_out;
// FIXME: this "100" needs to be a parameter.
uint64_t target_partitions = 100;
// FIXME: this "10 * 1024 * 1024" needs to be a parameter.
size_t sub_ranges_max = 10 * 1024 * 1024;
// This affects how many ranges we put in a stream plan. The more ranges, the
// more memory we use to store them in memory. However, it can reduce the
// total number of stream_plan we use for the repair.
size_t sub_ranges_to_stream = 10 * 1024;
size_t sp_index = 0;
size_t current_sub_ranges_nr_in = 0;
size_t current_sub_ranges_nr_out = 0;
int ranges_index = 0;
// Only allow one stream_plan in flight
semaphore sp_parallelism_semaphore{1};
public:
repair_info(seastar::sharded<database>& db_,
const sstring& keyspace_,
@@ -74,42 +80,81 @@ public:
, ranges(ranges_)
, cfs(cfs_)
, id(id_)
, shard(engine().cpu_id())
, data_centers(data_centers_)
, hosts(hosts_)
, sp_in(streaming::stream_plan(sprint("repair-in-%d", id)))
, sp_out(streaming::stream_plan(sprint("repair-out-%d", id))) {
, hosts(hosts_) {
}
future<> do_streaming() {
return sp_in.execute().discard_result().then([this] {
return sp_out.execute().discard_result();
size_t ranges_in = 0;
size_t ranges_out = 0;
auto sp_in = make_lw_shared<streaming::stream_plan>(sprint("repair-in-id-%d-shard-%d-index-%d", id, shard, sp_index));
auto sp_out = make_lw_shared<streaming::stream_plan>(sprint("repair-out-id-%d-shard-%d-index-%d", id, shard, sp_index));
for (auto& x : ranges_need_repair_in) {
auto& peer = x.first;
for (auto& y : x.second) {
auto& cf = y.first;
auto& stream_ranges = y.second;
ranges_in += stream_ranges.size();
sp_in->request_ranges(peer, keyspace, std::move(stream_ranges), {cf});
}
}
ranges_need_repair_in.clear();
current_sub_ranges_nr_in = 0;
for (auto& x : ranges_need_repair_out) {
auto& peer = x.first;
for (auto& y : x.second) {
auto& cf = y.first;
auto& stream_ranges = y.second;
ranges_out += stream_ranges.size();
sp_out->transfer_ranges(peer, keyspace, std::move(stream_ranges), {cf});
}
}
ranges_need_repair_out.clear();
current_sub_ranges_nr_out = 0;
if (ranges_in || ranges_out) {
logger.info("Start streaming for repair id={}, shard={}, index={}, ranges_in={}, ranges_out={}", id, shard, sp_index, ranges_in, ranges_out);
}
sp_index++;
return sp_in->execute().discard_result().then([sp_in, sp_out] {
return sp_out->execute().discard_result();
}).handle_exception([] (auto ep) {
logger.warn("repair's stream failed: {}", ep);
return make_exception_future(ep);
});
}
bool check_failed_ranges() {
if (failed_ranges.empty()) {
logger.info("repair {} completed successfully", id);
return true;
void check_failed_ranges() {
if (nr_failed_ranges) {
logger.info("repair {} on shard {} failed - {} ranges failed", id, shard, nr_failed_ranges);
throw std::runtime_error(sprint("repair %d on shard %d failed to do checksum for %d sub ranges", id, shard, nr_failed_ranges));
} else {
for (auto& frange: failed_ranges) {
logger.debug("repair cf {} range {} failed", frange.cf, frange.range);
}
logger.info("repair {} failed - {} ranges failed", id, failed_ranges.size());
return false;
logger.info("repair {} on shard {} completed successfully", id, shard);
}
}
void request_transfer_ranges(const sstring& cf,
future<> request_transfer_ranges(const sstring& cf,
const ::dht::token_range& range,
const std::vector<gms::inet_address>& neighbors_in,
const std::vector<gms::inet_address>& neighbors_out) {
for (const auto& peer : neighbors_in) {
sp_in.request_ranges(peer, keyspace, {range}, {cf});
}
for (const auto& peer : neighbors_out) {
sp_out.transfer_ranges(peer, keyspace, {range}, {cf});
}
logger.debug("Add cf {}, range {}, current_sub_ranges_nr_in {}, current_sub_ranges_nr_out {}", cf, range, current_sub_ranges_nr_in, current_sub_ranges_nr_out);
return sp_parallelism_semaphore.wait(1).then([this, cf, range, neighbors_in, neighbors_out] {
for (const auto& peer : neighbors_in) {
ranges_need_repair_in[peer][cf].emplace_back(range);
current_sub_ranges_nr_in++;
}
for (const auto& peer : neighbors_out) {
ranges_need_repair_out[peer][cf].emplace_back(range);
current_sub_ranges_nr_out++;
}
if (current_sub_ranges_nr_in >= sub_ranges_to_stream || current_sub_ranges_nr_out >= sub_ranges_to_stream) {
return do_streaming();
}
return make_ready_future<>();
}).finally([this] {
sp_parallelism_semaphore.signal(1);
});
}
};
@@ -268,7 +313,7 @@ static std::vector<gms::inet_address> get_neighbors(database& db,
// be queried about more than once (FIXME: reconsider this. But note that
// failed repairs should be rare anyway).
// This object is not thread safe, and must be used by only one cpu.
static class {
class tracker {
private:
// Each repair_start() call returns a unique int which the user can later
// use to follow the status of this repair with repair_status().
@@ -281,7 +326,11 @@ private:
std::unordered_map<int, repair_status> _status;
// Used to allow shutting down repairs in progress, and waiting for them.
seastar::gate _gate;
// Set when the repair service is being shut down
std::atomic_bool _shutdown alignas(64);
public:
tracker() : _shutdown(false) {
}
void start(int id) {
_gate.enter();
_status[id] = repair_status::RUNNING;
@@ -309,17 +358,19 @@ public:
return _next_repair_command++;
}
future<> shutdown() {
_shutdown.store(true, std::memory_order_relaxed);
return _gate.close();
}
void check_in_shutdown() {
_gate.check();
if (_shutdown.load(std::memory_order_relaxed)) {
throw std::runtime_error(sprint("Repair service is being shutdown"));
}
}
} repair_tracker;
};
static tracker repair_tracker;
static void check_in_shutdown() {
// Only call this from the single CPU managing the repair - the only CPU
// which is allowed to use repair_tracker.
assert(engine().cpu_id() == 0);
repair_tracker.check_in_shutdown();
}
@@ -445,6 +496,19 @@ static future<partition_checksum> checksum_range_shard(database &db,
});
}
// It is counter-productive to allow a large number of range checksum
// operations to proceed in parallel (on the same shard), because the read
// operation can already parallelize itself as much as needed, and doing
// multiple reads in parallel just adds a lot of memory overheads.
// So checksum_parallelism_semaphore is used to limit this parallelism,
// and should be set to 1, or another small number.
//
// Note that checksum_parallelism_semaphore applies not just in the
// repair master, but also in the slave: The repair slave may receive many
// checksum requests in parallel, but will only work on one or a few
// (checksum_parallelism_semaphore) at once.
static thread_local semaphore checksum_parallelism_semaphore(2);
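The bracketing that `with_semaphore(checksum_parallelism_semaphore, 1, ...)` performs can be sketched without seastar. This is an illustrative, single-threaded stand-in (the names and the `try_wait` rejection are assumptions of the sketch; seastar's semaphore queues waiters as futures instead of failing), but the invariant is the same: at most `units` guarded operations hold a unit at any time.

```cpp
#include <cassert>
#include <functional>

// Illustrative sketch of semaphore bracketing: take one unit, run the
// operation, give the unit back, so at most `units` operations are in
// flight at once. seastar's semaphore suspends waiters instead of failing.
struct semaphore {
    int units;
    explicit semaphore(int n) : units(n) {}
    bool try_wait() { if (units == 0) return false; --units; return true; }
    void signal() { ++units; }
};

// Returns false when no unit is free (a stand-in for "the caller waits").
bool with_semaphore(semaphore& sem, const std::function<void()>& op) {
    if (!sem.try_wait()) {
        return false;
    }
    op();          // the guarded operation
    sem.signal();  // always released, like .finally() in the code above
    return true;
}
```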
// Calculate the checksum of the data held on all shards of a column family,
// in the given token range.
// In practice, we only need to consider one or two shards which intersect the
@@ -467,7 +531,9 @@ future<partition_checksum> checksum_range(seastar::sharded<database> &db,
auto& prs = shard_range.second;
return db.invoke_on(shard, [keyspace, cf, prs = std::move(prs), hash_version] (database& db) mutable {
return do_with(std::move(keyspace), std::move(cf), std::move(prs), [&db, hash_version] (auto& keyspace, auto& cf, auto& prs) {
return checksum_range_shard(db, keyspace, cf, prs, hash_version);
return with_semaphore(checksum_parallelism_semaphore, 1, [&db, hash_version, &keyspace, &cf, &prs] {
return checksum_range_shard(db, keyspace, cf, prs, hash_version);
});
});
}).then([&result] (partition_checksum sum) {
result.add(sum);
@@ -478,32 +544,15 @@ future<partition_checksum> checksum_range(seastar::sharded<database> &db,
});
}
static void split_and_add(std::vector<::dht::token_range>& ranges,
const dht::token_range& range,
uint64_t estimated_partitions, uint64_t target_partitions) {
if (estimated_partitions < target_partitions) {
// We're done, the range is small enough to not be split further
ranges.push_back(range);
return;
}
// The use of minimum_token() here twice is not a typo - because wrap-
// around token ranges are supported by midpoint(), the beyond-maximum
// token can also be represented by minimum_token().
auto midpoint = dht::global_partitioner().midpoint(
range.start() ? range.start()->value() : dht::minimum_token(),
range.end() ? range.end()->value() : dht::minimum_token());
auto halves = range.split(midpoint, dht::token_comparator());
ranges.push_back(halves.first);
ranges.push_back(halves.second);
}
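The effect of `split_and_add` together with its driver loop in `repair_cf_range` (both removed by this diff in favor of `range_splitter`) can be sketched with plain integer `[start, end)` ranges standing in for `dht::token_range`; the function name and types here are illustrative, not from the source:

```cpp
#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

// Halve every range until the estimated partition count per range drops
// below the target, or enough sub-ranges exist. Mirrors the loop in
// repair_cf_range driving split_and_add, with integer ranges.
using range = std::pair<long, long>;

std::vector<range> split_ranges(range r, unsigned long estimated_partitions,
                                unsigned long target_partitions,
                                size_t sub_ranges_max) {
    std::vector<range> ranges{r};
    std::vector<range> tosplit;
    while (estimated_partitions > target_partitions && ranges.size() < sub_ranges_max) {
        tosplit.clear();
        ranges.swap(tosplit);
        for (const auto& rr : tosplit) {
            // midpoint split, like dht::global_partitioner().midpoint()
            long mid = rr.first + (rr.second - rr.first) / 2;
            ranges.push_back({rr.first, mid});
            ranges.push_back({mid, rr.second});
        }
        estimated_partitions /= 2; // each half holds roughly half the data
    }
    return ranges;
}
```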
// We don't need to wait for one checksum to finish before we start the
// next, but doing too many of these operations in parallel also doesn't
// make sense, so we limit the number of concurrent ongoing checksum
// requests with a semaphore.
//
// FIXME: We shouldn't use a magic number here, but rather bind it to
// some resource. Otherwise we'll be doing too little in some machines,
// and too much in others.
// parallelism_semaphore limits the number of parallel ongoing checksum
// comparisons. This could mean, for example, that this number of checksum
// requests have been sent to other nodes and we are waiting for them to
// return so we can compare those to our own checksums. This limit can be
// set fairly high because the outstanding comparisons take only a few
// resources. In particular, we do NOT do this number of file reads in
// parallel because file reads have large memory overheads (read buffers,
// partitions, etc.) - the number of concurrent reads is further limited
// by an additional semaphore checksum_parallelism_semaphore (see above).
//
// FIXME: This would be better off in a repair service, or even a per-shard
// repair instance holding all repair state. However, since we are anyway
@@ -512,6 +561,24 @@ static void split_and_add(std::vector<::dht::token_range>& ranges,
constexpr int parallelism = 100;
static thread_local semaphore parallelism_semaphore(parallelism);
static future<uint64_t> estimate_partitions(seastar::sharded<database>& db, const sstring& keyspace,
const sstring& cf, const dht::token_range& range) {
return db.map_reduce0(
[keyspace, cf, range] (auto& db) {
// FIXME: column_family should have a method to estimate the number of
// partitions (and of course it should use cardinality estimation bitmaps,
// not trivial sum). We shouldn't have this ugly code here...
// FIXME: If sstables are shared, they will be accounted more than
// once. However, shared sstables should exist for a short-time only.
auto sstables = db.find_column_family(keyspace, cf).get_sstables();
return boost::accumulate(*sstables, uint64_t(0),
[&range] (uint64_t x, auto&& sst) { return x + sst->estimated_keys_for_range(range); });
},
uint64_t(0),
std::plus<uint64_t>()
);
}
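The map/reduce that `estimate_partitions` performs can be sketched without seastar. Plain nested vectors stand in for shards and their sstables here (an assumption of the sketch); seastar's `map_reduce0` does the same thing across cores:

```cpp
#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

// Each shard sums the per-sstable key estimates for the range (the "map"
// step), and the per-shard sums are combined by addition (the "reduce"
// step, std::plus in the code above).
uint64_t estimate_partitions(const std::vector<std::vector<uint64_t>>& per_shard_sstables) {
    uint64_t total = 0;
    for (const auto& sstables : per_shard_sstables) {
        // stands in for boost::accumulate over sst->estimated_keys_for_range(range)
        total += std::accumulate(sstables.begin(), sstables.end(), uint64_t(0));
    }
    return total;
}
```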
// Repair a single cf in a single local range.
// Comparable to RepairJob in Origin.
static future<> repair_cf_range(repair_info& ri,
@@ -522,42 +589,15 @@ static future<> repair_cf_range(repair_info& ri,
return make_ready_future<>();
}
std::vector<::dht::token_range> ranges;
ranges.push_back(range);
// Additionally, we want to break up large ranges so they will have
// (approximately) a desired number of rows each.
// FIXME: column_family should have a method to estimate the number of
// partitions (and of course it should use cardinality estimation bitmaps,
// not trivial sum). We shouldn't have this ugly code here...
auto sstables = ri.db.local().find_column_family(ri.keyspace, cf).get_sstables();
uint64_t estimated_partitions = 0;
for (auto sst : *sstables) {
estimated_partitions += sst->estimated_keys_for_range(range);
}
// FIXME: we should have an on-the-fly iterator generator here, not
// fill a vector in advance.
std::vector<::dht::token_range> tosplit;
while (estimated_partitions > ri.target_partitions) {
tosplit.clear();
ranges.swap(tosplit);
for (const auto& range : tosplit) {
split_and_add(ranges, range, estimated_partitions, ri.target_partitions);
}
estimated_partitions /= 2;
if (ranges.size() >= ri.sub_ranges_max) {
break;
}
}
logger.debug("target_partitions={}, estimated_partitions={}, ranges.size={}, range={} -> ranges={}",
ri.target_partitions, estimated_partitions, ranges.size(), range, ranges);
return estimate_partitions(ri.db, ri.keyspace, cf, range).then([&ri, cf, range, &neighbors] (uint64_t estimated_partitions) {
range_splitter ranges(range, estimated_partitions, ri.target_partitions);
return do_with(seastar::gate(), true, std::move(cf), std::move(ranges),
[&ri, &neighbors] (auto& completion, auto& success, const auto& cf, auto& ranges) {
return do_for_each(ranges, [&ri, &completion, &success, &neighbors, &cf] (const auto& range) {
return do_until([&ranges] () { return !ranges.has_next(); },
[&ranges, &ri, &completion, &success, &neighbors, &cf] () {
auto range = ranges.next();
check_in_shutdown();
return parallelism_semaphore.wait(1).then([&ri, &completion, &success, &neighbors, &cf, &range] {
return parallelism_semaphore.wait(1).then([&ri, &completion, &success, &neighbors, &cf, range] {
auto checksum_type = service::get_local_storage_service().cluster_supports_large_partitions()
? repair_checksum::streamed : repair_checksum::legacy;
@@ -575,7 +615,7 @@ static future<> repair_cf_range(repair_info& ri,
completion.enter();
when_all(checksums.begin(), checksums.end()).then(
[&ri, &cf, &range, &neighbors, &success]
[&ri, &cf, range, &neighbors, &success]
(std::vector<future<partition_checksum>> checksums) {
// If only some of the replicas of this range are alive,
// we set success=false so repair will fail, but we can
@@ -591,7 +631,7 @@ static future<> repair_cf_range(repair_info& ri,
utils::fb_utilities::get_broadcast_address()),
checksums[i].get_exception());
success = false;
ri.failed_ranges.push_back(failed_range{cf, range});
ri.nr_failed_ranges++;
// Do not break out of the loop here, so we can log
// (and discard) all the exceptions.
} else if (i > 0) {
@@ -615,14 +655,24 @@ static future<> repair_cf_range(repair_info& ri,
auto node_reducer = [] (std::vector<gms::inet_address>& live_neighbors_in_or_out,
std::vector<gms::inet_address>& nodes_with_same_checksum, size_t nr_nodes_to_keep) {
// nodes_with_same_checksum contains two types of nodes:
// 1) the nodes we want to remove from live_neighbors_in_or_out.
// 2) the nodes, nr_nodes_to_keep in number, not to remove from
// live_neighbors_in_or_out
auto nr_nodes = nodes_with_same_checksum.size();
if (nr_nodes <= nr_nodes_to_keep) {
return;
}
// TODO: Remove the "far" nodes and keep the "near" nodes
// to have better streaming performance
nodes_with_same_checksum.resize(nr_nodes - nr_nodes_to_keep);
if (nr_nodes_to_keep == 0) {
// All nodes in nodes_with_same_checksum will be removed from live_neighbors_in_or_out
} else if (nr_nodes_to_keep == 1) {
auto node_is_remote = [] (gms::inet_address ip) { return !service::get_local_storage_service().is_local_dc(ip); };
boost::partition(nodes_with_same_checksum, node_is_remote);
nodes_with_same_checksum.resize(nr_nodes - nr_nodes_to_keep);
} else {
throw std::runtime_error(sprint("nr_nodes_to_keep = %d, but it can only be 1 or 0", nr_nodes_to_keep));
}
// Now, nodes_with_same_checksum contains nodes we want to remove, remove it from live_neighbors_in_or_out
auto it = boost::range::remove_if(live_neighbors_in_or_out, [&nodes_with_same_checksum] (const auto& ip) {
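The keep/remove arithmetic in `node_reducer` can be sketched with the standard library. Strings stand in for `gms::inet_address`, and `is_remote` is an assumed stand-in for the local-DC check; when keeping one node, remote nodes are partitioned to the front so that the surviving tail entry is local:

```cpp
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

// Given nodes that hold identical data (same checksum), keep
// nr_nodes_to_keep of them as stream peers and drop the rest from the
// live-neighbor list. resize() keeps the first (size - keep) entries as
// the removal set, so after the remote-first partition the kept node
// (at the end) is from the local DC.
void node_reducer(std::vector<std::string>& live_neighbors,
                  std::vector<std::string> same_checksum,
                  size_t nr_nodes_to_keep,
                  bool (*is_remote)(const std::string&)) {
    if (same_checksum.size() <= nr_nodes_to_keep) {
        return; // nothing to trim
    }
    if (nr_nodes_to_keep == 1) {
        // Put remote nodes first so the kept (last) node is local.
        std::partition(same_checksum.begin(), same_checksum.end(), is_remote);
    }
    same_checksum.resize(same_checksum.size() - nr_nodes_to_keep);
    // Whatever is left in same_checksum is removed from live_neighbors.
    live_neighbors.erase(
        std::remove_if(live_neighbors.begin(), live_neighbors.end(),
            [&](const std::string& ip) {
                return std::find(same_checksum.begin(), same_checksum.end(), ip)
                       != same_checksum.end();
            }),
        live_neighbors.end());
}
```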
@@ -694,20 +744,19 @@ static future<> repair_cf_range(repair_info& ri,
}
}
if (!(live_neighbors_in.empty() && live_neighbors_out.empty())) {
logger.info("Found differing range {} on nodes {}, in = {}, out = {}", range,
logger.debug("Found differing range {} on nodes {}, in = {}, out = {}", range,
live_neighbors, live_neighbors_in, live_neighbors_out);
ri.request_transfer_ranges(cf, range, live_neighbors_in, live_neighbors_out);
return make_ready_future<>();
return ri.request_transfer_ranges(cf, range, live_neighbors_in, live_neighbors_out);
}
return make_ready_future<>();
}).handle_exception([&ri, &success, &cf, &range] (std::exception_ptr eptr) {
}).handle_exception([&ri, &success, &cf, range] (std::exception_ptr eptr) {
// Something above (e.g., request_transfer_ranges) failed. We could
// stop the repair immediately, or let it continue with
// other ranges (at the moment, we do the latter). But in
// any case, we need to remember that the repair failed to
// tell the caller.
success = false;
ri.failed_ranges.push_back(failed_range{cf, range});
ri.nr_failed_ranges++;
logger.warn("Failed sync of range {}: {}", range, eptr);
}).finally([&completion] {
parallelism_semaphore.signal(1);
@@ -727,6 +776,7 @@ static future<> repair_cf_range(repair_info& ri,
});
});
});
});
}
// Repair a single local range, multiple column families.
@@ -945,22 +995,39 @@ private:
// same nodes as replicas.
static future<> repair_ranges(repair_info ri) {
return do_with(std::move(ri), [] (auto& ri) {
#if 1
#if 0
// repair all the ranges in parallel
return parallel_for_each(ri.ranges, [&ri] (auto&& range) {
#else
// repair all the ranges in sequence
return do_for_each(ri.ranges, [&ri] (auto&& range) {
#endif
check_in_shutdown();
return repair_range(ri, range);
ri.ranges_index++;
logger.info("Repair {} out of {} ranges, id={}, shard={}, keyspace={}, table={}, range={}",
ri.ranges_index, ri.ranges.size(), ri.id, ri.shard, ri.keyspace, ri.cfs, range);
return do_with(dht::selective_token_range_sharder(range, ri.shard), [&ri] (auto& sharder) {
return repeat([&ri, &sharder] () {
check_in_shutdown();
auto range_shard = sharder.next();
if (range_shard) {
return repair_range(ri, *range_shard).then([] {
return make_ready_future<stop_iteration>(stop_iteration::no);
});
} else {
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
});
});
}).then([&ri] {
// Do streaming for the remaining ranges we do not stream in
// repair_cf_range
return ri.do_streaming();
}).then([&ri] {
repair_tracker.done(ri.id, ri.check_failed_ranges());
ri.check_failed_ranges();
return make_ready_future<>();
}).handle_exception([&ri] (std::exception_ptr eptr) {
logger.info("repair {} failed - {}", ri.id, eptr);
repair_tracker.done(ri.id, false);
return make_exception_future<>(std::move(eptr));
});
});
}
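The `repeat()`/sharder loop added to `repair_ranges` above can be sketched with a vector-backed sharder standing in for `dht::selective_token_range_sharder` (the types here are illustrative): keep asking for the next sub-range this shard owns and repair it, stopping when the sharder is exhausted.

```cpp
#include <cassert>
#include <optional>
#include <vector>

// A vector-backed stand-in for the sharder: next() yields the next
// sub-range owned by this shard, or nullopt when there are no more.
struct sharder {
    std::vector<int> sub_ranges; // pretend these are token ranges
    size_t pos = 0;
    std::optional<int> next() {
        if (pos == sub_ranges.size()) {
            return std::nullopt; // -> stop_iteration::yes
        }
        return sub_ranges[pos++]; // -> repair this sub-range, then repeat
    }
};

int repair_all(sharder s) {
    int repaired = 0;
    while (auto range = s.next()) { // seastar::repeat, unrolled
        (void)range; // repair_range(ri, *range) would run here
        ++repaired;
    }
    return repaired;
}
```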
@@ -982,9 +1049,12 @@ static int do_repair_start(seastar::sharded<database>& db, sstring keyspace,
// yet. Real ids returned by next_repair_command() will be >= 1.
int id = repair_tracker.next_repair_command();
logger.info("starting user-requested repair for keyspace {}, repair id {}, options {}", keyspace, id, options_map);
repair_tracker.start(id);
if (!gms::get_local_gossiper().is_normal(utils::fb_utilities::get_broadcast_address())) {
throw std::runtime_error("Node is not in NORMAL status yet!");
}
// If the "ranges" option is not explicitly specified, we repair all the
// local ranges (the token ranges for which this node holds a replica of).
// Each of these ranges may have a different set of replicas, so the
@@ -1057,8 +1127,33 @@ static int do_repair_start(seastar::sharded<database>& db, sstring keyspace,
cfs = list_column_families(db.local(), keyspace);
}
repair_ranges(repair_info(db, std::move(keyspace), std::move(ranges),
std::move(cfs), id, options.data_centers, options.hosts));
std::vector<future<>> repair_results;
repair_results.reserve(smp::count);
for (auto shard : boost::irange(unsigned(0), smp::count)) {
auto f = db.invoke_on(shard, [keyspace, cfs, id, ranges,
data_centers = options.data_centers, hosts = options.hosts] (database& localdb) mutable {
return repair_ranges(repair_info(service::get_local_storage_service().db(),
std::move(keyspace), std::move(ranges), std::move(cfs),
id, std::move(data_centers), std::move(hosts)));
});
repair_results.push_back(std::move(f));
}
when_all(repair_results.begin(), repair_results.end()).then([id] (std::vector<future<>> results) {
if (std::any_of(results.begin(), results.end(), [] (auto&& f) { return f.failed(); })) {
repair_tracker.done(id, false);
logger.info("repair {} failed", id);
} else {
repair_tracker.done(id, true);
logger.info("repair {} completed successfully", id);
}
return make_ready_future<>();
}).handle_exception([id] (std::exception_ptr eptr) {
repair_tracker.done(id, false);
logger.info("repair {} failed: {}", id, eptr);
});
return id;
}
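The per-shard fan-out and result aggregation added to `do_repair_start` above can be sketched with `std::async` standing in for `db.invoke_on(shard, ...)` and a plain loop over the futures standing in for `when_all` (deferred launch keeps this sketch single-threaded; the real code runs each shard's repair on its own core):

```cpp
#include <cassert>
#include <future>
#include <vector>

// Start the repair on every shard, wait for all of them, and report
// success only if no shard failed, mirroring the when_all + any_of
// failure check above.
bool run_on_all_shards(unsigned shard_count, bool (*repair_shard)(unsigned)) {
    std::vector<std::future<bool>> results;
    results.reserve(shard_count);
    for (unsigned shard = 0; shard < shard_count; ++shard) {
        results.push_back(std::async(std::launch::deferred, repair_shard, shard));
    }
    bool ok = true;
    for (auto& f : results) {
        ok &= f.get(); // like checking f.failed() after when_all
    }
    return ok;
}
```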


@@ -315,6 +315,7 @@ public:
}
return make_ready_future<streamed_mutation_opt>(streamed_mutation_opt());
} else {
_delegate = make_empty_reader(); // See issue #2623
_cache.on_uncached_wide_partition();
_cache._tracker.on_wide_partition_mispopulation();
_cache.mark_partition_as_wide(dk);
@@ -460,6 +461,7 @@ private:
return;
}
}
_reader = {}; // See issue #2644
_reader = _cache._underlying(_cache._schema, _range, query::full_slice, _pc, _trace_state);
}
}
@@ -471,6 +473,7 @@ private:
_last_key.reset(dk, _populate_phase);
_large_partition_range = dht::partition_range::make_singular(dk);
// FIXME: This may deadlock with _reader due to #2644. We can't reset _reader here, because it's still used after this.
_large_partition_reader = _cache._underlying(_schema, _large_partition_range, _slice, _pc, _trace_state);
return _large_partition_reader().then([this, dk = std::move(dk)] (auto smopt) mutable -> streamed_mutation_opt {
_large_partition_reader = {};
@@ -564,6 +567,7 @@ public:
if (!_reader_created || phase != _populate_phase) {
_populate_phase = _cache._populate_phaser.phase();
_reader = {}; // See issue #2644
_reader = _cache._underlying(_cache._schema, _range, query::full_slice, _pc, _trace_state);
_reader_created = true;
return make_ready_future();
@@ -579,6 +583,7 @@ class scanning_and_populating_reader final : public mutation_reader::impl {
just_cache_scanning_reader _primary_reader;
range_populating_reader _secondary_reader;
mutation_reader::forwarding _fwd_mr;
streamed_mutation_opt _next_primary;
bool _secondary_in_progress = false;
bool _first_element = true;
@@ -655,11 +660,13 @@ public:
const dht::partition_range& range,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state)
tracing::trace_state_ptr trace_state,
mutation_reader::forwarding fwd_mr)
: _pr(range)
, _schema(s)
, _primary_reader(s, cache, range, slice, pc)
, _secondary_reader(cache, s, slice, pc, trace_state)
, _fwd_mr(fwd_mr)
{ }
future<streamed_mutation_opt> operator()() {
@@ -676,8 +683,9 @@ row_cache::make_scanning_reader(schema_ptr s,
const dht::partition_range& range,
const io_priority_class& pc,
const query::partition_slice& slice,
tracing::trace_state_ptr trace_state) {
return make_mutation_reader<scanning_and_populating_reader>(std::move(s), *this, range, slice, pc, std::move(trace_state));
tracing::trace_state_ptr trace_state,
mutation_reader::forwarding fwd_mr) {
return make_mutation_reader<scanning_and_populating_reader>(std::move(s), *this, range, slice, pc, std::move(trace_state), fwd_mr);
}
mutation_reader
@@ -685,12 +693,13 @@ row_cache::make_reader(schema_ptr s,
const dht::partition_range& range,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state) {
tracing::trace_state_ptr trace_state,
mutation_reader::forwarding fwd_mr) {
if (range.is_singular()) {
const query::ring_position& pos = range.start()->value();
if (!pos.has_key()) {
return make_scanning_reader(std::move(s), range, pc, slice, std::move(trace_state));
return make_scanning_reader(std::move(s), range, pc, slice, std::move(trace_state), fwd_mr);
}
return _read_section(_tracker.region(), [&] {
@@ -703,7 +712,7 @@ row_cache::make_reader(schema_ptr s,
upgrade_entry(e);
mutation_reader reader;
if (e.wide_partition()) {
reader = _underlying(s, range, slice, pc, std::move(trace_state));
reader = _underlying(s, range, slice, pc, std::move(trace_state), fwd_mr);
_tracker.on_uncached_wide_partition();
on_miss();
} else {
@@ -721,7 +730,7 @@ row_cache::make_reader(schema_ptr s,
});
}
return make_scanning_reader(std::move(s), range, pc, slice, std::move(trace_state));
return make_scanning_reader(std::move(s), range, pc, slice, std::move(trace_state), fwd_mr);
}
row_cache::~row_cache() {
@@ -1023,12 +1032,13 @@ future<streamed_mutation_opt> cache_entry::read_wide(row_cache& rc, schema_ptr s
: _range(std::move(pr))
, _reader(rc._underlying(s, _range, slice, pc))
{ }
range_and_underlyig_reader(range_and_underlyig_reader&&) = delete;
};
rc._tracker.on_uncached_wide_partition();
auto pr = dht::partition_range::make_singular(_key);
return do_with(range_and_underlyig_reader(rc, s, std::move(pr), slice, pc), [] (auto& r_a_ur) {
return r_a_ur._reader();
});
auto rd_ptr = std::make_unique<range_and_underlyig_reader>(rc, s, std::move(pr), slice, pc);
auto& r_a_ur = *rd_ptr;
return r_a_ur._reader().finally([rd_ptr = std::move(rd_ptr)] {});
}
streamed_mutation cache_entry::read(row_cache& rc, const schema_ptr& s) {


@@ -284,7 +284,8 @@ private:
const dht::partition_range&,
const io_priority_class& pc,
const query::partition_slice& slice,
tracing::trace_state_ptr trace_state);
tracing::trace_state_ptr trace_state,
mutation_reader::forwarding);
void on_hit();
void on_miss();
void on_uncached_wide_partition();
@@ -335,7 +336,8 @@ public:
const dht::partition_range& = query::full_partition_range,
const query::partition_slice& slice = query::full_slice,
const io_priority_class& = default_priority_class(),
tracing::trace_state_ptr trace_state = nullptr);
tracing::trace_state_ptr trace_state = nullptr,
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::no);
const stats& stats() const { return _stats; }
public:


@@ -145,6 +145,20 @@ void schema::rebuild() {
thrift()._compound = is_compound();
thrift()._is_dynamic = clustering_key_size() > 0;
if (default_validator()->is_counter()) {
for (auto&& cdef : boost::range::join(static_columns(), regular_columns())) {
if (!cdef.type->is_counter()) {
throw exceptions::configuration_exception(sprint("Cannot add a non counter column (%s) in a counter column family", cdef.name_as_text()));
}
}
} else {
for (auto&& cdef : all_columns()) {
if (cdef.second->type->is_counter()) {
throw exceptions::configuration_exception(sprint("Cannot add a counter column (%s) in a non counter column family", cdef.second->name_as_text()));
}
}
}
}
const column_mapping& schema::get_column_mapping() const {
@@ -737,6 +751,16 @@ schema_ptr schema_builder::build() {
}
prepare_dense_schema(new_raw);
if (_default_validator) {
new_raw._default_validator = *_default_validator;
} else if (new_raw._is_dense || !new_raw._is_compound) {
auto regular_column = std::find_if(new_raw._columns.begin(), new_raw._columns.end(), [] (auto&& col) {
return col.kind == column_kind::regular_column;
});
new_raw._default_validator = regular_column->type;
}
return make_lw_shared<schema>(schema(new_raw));
}


@@ -31,6 +31,7 @@ private:
schema::raw_schema _raw;
std::experimental::optional<compact_storage> _compact_storage;
std::experimental::optional<table_schema_version> _version;
std::experimental::optional<data_type> _default_validator;
schema_builder(const schema::raw_schema&);
public:
schema_builder(const sstring& ks_name, const sstring& cf_name,
@@ -74,7 +75,7 @@ public:
}
schema_builder& set_default_validator(const data_type& validator) {
_raw._default_validator = validator;
_default_validator = {validator};
return *this;
}


@@ -273,9 +273,9 @@ schema_ptr global_schema_ptr::get() const {
s = local_schema_registry().get_or_load(e.version(), [&e](table_schema_version) {
return e.frozen();
});
if (e.is_synced()) {
s->registry_entry()->mark_synced();
}
}
if (e.is_synced()) {
s->registry_entry()->mark_synced();
}
return s;
}


@@ -31,6 +31,8 @@ import os
import sys
import subprocess
import uuid
import re
import glob
from pkg_resources import parse_version
VERSION = "1.0"
@@ -69,6 +71,20 @@ def create_uuid_file(fl):
with open(args.uuid_file, 'w') as myfile:
myfile.write(str(uuid.uuid1()) + "\n")
def get_repo_file(dir):
files = glob.glob(dir)
files.sort(key=os.path.getmtime, reverse=True)
for name in files:
with open(name, 'r') as myfile:
for line in myfile:
match = re.search(".*http.?://.*/scylladb/([^/\s]+)/deb/([^/\s]+)\s.*", line)
if match:
return match.group(2), match.group(1)
match = re.search(".*http.?://.*/scylladb/([^/]+)/rpm/[^/]+/([^/\s]+)/.*", line)
if match:
return match.group(2), match.group(1)
return None, None
def check_version(ar):
if config and (not config.has_option("housekeeping", "check-version") or not config.getboolean("housekeeping", "check-version")):
return
@@ -87,6 +103,10 @@ def check_version(ar):
params = params + "&sts=" + ar.mode
if uid:
params = params + "&uu=" + uid
if repo_id:
params = params + "&rid=" + repo_id
if repo_type:
params = params + "&rtype=" + repo_type
latest_version = get_json_from_url(version_url + params)["version"]
except:
traceln("Unable to retrieve version information")
@@ -99,6 +119,7 @@ parser.add_argument('-q', '--quiet', action='store_true', default=False, help='Q
parser.add_argument('-c', '--config', default="", help='An optional config file. Specifying a missing file will terminate the script')
parser.add_argument('--uuid', default="", help='A uuid for the requests')
parser.add_argument('--uuid-file', default="", help='A uuid file for the requests')
parser.add_argument('--repo-files', default="", help='The repository files that are used for private repositories')
subparsers = parser.add_subparsers(help='Available commands')
parser_help = subparsers.add_parser('help', help='Display help information')
@@ -111,6 +132,9 @@ parser_system.set_defaults(func=check_version)
args = parser.parse_args()
quiet = args.quiet
config = None
repo_id = None
repo_type = None
if args.config != "":
if not os.path.isfile(args.config):
traceln("Config file ", args.config, " is missing, terminating")
@@ -125,4 +149,6 @@ if args.uuid_file != "":
create_uuid_file(args.uuid_file)
with open(args.uuid_file, 'r') as myfile:
uid = myfile.read().replace('\n', '')
if args.repo_files != "":
repo_type, repo_id = get_repo_file(args.repo_files)
args.func(args)

Submodule seastar updated: f07f8ed68d...a66e0c553d


@@ -115,4 +115,42 @@ inline frame<seastar::measuring_output_stream> start_frame(seastar::measuring_ou
return { };
}
template<>
class place_holder<seastar::simple_output_stream> {
seastar::simple_output_stream _substream;
public:
place_holder(seastar::simple_output_stream substream)
: _substream(substream) { }
void set(seastar::simple_output_stream& out, size_type v) {
serialize(_substream, v);
}
};
template<>
class frame<seastar::simple_output_stream> : public place_holder<seastar::simple_output_stream> {
char* _start;
public:
frame(seastar::simple_output_stream ph, char* start)
: place_holder(ph), _start(start) { }
void end(seastar::simple_output_stream& out) {
set(out, out.begin() - _start);
}
};
inline place_holder<seastar::simple_output_stream> start_place_holder(seastar::simple_output_stream& out) {
return { out.write_substream(sizeof(size_type)) };
}
inline frame<seastar::simple_output_stream> start_frame(seastar::simple_output_stream& out) {
auto start = out.begin();
auto substream = out.write_substream(sizeof(size_type));
{
auto sstr = substream;
serialize(sstr, size_type(0));
}
return frame<seastar::simple_output_stream>(substream, start);
}
}
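The `place_holder`/`frame` pattern above can be sketched with a byte vector standing in for seastar's `simple_output_stream`: reserve room for a size field at the start of a frame, write the payload, then back-patch the reserved bytes once the final length is known. The patched value here counts only the bytes after the size field; the exact accounting in the stream code is an assumption of this sketch, not copied from it.

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Offset of the reserved size field, remembered so end_frame can patch it.
struct frame {
    size_t size_pos;
};

frame start_frame(std::vector<char>& out) {
    frame f{out.size()};
    out.insert(out.end(), sizeof(uint32_t), '\0'); // placeholder, patched later
    return f;
}

void end_frame(std::vector<char>& out, const frame& f) {
    // Back-patch the placeholder with the payload length written after it.
    uint32_t len = uint32_t(out.size() - f.size_pos - sizeof(uint32_t));
    std::memcpy(out.data() + f.size_pos, &len, sizeof(len));
}
```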


@@ -481,8 +481,10 @@ future<> migration_manager::announce_new_column_family(schema_ptr cfm, bool anno
throw exceptions::already_exists_exception(cfm->ks_name(), cfm->cf_name());
}
logger.info("Create new ColumnFamily: {}", cfm);
auto mutations = db::schema_tables::make_create_table_mutations(keyspace.metadata(), cfm, api::new_timestamp());
return announce(std::move(mutations), announce_locally);
return db::schema_tables::make_create_table_mutations(keyspace.metadata(), cfm, api::new_timestamp())
.then([announce_locally, this] (auto&& mutations) {
return announce(std::move(mutations), announce_locally);
});
} catch (const no_such_keyspace& e) {
throw exceptions::configuration_exception(sprint("Cannot add table '%s' to non existing keyspace '%s'.", cfm->cf_name(), cfm->ks_name()));
}
@@ -501,8 +503,10 @@ future<> migration_manager::announce_column_family_update(schema_ptr cfm, bool f
#endif
logger.info("Update table '{}.{}' From {} To {}", cfm->ks_name(), cfm->cf_name(), *old_schema, *cfm);
auto&& keyspace = db.find_keyspace(cfm->ks_name());
auto mutations = db::schema_tables::make_update_table_mutations(keyspace.metadata(), old_schema, cfm, api::new_timestamp(), from_thrift);
return announce(std::move(mutations), announce_locally);
return db::schema_tables::make_update_table_mutations(keyspace.metadata(), old_schema, cfm, api::new_timestamp(), from_thrift)
.then([announce_locally] (auto&& mutations) {
return announce(std::move(mutations), announce_locally);
});
} catch (const no_such_column_family& e) {
throw exceptions::configuration_exception(sprint("Cannot update non existing table '%s' in keyspace '%s'.",
cfm->cf_name(), cfm->ks_name()));
@@ -512,8 +516,10 @@ future<> migration_manager::announce_column_family_update(schema_ptr cfm, bool f
static future<> do_announce_new_type(user_type new_type, bool announce_locally) {
auto& db = get_local_storage_proxy().get_db().local();
auto&& keyspace = db.find_keyspace(new_type->_keyspace);
auto mutations = db::schema_tables::make_create_type_mutations(keyspace.metadata(), new_type, api::new_timestamp());
return migration_manager::announce(std::move(mutations), announce_locally);
return db::schema_tables::make_create_type_mutations(keyspace.metadata(), new_type, api::new_timestamp())
.then([announce_locally] (auto&& mutations) {
return migration_manager::announce(std::move(mutations), announce_locally);
});
}
future<> migration_manager::announce_new_type(user_type new_type, bool announce_locally) {
@@ -609,8 +615,10 @@ future<> migration_manager::announce_column_family_drop(const sstring& ks_name,
ks_name, ::join(", ", views | boost::adaptors::transformed([](auto&& v) { return v->cf_name(); }))));
}
logger.info("Drop table '{}.{}'", schema->ks_name(), schema->cf_name());
auto mutations = db::schema_tables::make_drop_table_mutations(db.find_keyspace(ks_name).metadata(), schema, api::new_timestamp());
return announce(std::move(mutations), announce_locally);
return db::schema_tables::make_drop_table_mutations(db.find_keyspace(ks_name).metadata(), schema, api::new_timestamp())
.then([announce_locally] (auto&& mutations) {
return announce(std::move(mutations), announce_locally);
});
} catch (const no_such_column_family& e) {
throw exceptions::configuration_exception(sprint("Cannot drop non existing table '%s' in keyspace '%s'.", cf_name, ks_name));
}
@@ -621,8 +629,10 @@ future<> migration_manager::announce_type_drop(user_type dropped_type, bool anno
auto& db = get_local_storage_proxy().get_db().local();
auto&& keyspace = db.find_keyspace(dropped_type->_keyspace);
logger.info("Drop User Type: {}", dropped_type->get_name_as_string());
auto mutations = db::schema_tables::make_drop_type_mutations(keyspace.metadata(), dropped_type, api::new_timestamp());
return announce(std::move(mutations), announce_locally);
return db::schema_tables::make_drop_type_mutations(keyspace.metadata(), dropped_type, api::new_timestamp())
.then([announce_locally] (auto&& mutations) {
return announce(std::move(mutations), announce_locally);
});
}
future<> migration_manager::announce_new_view(view_ptr view, bool announce_locally)
@@ -637,8 +647,10 @@ future<> migration_manager::announce_new_view(view_ptr view, bool announce_local
throw exceptions::already_exists_exception(view->ks_name(), view->cf_name());
}
logger.info("Create new view: {}", view);
auto mutations = db::schema_tables::make_create_view_mutations(keyspace, std::move(view), api::new_timestamp());
return announce(std::move(mutations), announce_locally);
return db::schema_tables::make_create_view_mutations(keyspace, std::move(view), api::new_timestamp())
.then([announce_locally] (auto&& mutations) {
return announce(std::move(mutations), announce_locally);
});
} catch (const no_such_keyspace& e) {
throw exceptions::configuration_exception(sprint("Cannot add view '%s' to non existing keyspace '%s'.", view->cf_name(), view->ks_name()));
}
@@ -660,8 +672,10 @@ future<> migration_manager::announce_view_update(view_ptr view, bool announce_lo
oldCfm.validateCompatility(cfm);
#endif
logger.info("Update view '{}.{}' From {} To {}", view->ks_name(), view->cf_name(), *old_view, *view);
-auto mutations = db::schema_tables::make_update_view_mutations(std::move(keyspace), view_ptr(old_view), std::move(view), api::new_timestamp());
-return announce(std::move(mutations), announce_locally);
+return db::schema_tables::make_update_view_mutations(std::move(keyspace), view_ptr(old_view), std::move(view), api::new_timestamp())
+.then([announce_locally] (auto&& mutations) {
+return announce(std::move(mutations), announce_locally);
+});
} catch (const std::out_of_range& e) {
throw exceptions::configuration_exception(sprint("Cannot update non existing materialized view '%s' in keyspace '%s'.",
view->cf_name(), view->ks_name()));
@@ -680,8 +694,10 @@ future<> migration_manager::announce_view_drop(const sstring& ks_name,
}
auto keyspace = db.find_keyspace(ks_name).metadata();
logger.info("Drop view '{}.{}'", view->ks_name(), view->cf_name());
-auto mutations = db::schema_tables::make_drop_view_mutations(std::move(keyspace), view_ptr(std::move(view)), api::new_timestamp());
-return announce(std::move(mutations), announce_locally);
+return db::schema_tables::make_drop_view_mutations(std::move(keyspace), view_ptr(std::move(view)), api::new_timestamp())
+.then([announce_locally] (auto&& mutations) {
+return announce(std::move(mutations), announce_locally);
+});
} catch (const no_such_column_family& e) {
throw exceptions::configuration_exception(sprint("Cannot drop non existing materialized view '%s' in keyspace '%s'.",
cf_name, ks_name));

View File

@@ -478,7 +478,6 @@ inline uint64_t& storage_proxy::split_stats::get_ep_stat(gms::inet_address ep) {
storage_proxy::~storage_proxy() {}
storage_proxy::storage_proxy(distributed<database>& db) : _db(db) {
namespace sm = seastar::metrics;
_metrics.add_group(COORDINATOR_STATS_CATEGORY, {
sm::make_queue_length("foreground_writes", [this] { return _stats.writes - _stats.background_writes; },
sm::description("number of currently pending foreground write requests")),
@@ -486,7 +485,7 @@ storage_proxy::storage_proxy(distributed<database>& db) : _db(db) {
sm::make_queue_length("background_writes", [this] { return _stats.background_writes; },
sm::description("number of currently pending background write requests")),
-sm::make_queue_length("throttled_writes", [this] { return _throttled_writes.size(); },
+sm::make_queue_length("current_throttled_writes", [this] { return _throttled_writes.size(); },
sm::description("number of currently throttled write requests")),
sm::make_total_operations("throttled_writes", [this] { return _stats.throttled_writes; },
@@ -1733,14 +1732,14 @@ protected:
size_t _targets_count;
promise<> _done_promise; // all target responded
bool _timedout = false; // will be true if request timeouts
-timer<lowres_clock> _timeout;
+timer<storage_proxy::clock_type> _timeout;
size_t _responses = 0;
schema_ptr _schema;
virtual void on_timeout() {}
virtual size_t response_count() const = 0;
public:
-abstract_read_resolver(schema_ptr schema, db::consistency_level cl, size_t target_count, lowres_clock::time_point timeout)
+abstract_read_resolver(schema_ptr schema, db::consistency_level cl, size_t target_count, storage_proxy::clock_type::time_point timeout)
: _cl(cl)
, _targets_count(target_count)
, _schema(std::move(schema))
@@ -1796,7 +1795,7 @@ class digest_read_resolver : public abstract_read_resolver {
return _digest_results.size();
}
public:
-digest_read_resolver(schema_ptr schema, db::consistency_level cl, size_t block_for, lowres_clock::time_point timeout) : abstract_read_resolver(std::move(schema), cl, 0, timeout), _block_for(block_for) {}
+digest_read_resolver(schema_ptr schema, db::consistency_level cl, size_t block_for, storage_proxy::clock_type::time_point timeout) : abstract_read_resolver(std::move(schema), cl, 0, timeout), _block_for(block_for) {}
void add_data(gms::inet_address from, foreign_ptr<lw_shared_ptr<query::result>> result) {
if (!_timedout) {
// if only one target was queried digest_check() will be skipped so we can also skip digest calculation
@@ -2014,6 +2013,7 @@ private:
break;
}
}
+assert(last_partition);
return get_last_row(s, *last_partition, is_reversed);
}
@@ -2143,7 +2143,7 @@ private:
return false;
}
public:
-data_read_resolver(schema_ptr schema, db::consistency_level cl, size_t targets_count, lowres_clock::time_point timeout) : abstract_read_resolver(std::move(schema), cl, targets_count, timeout) {
+data_read_resolver(schema_ptr schema, db::consistency_level cl, size_t targets_count, storage_proxy::clock_type::time_point timeout) : abstract_read_resolver(std::move(schema), cl, targets_count, timeout) {
_data_results.reserve(targets_count);
}
void add_mutate_data(gms::inet_address from, foreign_ptr<lw_shared_ptr<reconcilable_result>> result) {
@@ -2230,6 +2230,10 @@ public:
v.emplace_back(r.from, stdx::optional<partition>(), r.reached_end, true);
}
}
+boost::sort(v, [] (const version& x, const version& y) {
+return x.from < y.from;
+});
} while(true);
std::vector<mutation_and_live_row_count> reconciled_partitions;
@@ -2238,7 +2242,10 @@ public:
// reconcile all versions
boost::range::transform(boost::make_iterator_range(versions.begin(), versions.end()), std::back_inserter(reconciled_partitions),
[this, schema, original_per_partition_limit] (std::vector<version>& v) {
-auto m = boost::accumulate(v, mutation(v.front().par->mut().key(*schema), schema), [this, schema] (mutation& m, const version& ver) {
+auto it = boost::range::find_if(v, [] (auto&& ver) {
+return bool(ver.par);
+});
+auto m = boost::accumulate(v, mutation(it->par->mut().key(*schema), schema), [this, schema] (mutation& m, const version& ver) {
if (ver.par) {
m.partition().apply(*schema, ver.par->mut().partition(), *schema);
}
@@ -2330,7 +2337,7 @@ protected:
using targets_iterator = std::vector<gms::inet_address>::iterator;
using digest_resolver_ptr = ::shared_ptr<digest_read_resolver>;
using data_resolver_ptr = ::shared_ptr<data_read_resolver>;
-using clock_type = lowres_clock;
+using clock_type = storage_proxy::clock_type;
schema_ptr _schema;
shared_ptr<storage_proxy> _proxy;
@@ -2454,7 +2461,7 @@ protected:
uint32_t original_partition_limit() const {
return _cmd->partition_limit;
}
-void reconcile(db::consistency_level cl, lowres_clock::time_point timeout, lw_shared_ptr<query::read_command> cmd) {
+void reconcile(db::consistency_level cl, storage_proxy::clock_type::time_point timeout, lw_shared_ptr<query::read_command> cmd) {
data_resolver_ptr data_resolver = ::make_shared<data_read_resolver>(_schema, cl, _targets.size(), timeout);
auto exec = shared_from_this();
@@ -2529,12 +2536,12 @@ protected:
}
});
}
-void reconcile(db::consistency_level cl, lowres_clock::time_point timeout) {
+void reconcile(db::consistency_level cl, storage_proxy::clock_type::time_point timeout) {
reconcile(cl, timeout, _cmd);
}
public:
-virtual future<foreign_ptr<lw_shared_ptr<query::result>>> execute(lowres_clock::time_point timeout) {
+virtual future<foreign_ptr<lw_shared_ptr<query::result>>> execute(storage_proxy::clock_type::time_point timeout) {
digest_resolver_ptr digest_resolver = ::make_shared<digest_read_resolver>(_schema, _cl, _block_for, timeout);
auto exec = shared_from_this();
@@ -2604,7 +2611,7 @@ public:
class always_speculating_read_executor : public abstract_read_executor {
public:
using abstract_read_executor::abstract_read_executor;
-virtual future<> make_requests(digest_resolver_ptr resolver, lowres_clock::time_point timeout) {
+virtual future<> make_requests(digest_resolver_ptr resolver, storage_proxy::clock_type::time_point timeout) {
resolver->add_wait_targets(_targets.size());
// FIXME: consider disabling for CL=*ONE
bool want_digest = true;
@@ -2615,10 +2622,10 @@ public:
// this executor sends request to an additional replica after some time below timeout
class speculating_read_executor : public abstract_read_executor {
-timer<> _speculate_timer;
+timer<storage_proxy::clock_type> _speculate_timer;
public:
using abstract_read_executor::abstract_read_executor;
-virtual future<> make_requests(digest_resolver_ptr resolver, lowres_clock::time_point timeout) {
+virtual future<> make_requests(digest_resolver_ptr resolver, storage_proxy::clock_type::time_point timeout) {
_speculate_timer.set_callback([this, resolver, timeout] {
if (!resolver->is_completed()) { // at the time the callback runs request may be completed already
resolver->add_wait_targets(1); // we send one more request so wait for it too
@@ -2664,7 +2671,7 @@ class range_slice_read_executor : public abstract_read_executor {
public:
range_slice_read_executor(schema_ptr s, shared_ptr<storage_proxy> proxy, lw_shared_ptr<query::read_command> cmd, dht::partition_range pr, db::consistency_level cl, std::vector<gms::inet_address> targets, tracing::trace_state_ptr trace_state) :
abstract_read_executor(std::move(s), std::move(proxy), std::move(cmd), std::move(pr), cl, targets.size(), std::move(targets), std::move(trace_state)) {}
-virtual future<foreign_ptr<lw_shared_ptr<query::result>>> execute(lowres_clock::time_point timeout) override {
+virtual future<foreign_ptr<lw_shared_ptr<query::result>>> execute(storage_proxy::clock_type::time_point timeout) override {
reconcile(_cl, timeout);
return _result_promise.get_future();
}
@@ -2795,7 +2802,7 @@ future<foreign_ptr<lw_shared_ptr<query::result>>>
storage_proxy::query_singular(lw_shared_ptr<query::read_command> cmd, dht::partition_range_vector&& partition_ranges, db::consistency_level cl, tracing::trace_state_ptr trace_state) {
std::vector<::shared_ptr<abstract_read_executor>> exec;
exec.reserve(partition_ranges.size());
-auto timeout = lowres_clock::now() + std::chrono::milliseconds(_db.local().get_config().read_request_timeout_in_ms());
+auto timeout = storage_proxy::clock_type::now() + std::chrono::milliseconds(_db.local().get_config().read_request_timeout_in_ms());
for (auto&& pr: partition_ranges) {
if (!pr.is_singular()) {
@@ -2819,7 +2826,7 @@ storage_proxy::query_singular(lw_shared_ptr<query::read_command> cmd, dht::parti
}
future<std::vector<foreign_ptr<lw_shared_ptr<query::result>>>>
-storage_proxy::query_partition_key_range_concurrent(lowres_clock::time_point timeout, std::vector<foreign_ptr<lw_shared_ptr<query::result>>>&& results,
+storage_proxy::query_partition_key_range_concurrent(storage_proxy::clock_type::time_point timeout, std::vector<foreign_ptr<lw_shared_ptr<query::result>>>&& results,
lw_shared_ptr<query::read_command> cmd, db::consistency_level cl, dht::partition_range_vector::iterator&& i,
dht::partition_range_vector&& ranges, int concurrency_factor, tracing::trace_state_ptr trace_state,
uint32_t remaining_row_count, uint32_t remaining_partition_count) {
@@ -2923,7 +2930,7 @@ storage_proxy::query_partition_key_range(lw_shared_ptr<query::read_command> cmd,
schema_ptr schema = local_schema_registry().get(cmd->schema_version);
keyspace& ks = _db.local().find_keyspace(schema->ks_name());
dht::partition_range_vector ranges;
-auto timeout = lowres_clock::now() + std::chrono::milliseconds(_db.local().get_config().read_request_timeout_in_ms());
+auto timeout = storage_proxy::clock_type::now() + std::chrono::milliseconds(_db.local().get_config().read_request_timeout_in_ms());
// when dealing with LocalStrategy keyspaces, we can skip the range splitting and merging (which can be
// expensive in clusters with vnodes)
@@ -3957,24 +3964,22 @@ storage_proxy::query_nonsingular_mutations_locally(schema_ptr s, lw_shared_ptr<q
auto shard_cmd = make_lw_shared<query::read_command>(*cmd);
return do_with(cmd,
shard_cmd,
-1u,
0u,
false,
static_cast<unsigned>(prs.size()),
std::unordered_map<element_and_shard, partition_range_and_sort_key>{},
mutation_result_merger{s, cmd},
-dht::ring_position_range_vector_sharder{prs},
+dht::ring_position_exponential_vector_sharder{prs},
global_schema_ptr(s),
tracing::global_trace_state_ptr(std::move(trace_state)),
[this, s, max_size] (lw_shared_ptr<query::read_command>& cmd,
lw_shared_ptr<query::read_command>& shard_cmd,
-unsigned& shards_in_parallel,
unsigned& mutation_result_merger_key,
bool& no_more_ranges,
unsigned& partition_range_count,
std::unordered_map<element_and_shard, partition_range_and_sort_key>& shards_for_this_iteration,
mutation_result_merger& mrm,
-dht::ring_position_range_vector_sharder& rprs,
+dht::ring_position_exponential_vector_sharder& rpevs,
global_schema_ptr& gs,
tracing::global_trace_state_ptr& gt) {
return _db.local().get_result_memory_limiter().new_mutation_read(max_size).then([&, s] (query::result_memory_accounter ma) {
@@ -3985,36 +3990,32 @@ storage_proxy::query_nonsingular_mutations_locally(schema_ptr s, lw_shared_ptr<q
// because we'll throw away most of the results. So we'll exponentially increase
// concurrency starting at 1, so we won't waste on dense tables and at most
// `log(nr_shards) + ignore_msb_bits` latency multiplier for near-empty tables.
+//
+// We use the ring_position_exponential_vector_sharder to give us subranges that follow
+// this scheme.
shards_for_this_iteration.clear();
-// If we're reading from less than smp::count shards, then we can just append
-// each shard in order without sorting. If we're reading from more, then
-// we'll read from some shards at least twice, so the partitions within will be
-// out-of-order wrt. other shards
+auto this_iteration_subranges = rpevs.next(*s);
auto retain_shard_order = true;
-for (auto i = 0u; i < shards_in_parallel; ++i) {
-auto now = rprs.next(*s);
-if (!now) {
-no_more_ranges = true;
-break;
-}
-// Let's see if this is a new shard, or if we can expand an existing range
-auto&& rng_ok = shards_for_this_iteration.emplace(element_and_shard{now->element, now->shard}, partition_range_and_sort_key{now->ring_range, i});
-if (!rng_ok.second) {
-// We saw this shard already, enlarge the range (we know now->ring_range came from the same partition range;
-// otherwise it would have had a unique now->element).
-auto& rng = rng_ok.first->second.pr;
-rng = nonwrapping_range<dht::ring_position>(std::move(rng.start()), std::move(now->ring_range.end()));
-// This range is no longer ordered with respect to the others, so:
-retain_shard_order = false;
+no_more_ranges = true;
+if (this_iteration_subranges) {
+no_more_ranges = false;
+retain_shard_order = this_iteration_subranges->inorder;
+auto sort_key = 0u;
+for (auto&& now : this_iteration_subranges->per_shard_ranges) {
+shards_for_this_iteration.emplace(element_and_shard{this_iteration_subranges->element, now.shard}, partition_range_and_sort_key{now.ring_range, sort_key++});
+}
+}
auto key_base = mutation_result_merger_key;
// prepare for next iteration
-// Each iteration uses a merger key that is either i in the loop above (so in the range [0, shards_in_parallel),
-// or, the element index in prs (so in the range [0, partition_range_count). Make room for sufficient keys.
-mutation_result_merger_key += std::max(shards_in_parallel, partition_range_count);
-shards_in_parallel *= 2;
+mutation_result_merger_key += std::max(smp::count, partition_range_count);
shard_cmd->partition_limit = cmd->partition_limit - mrm.partition_count();
shard_cmd->row_limit = cmd->row_limit - mrm.row_count();

View File

@@ -71,7 +71,7 @@ public:
private:
struct rh_entry {
::shared_ptr<abstract_write_response_handler> handler;
-timer<lowres_clock> expire_timer;
+timer<clock_type> expire_timer;
rh_entry(::shared_ptr<abstract_write_response_handler>&& h, std::function<void()>&& cb);
};
@@ -253,7 +253,7 @@ private:
dht::partition_range_vector get_restricted_ranges(keyspace& ks, const schema& s, dht::partition_range range);
float estimate_result_rows_per_range(lw_shared_ptr<query::read_command> cmd, keyspace& ks);
static std::vector<gms::inet_address> intersection(const std::vector<gms::inet_address>& l1, const std::vector<gms::inet_address>& l2);
-future<std::vector<foreign_ptr<lw_shared_ptr<query::result>>>> query_partition_key_range_concurrent(lowres_clock::time_point timeout,
+future<std::vector<foreign_ptr<lw_shared_ptr<query::result>>>> query_partition_key_range_concurrent(clock_type::time_point timeout,
std::vector<foreign_ptr<lw_shared_ptr<query::result>>>&& results, lw_shared_ptr<query::read_command> cmd, db::consistency_level cl, dht::partition_range_vector::iterator&& i,
dht::partition_range_vector&& ranges, int concurrency_factor, tracing::trace_state_ptr trace_state,
uint32_t remaining_row_count, uint32_t remaining_partition_count);

View File

@@ -84,6 +84,7 @@ static const sstring RANGE_TOMBSTONES_FEATURE = "RANGE_TOMBSTONES";
static const sstring LARGE_PARTITIONS_FEATURE = "LARGE_PARTITIONS";
static const sstring MATERIALIZED_VIEWS_FEATURE = "MATERIALIZED_VIEWS";
static const sstring COUNTERS_FEATURE = "COUNTERS";
+static const sstring CORRECT_COUNTER_ORDER_FEATURE = "CORRECT_COUNTER_ORDER";
distributed<storage_service> _the_storage_service;
@@ -123,6 +124,7 @@ sstring storage_service::get_config_supported_features() {
std::vector<sstring> features = {
RANGE_TOMBSTONES_FEATURE,
LARGE_PARTITIONS_FEATURE,
+CORRECT_COUNTER_ORDER_FEATURE,
};
if (service::get_local_storage_service()._db.local().get_config().experimental()) {
features.push_back(MATERIALIZED_VIEWS_FEATURE);
@@ -476,16 +478,6 @@ void storage_service::join_token_ring(int delay) {
#endif
if (!_is_survey_mode) {
-// We have to create the system_auth and system_traces keyspaces and
-// their tables before Node moves to the NORMAL state so that other
-// Nodes joining the newly created cluster and serializing on this event
-// "see" these new objects and don't try to create them.
-//
-// Otherwise there is a high chance to hit the issue #420.
-auth::auth::setup().get();
-supervisor::notify("starting tracing");
-tracing::tracing::start_tracing().get();
// start participating in the ring.
db::system_keyspace::set_bootstrap_state(db::system_keyspace::bootstrap_state::COMPLETED).get();
set_tokens(_bootstrap_tokens);
@@ -501,6 +493,9 @@ void storage_service::join_token_ring(int delay) {
logger.error(err.c_str());
throw std::runtime_error(err);
}
+auth::auth::setup().get();
+supervisor::notify("starting tracing");
+tracing::tracing::start_tracing().get();
} else {
logger.info("Startup complete, but write survey mode is active, not becoming an active ring member. Use JMX (StorageService->joinRing()) to finalize ring joining.");
}
@@ -1348,6 +1343,7 @@ future<> storage_service::init_server(int delay) {
get_storage_service().invoke_on_all([] (auto& ss) {
ss._range_tombstones_feature = gms::feature(RANGE_TOMBSTONES_FEATURE);
ss._large_partitions_feature = gms::feature(LARGE_PARTITIONS_FEATURE);
+ss._correct_counter_order_feature = gms::feature(CORRECT_COUNTER_ORDER_FEATURE);
if (ss._db.local().get_config().experimental()) {
ss._materialized_views_feature = gms::feature(MATERIALIZED_VIEWS_FEATURE);

View File

@@ -262,6 +262,7 @@ private:
gms::feature _large_partitions_feature;
gms::feature _materialized_views_feature;
gms::feature _counters_feature;
+gms::feature _correct_counter_order_feature;
public:
void enable_all_features() {
@@ -269,6 +270,7 @@ public:
_large_partitions_feature.enable();
_materialized_views_feature.enable();
_counters_feature.enable();
+_correct_counter_order_feature.enable();
}
void finish_bootstrapping() {
@@ -2230,6 +2232,10 @@ public:
bool cluster_supports_counters() const {
return bool(_counters_feature);
}
+bool cluster_supports_correct_counter_order() const {
+return bool(_correct_counter_order_feature);
+}
};
inline future<> init_storage_service(distributed<database>& db) {

View File

@@ -31,9 +31,9 @@ class sstable_range_wrapping_reader final : public mutation_reader::impl {
public:
sstable_range_wrapping_reader(lw_shared_ptr<sstables::sstable> sst,
schema_ptr s, const dht::partition_range& pr, const query::partition_slice& slice,
-const io_priority_class& pc)
+const io_priority_class& pc, mutation_reader::forwarding fwd_mr)
: _sst(sst)
-, _smr(sst->read_range_rows(std::move(s), pr, slice, pc)) {
+, _smr(sst->read_range_rows(std::move(s), pr, slice, pc, fwd_mr)) {
}
virtual future<streamed_mutation_opt> operator()() override {
return _smr.read();

View File

@@ -47,6 +47,7 @@
#include <boost/range/algorithm.hpp>
#include <boost/range/adaptors.hpp>
#include <boost/range/join.hpp>
+#include <boost/algorithm/cxx11/any_of.hpp>
#include "core/future-util.hh"
#include "core/pipe.hh"
@@ -382,11 +383,22 @@ get_fully_expired_sstables(column_family& cf, std::vector<sstables::shared_sstab
}
}
+auto compacted_undeleted_gens = boost::copy_range<std::unordered_set<int64_t>>(cf.compacted_undeleted_sstables()
+| boost::adaptors::transformed(std::mem_fn(&sstables::sstable::generation)));
+auto has_undeleted_ancestor = [&compacted_undeleted_gens] (auto& candidate) {
+return boost::algorithm::any_of(candidate->ancestors(), [&compacted_undeleted_gens] (auto gen) {
+return compacted_undeleted_gens.count(gen);
+});
+};
// SStables that do not contain live data is added to list of possibly expired sstables.
for (auto& candidate : compacting) {
logger.debug("Checking if candidate of generation {} and max_deletion_time {} is expired, gc_before is {}",
candidate->generation(), candidate->get_stats_metadata().max_local_deletion_time, gc_before);
-if (candidate->get_stats_metadata().max_local_deletion_time < gc_before) {
+// A fully expired sstable which has an ancestor undeleted shouldn't be compacted because
+// expired data won't be purged because undeleted sstables are taken into account when
+// calculating max purgeable timestamp, and not doing it could lead to a compaction loop.
+if (candidate->get_stats_metadata().max_local_deletion_time < gc_before && !has_undeleted_ancestor(candidate)) {
logger.debug("Adding candidate of generation {} to list of possibly expired sstables", candidate->generation());
candidates.push_back(candidate);
} else {

View File

@@ -242,11 +242,12 @@ void compaction_manager::submit_sstable_rewrite(column_family* cf, sstables::sha
// sstable we are planning to work on:
_compacting_sstables.insert(sst);
auto task = make_lw_shared<compaction_manager::task>();
task->compacting_cf = cf;
_tasks.push_back(task);
-task->compaction_done = with_semaphore(sem, 1, [this, cf, sst] {
+task->compaction_done = with_semaphore(sem, 1, [this, task, cf, sst] {
_stats.active_tasks++;
-if (_stopped) {
-return make_ready_future<>();;
+if (!can_proceed(task)) {
+return make_ready_future<>();
}
return cf->compact_sstables(sstables::compaction_descriptor(
std::vector<sstables::shared_sstable>{sst},
@@ -462,6 +463,14 @@ future<> compaction_manager::perform_cleanup(column_family* cf) {
}
future<> compaction_manager::remove(column_family* cf) {
// FIXME: better way to iterate through compaction info for a given column family,
// although this path isn't performance sensitive.
for (auto& info : _compactions) {
if (cf->schema()->ks_name() == info->ks && cf->schema()->cf_name() == info->cf) {
info->stop("column family removal");
}
}
// We need to guarantee that a task being stopped will not retry to compact
// a column family being removed.
auto tasks_to_stop = make_lw_shared<std::vector<lw_shared_ptr<task>>>();

Some files were not shown because too many files have changed in this diff.