Compare commits

...

119 Commits

Author SHA1 Message Date
Avi Kivity
a24dcf1a19 Update seastar submodule
* seastar 18a82e2...8e2f629 (1):
  > future-utils: fix do_for_each exception reporting

Fixes a bug during a failed repair.
2017-07-06 17:32:37 +03:00
Raphael S. Carvalho
611c25234e database: fix potential use-after-free in sstable cleanup
When do_for_each is in its last iteration and with_semaphore defers
because there's an ongoing cleanup, the sstable object is used after
being freed, because it was taken by reference and the container it
lives in was destroyed prematurely.

Let's fix it with a do_with, which also makes the code nicer.

Fixes #2537.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170630035324.19881-1-raphaelsc@scylladb.com>
(cherry picked from commit b9d0645199)
2017-07-03 12:49:34 +03:00
Amos Kong
f64e3e24d4 common/scripts: fix node_exporter url
Commit ff3d83bc2f updated node_exporter
from 0.12.0 to 0.14.0, and introduced a bug in downloading the install file.

node_exporter started adding a 'v' prefix to release tags[1] from 0.13.0,
so we need to fix the URL.

[1] https://github.com/prometheus/node_exporter/tags

Fixes #2509

Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <42b0a7612539a34034896d404d63a0a31ce79e10.1497919368.git.amos@scylladb.com>
(cherry picked from commit 92731eff4f)
2017-06-22 08:51:35 +03:00
Shlomi Livne
f6034c717d release: prepare 1.7.2
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-06-21 22:09:31 +03:00
Amos Kong
b6f4df3cc8 scylla_setup: fix infinite loop on invalid option input
example: # scylla_setup --invalid-opt

Fixes #2305

Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <9a4f631b126d8eaaae479fa99137db7a61a7c869.1493135357.git.amos@scylladb.com>
(cherry picked from commit f655639e5a)
2017-06-19 22:32:38 +03:00
Amnon Heiman
af028360d7 node_exporter_install script update version to 0.14
Fixes #2097

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <20170612125724.7287-1-amnon@scylladb.com>
(cherry picked from commit ff3d83bc2f)
2017-06-18 12:28:19 +03:00
Duarte Nunes
60af7eab10 udt: Don't check a type is unused after applying the schema mutations
This patch is based on 6c8b5fc. It moves the check whether a dropped
type is still used by other types or tables from schema_tables to
the drop_type_statement, as delaying this check to after applying the
mutations can leave the keyspace in a broken state.

Fixes #2490

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1497466736-28841-1-git-send-email-duarte@scylladb.com>
2017-06-15 10:35:01 +03:00
Calle Wilund
665d14584c database: Fix assert in truncate to handle empty memtables+sstables
If we do two truncates in a row, the second will have neither memtable
nor sstable data. Thus we will not write/remove sstables, and will
get no resulting truncation replay position.

Fixes #2489

Message-Id: <1497378469-6063-1-git-send-email-calle@scylladb.com>

(cherry picked from commit 525730e135)
2017-06-14 16:25:57 +03:00
Gleb Natapov
bb56e7682c Fix use after free in nonwrapping_range::intersection
end_bound() returns a temporary object (end_bound_ref), so it cannot be
taken by reference here and used later. Copy it instead.

Message-Id: <20170612132328.GJ21915@scylladb.com>

(cherry picked from commit 21197981a)

Fixes #2482
2017-06-14 12:08:06 +01:00
Avi Kivity
a4bd56ce40 tests: fix partitioner_test build on gcc 5 2017-06-13 21:56:02 +03:00
Calle Wilund
6340fe61af commitlog_test: Fix test_commitlog_delete_when_over_disk_limit
The test should:
a.) wait for the flush semaphore;
b.) only compare segment sets between start and end, not start,
    end and in between. I.e. the test sort of assumed we started
    with < 2 (or so) segments. Not always the case (timing).

Message-Id: <1496828317-14375-1-git-send-email-calle@scylladb.com>
(cherry picked from commit 0c598e5645)
2017-06-13 19:53:13 +03:00
Asias He
f2317a6f3f repair: Fix range use after free
Capture it by value.

scylla:  [shard 0] repair - repair's stream failed: streaming::stream_exception (Stream failed)
scylla:  [shard 0] repair - Failed sync of range ==<runtime_exception
(runtime error: Invalid token. Should have size 8, has size 0#012)>: streaming::stream_exception (Stream failed)

Message-Id: <7fda4432e54365f64b556e7e4c26e36d3a9bb1b7.1497238229.git.asias@scylladb.com>
(cherry picked from commit 2bcb368a13)
2017-06-13 11:03:14 +03:00
Paweł Dziepak
7bb41b50f9 commitlog: avoid copying column_mapping
It is safe to copy column_mapping across shards. Such a guarantee comes
at the cost of performance.

This patch makes commitlog_entry_writer use IDL generated writer to
serialise commitlog_entry so that column_mapping is not copied. This
also simplifies commitlog_entry itself.

Performance difference tested with:
perf_simple_query -c4 --write --duration 60
(medians)
          before       after      diff
write   79434.35    89247.54    +12.3%

(cherry picked from commit 374c8a56ac)

Also: Fixes #2468.
2017-06-11 15:44:20 +03:00
Paweł Dziepak
57d602fdd6 idl: fix generated writers when member functions are used
When using a member name in an identifier of a generated class or method,
the IDL compiler should strip the trailing '()'.

(cherry picked from commit 4df4994b71)

(part of #2468)
2017-06-11 15:43:53 +03:00
Paweł Dziepak
cd14b83192 idl: add start_frame() overload for seastar::simple_output_stream
(cherry picked from commit 018d16d315)

(part of #2468)
2017-06-11 15:43:11 +03:00
Avi Kivity
a85b70d846 Merge "repair memory usage fix" from Asias
"This series switches repair to use more stream plans to stream the mismatched
sub ranges and use a range generator to produce sub ranges.

Test shows no huge memory is used for repair with large data set.

In addition, we now have a progress reporter in the log showing how many ranges have been processed.

   Jun 06 14:18:22  [shard 0] repair - Repair 512 out of 529 ranges, id=1, keyspace=myks, cf=mytable, range=(8526136029525195375, 8549482295083869942]
   Jun 06 14:19:55  [shard 0] repair - Repair 513 out of 529 ranges, id=1, keyspace=myks, cf=mytable, range=(8526136029525195375, 8549482295083869942]

Fixes #2430."

* tag 'asias/fix-repair-2430-branch-master-v1' of github.com:cloudius-systems/seastar-dev:
  repair: Remove unused sub_ranges_max
  repair: Reduce parallelism in repair_ranges
  repair: Tweak the log a bit
  repair: Use more stream_plan
  repair: iterator over subranges instead of list

(cherry picked from commit 419ad9d6cb)
2017-06-08 14:52:28 +03:00
Avi Kivity
f44ea5335b Update seastar submodule
* seastar 812e232...18a82e2 (1):
  > scripts: posix_net_conf.sh: fix bash syntax causing a failure during bonding iface configuration

Fixes #2269
2017-06-07 18:23:02 +03:00
Pekka Enberg
a95c045b48 Merge "Fixes to thrift/server" from Duarte
"This series fixes some issues with the thrift_server, namely
ensuring that streams and sockets are properly closed.

Fixes #499
Fixes #2437"

* 'thrift-server-fixes/v1' of github.com:duarten/scylla:
  thrift/server: Close connections when stopping server
  thrift/server: Move connection class to header
  thrift/server: Shutdown connection
  thrift/server: Close output_stream when connection is done

(cherry picked from commit a6dc21615b)
2017-06-07 16:08:28 +03:00
Avi Kivity
eb396d2795 Update seastar submodule
* seastar 328fdbc...812e232 (1):
  > rpc: handle messages larger than memory limit

Fixes #2453.
2017-06-07 12:29:59 +03:00
Takuya ASADA
dbbf99d7fa dist/debian: install gdebi when it is not present
Since we started using gdebi to install the build-dep metapackage generated by
mk-build-deps, we need to install gdebi in build_deb.sh too.

Fixes #2451

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1496819209-30318-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 7fe63c539a)
2017-06-07 10:25:02 +03:00
Raphael S. Carvalho
f7a143e7be sstables: fix report of disk space used by bloom filter
After the change in boot, read_filter is called by the distributed loader,
so its update to _filter_file_size is lost. The load variant
which receives foreign components must do it instead. We were also
not updating it for newly created sstables.

Fixes #2449.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170606151129.5477-1-raphaelsc@scylladb.com>
(cherry picked from commit 0ca1e5cca3)
2017-06-06 19:00:00 +03:00
Takuya ASADA
562102cc76 dist/debian: use gdebi instead of mk-build-deps -i
At least on Debian 8, mk-build-deps -i silently finishes with return code 0
even if it fails to install dependencies.
To prevent this, we should manually install the metapackage generated by
mk-build-deps using gdebi.

Fixes #2445

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1496737502-10737-2-git-send-email-syuu@scylladb.com>
(cherry picked from commit a4c392c113)
2017-06-06 14:18:14 +03:00
Takuya ASADA
d4b444418a dist/debian/dep: install texlive from jessie-backports to prevent gdb build fail on jessie
Installing openjdk-8-jre-headless from jessie-backports breaks texlive on
the jessie main repo.
It causes an 'Unmet build dependencies' error when building the gdb package.
To prevent this, force installing texlive from jessie-backports before
starting to build gdb.

Fixes #2444

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1496737502-10737-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 5608842e96)
2017-06-06 14:18:08 +03:00
Raphael S. Carvalho
befd4c9819 db: fix computation of live disk usage stat after compaction
sstable::data_size() is used by rebuild_statistics(), but it only
returns the uncompressed data size, while the function called by it
expects the actual disk space used by all components.
Boot uses add_sstable(), which correctly updates the stat with
sstable::bytes_on_disk(). That's what needs to be used by
r__s() too.

Fixes #1592

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170525210055.6391-1-raphaelsc@scylladb.com>
(cherry picked from commit 3b5ad23532)
2017-05-28 10:39:14 +03:00
Avi Kivity
eb2fe0fbd3 Merge "reduce memory requirement for loading sstables" from Raphael
"fixes a problem in which the memory requirement for loading in-memory
components of sstables is very high due to unlimited parallelism."

* 'mem_requirement_sstable_load_v2_2' of github.com:raphaelsc/scylla:
  database: fix indentation of distributed_loader::open_sstable
  database: reduce memory requirement to load sstables
  sstables: loads components for a sstable in parallel
  sstables: enable read ahead for read of in-memory components
  sstables: make random_access_reader work with read ahead

(cherry picked from commit ef428d008c)
2017-05-25 12:59:55 +03:00
Raphael S. Carvalho
eb6b0b1267 db: remove partial sstable created by memtable flush which failed
Partial sstable files aren't being removed after each failed attempt
to flush a memtable, which happens periodically. If the cause of the
failure is ENOSPC, the memtable flush will be attempted forever, and
as a result, the column family may be left with a huge number of partial
files which will overwhelm a subsequent boot when removing temporary
TOCs. In the past, this led to OOM because removal of temporary TOCs
took place in parallel.

Fixes #2407.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170525015455.23776-1-raphaelsc@scylladb.com>
(cherry picked from commit b7e1575ad4)
2017-05-25 11:50:17 +03:00
Asias He
7836600ded streaming: Do not abort session too early in idle detection
Streaming usually takes a long time to complete. Aborting it on
false-positive idle detection can be very wasteful.

Increase the abort timeout from 10 minutes to a very large timeout, 300
minutes. A genuinely idle session will eventually be aborted anyway by
other mechanisms, e.g., the streaming manager's gossip callbacks for the
on_remove and on_restart events.

Fixes #2197

Message-Id: <57f81bfebfdc6f42164de5a84733097c001b394e.1494552921.git.asias@scylladb.com>
(cherry picked from commit f792c78c96)
2017-05-24 12:30:47 +03:00
Shlomi Livne
230c33da49 release: prepare for 1.7.1
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-05-23 22:42:52 +03:00
Raphael S. Carvalho
17d8a0c727 compaction: do not write expired cell as dead cell if it can be purged right away
When compacting a fully expired sstable, we're not allowing that sstable
to be purged, because an expired cell is *unconditionally* converted into
a dead cell. Why not check whether the expired cell can be purged instead,
using gc_before and the max purgeable timestamp?

Currently, we need two compactions to get rid of a fully expired sstable
whose cells could always have been purged.

Look at this sstable with an expired cell:
  {
    "partition" : {
      "key" : [ "2" ],
      "position" : 0
    },
    "rows" : [
      {
        "type" : "row",
        "position" : 120,
        "liveness_info" : { "tstamp" : "2017-04-09T17:07:12.702597Z",
"ttl" : 20, "expires_at" : "2017-04-09T17:07:32Z", "expired" : true },
        "cells" : [
          { "name" : "country", "value" : "1" },
        ]

Now the sstable data after the first compaction:
[shard 0] compaction - Compacted 1 sstables to [...]. 120 bytes to 79
(~65% of original) in 229ms = 0.000328997MB/s.

  {
    ...
    "rows" : [
      {
        "type" : "row",
        "position" : 79,
        "cells" : [
          { "name" : "country", "deletion_info" :
{ "local_delete_time" : "2017-04-09T17:07:12Z" },
            "tstamp" : "2017-04-09T17:07:12.702597Z"
          },
        ]

Now another compaction will actually get rid of the data:
compaction - Compacted 1 sstables to []. 79 bytes to 0 (~0% of original)
in 1ms = 0MB/s. ~2 total partitions merged to 0

NOTE:
It's a waste of time to wait for the second compaction, because the
expired cell could have been purged in the first compaction, as it
satisfied gc_before and the max purgeable timestamp.

Fixes #2249, #2253

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170413001049.9663-1-raphaelsc@scylladb.com>
(cherry picked from commit a6f8f4fe24)
2017-05-23 20:57:54 +03:00
Tomasz Grabiec
064de6f8de row_cache: Fix undefined behavior in read_wide()
_underlying is created with _range, which is captured by
reference. But range_and_underlying_reader is moved after being
constructed by do_with(), so the _range reference is invalidated.

Fixes #2377.
Message-Id: <1494492025-18091-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 0351ab8bc6)
2017-05-21 19:09:03 +03:00
Gleb Natapov
df56c108b7 database: remove temporary sstables sequentially
The code that removes each sstable runs in a thread. Removing a lot of
sstables in parallel may start a lot of threads, each of which
takes 128k for its stack. There is not much benefit in running
deletion in parallel anyway, so fix this by deleting sstables sequentially.

Fixes #2384

Message-Id: <20170516103018.GQ3874@scylladb.com>
(cherry picked from commit c7ad3b9959)
2017-05-21 18:56:22 +03:00
Tomasz Grabiec
25607ab9df range: Fix SFINAE rule for picking the best do_lower_bound()/do_upper_bound() overload
mutation_partition has a slicing constructor which is supposed to copy
only the rows from the query range. The rows are located using
nonwrapping_range::lower_bound() and
nonwrapping_range::upper_bound(). Those two have two different
implementations chosen with SFINAE. One uses std::lower_bound(),
and one uses the container's built-in lower_bound(), should it
exist. We're using an intrusive tree in mutation_partition, so the
container's lower_bound() is preferred. It's O(log N) whereas
std::lower_bound() is O(N), because the tree's iterator is not random
access.

However, the current rule for picking container's lower_bound() never
triggers, because lower_bound() has two overloads in the container:

  ./range.hh:618:14: error: decltype cannot resolve address of overloaded function
              typename = decltype(&std::remove_reference<Range>::type::upper_bound)>
              ^~~~~~~~

As a result, the overload which uses std::lower_bound() is used.

Spotted when running perf_fast_forward with the wide-partition limit in
cache lifted. It's so slow that I timed out waiting for the result
(> 16 min).

Fixes #2395.

Message-Id: <1495048614-9913-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 3fc1703ccf)
2017-05-18 17:12:00 +03:00
Avi Kivity
b26bd8bbeb tests: fix partitioner_test for g++ 5
It can't make the leap from dht::ring_position to
stdx::optional<range_bound<dht::ring_position>> for some reason.

(cherry picked from commit ba31619594)
2017-05-18 13:10:48 +03:00
Avi Kivity
1ca7f5458b Update seastar submodule
> tls: make shutdown/close do "clean" handshake shutdown in background
  > tls: Make sink/source (i.e. streams) first class channel owners
  > native-stack: Make sink/source (i.e. streams) first class channel owners

More close() fixes, pointed out by Tomek.
2017-05-17 19:01:44 +03:00
Calle Wilund
50c8a08e91 scylla: fix compilation errors on gcc 5
Message-Id: <1495030581-2138-1-git-send-email-calle@scylladb.com>
(cherry picked from commit 6ca07f16c1)
2017-05-17 18:04:58 +03:00
Avi Kivity
9d1b9084ed Update seastar submodule
* seastar bfa1cb2...774c09c (1):
  > posix-stack: Make sink/source (i.e. streams) first class channel owners
2017-05-17 16:44:34 +03:00
Tomasz Grabiec
e2c75d8532 Merge "Fix performance problems with high shard counts tag" from Avi
From http://github.com/avikivity/scylla exponential-sharder/v3.

The sharder, which takes a range of tokens and splits it among shards, is
slow with large shard count and the default
murmur3_partitioner_ignore_msb_bits.

This patchset fixes excessive iteration in sstable sharding metadata writer and
nonsignular range scans.

Without this patchset, sealing a memtable takes > 60 ms on a 48-shard
system.  With the patchset, it drops below the latency tracker threshold I
used (5 ms).

Fixes #2392.

(cherry picked from commit 84648f73ef)
2017-05-17 16:19:24 +03:00
Duarte Nunes
59063f4891 tests: Add test case for nonwrapping_range::intersection()
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
(cherry picked from commit f365b7f1f7)
2017-05-17 15:59:06 +03:00
Duarte Nunes
de79792373 nonwrapping_range: Add intersection() function
intersection() returns an optional range with the intersection of
this range and the other, specified range.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
(cherry picked from commit 1f9359efba)
2017-05-17 15:58:55 +03:00
Avi Kivity
3557b449ac Merge "Adding private repository to housekeeping" from Amnon
"This series adds private repository support to scylla-housekeeping"

* 'amnon/housekeeping_private_repo_v3' of github.com:cloudius-systems/seastar-dev:
  scylla-housekeeping service: Support private repositories
  scylla-housekeeping-upstart: Use repository id, when checking for version
  scylla-housekeeping: support private repositories

(cherry picked from commit eb69fe78a4)
2017-05-17 15:58:29 +03:00
Pekka Enberg
a8e89d624a cql3: Fix variable_specifications class get_partition_key_bind_indexes()
The "_specs" array contains column specifications that have the bind
marker name if there is one. That results in
get_partition_key_bind_indices() not being able to look up a column
definition for such columns. Fix the issue by keeping track of the
actual column specifications passed to add() like Cassandra does.

Fixes #2369

(cherry picked from commit a45e656efb4c6478d80e4dfc18de99b94712eeba)
2017-05-10 10:00:47 +03:00
Pekka Enberg
31cd6914a8 cql3: Move variable_specifications implementation to source file
Move the class implementation to source file to reduce the need to
recompile everything when the implementation changes...

Message-Id: <1494312003-8428-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit 5b931268d4)
2017-05-10 10:00:31 +03:00
Pekka Enberg
a441f889c3 cql3: Fix partition key bind indices for prepared statements
Fix the CQL front-end to populate the partition key bind index array in
result message prepared metadata, which is needed for CQL binary
protocol v4 to function correctly.

Fixes #2355.

(cherry picked from commit ebd76617276e660c590cec0a07e97e82422111df)

Tested-by: Shlomi Livne <shlomi@scylladb.com>
Message-Id: <1494257274-1189-1-git-send-email-penberg@scylladb.com>
2017-05-10 10:00:21 +03:00
Pekka Enberg
91b7cb8576 Merge "gossip mark alive fixes" from Asias
"This series fixes the use-after-free issue in gossip and eliminates the
duplicated / unnecessary mark alive operations.

Fixes #2341"

* tag 'asias/gossip_fix_mark_alive/v1' of github.com:cloudius-systems/seastar-dev:
  gossip: Ignore callbacks and mark alive operation in shadow round
  gossip: Ingore the duplicated mark alive operation
  gossip: Fix user after free in mark_alive

(cherry picked from commit 1e04731fa0)
2017-05-09 01:57:23 +03:00
Avi Kivity
2b17c4aacf Merge "Fix update of counter in static rows" from Paweł
"The logic responsible for converting counter updates to counter shards was
not covered by unit tests and didn't transform counter cells inside static
rows.

This series fixes the problem and makes sure that the tests cover both
static rows and transformation logic.

Fixes #2334."

* tag 'pdziepak/static-counter-updates-1.7/v1' of github.com:cloudius-systems/seastar-dev:
  tests/counter: test transform_counter_updates_to_shards
  tests/counter: test static columns
  counters: transform static rows from updates to shards
2017-05-06 15:54:20 +03:00
Pekka Enberg
f61d9ac632 release: prepare for 1.7.0 2017-05-04 15:28:28 +03:00
Asias He
fc9db8bb03 repair: Fix partition estimation
We estimate the number of partitions for a given range of a column family
and split the range into sub-ranges containing fewer partitions as a
checksum unit.

The estimation is wrong, because we need to count the partitions on all
the shards, instead of only counting the local shard.

Fixes #2299

Message-Id: <7876285bd26cfaf65563d6e03ec541626814118a.1493817339.git.asias@scylladb.com>
(cherry picked from commit 66e3b73b9c)
2017-05-03 16:26:01 +03:00
Paweł Dziepak
bd67d23927 tests/counter: test transform_counter_updates_to_shards 2017-05-02 13:49:43 +01:00
Paweł Dziepak
bdeeebbd74 tests/counter: test static columns 2017-05-02 13:49:43 +01:00
Paweł Dziepak
a1cb29e7ec counters: transform static rows from updates to shards 2017-05-02 13:49:43 +01:00
Amnon Heiman
e8369644fd scylla_setup: Fix conditional when checking for newer version
During the changes to the way housekeeping checks for a newer version
and warns about it during installation, the UUID part was removed but
kept in the surrounding if.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <20170426075724.7132-1-amnon@scylladb.com>
(cherry picked from commit b59c95359d)
2017-05-01 12:14:04 +03:00
Glauber Costa
a36cabdb30 reduce kernel scheduler wakeup granularity
We set the scheduler wakeup granularity to 500usec, because that is the
difference in runtime we want to see from a waking task before it
preempts the running task (which will usually be Scylla). Scheduling
other processes less often is usually good for Scylla, but in this case,
one of the "other processes" is also a Scylla thread, the one we have
been using for marking ticks after we have abandoned signals.

However, there is an artifact of the Linux scheduler that causes those
preemptions to be missed if the wakeup granularity is exactly half of
the sched_latency. Our sched_latency is set to 1ms, which
represents the maximum time period in which we will run all runnable
tasks.

We want to keep the sched_latency at 1ms, so we will reduce the wakeup
granularity to something slightly lower than 500usec, to make sure
that this artifact won't affect the scheduler calculations. 499.99usec
would do according to my tests, but we will reduce it to a round
number.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20170427135039.8350-1-glauber@scylladb.com>
(cherry picked from commit 14b9aa2285)
2017-05-01 11:13:51 +03:00
Raphael S. Carvalho
1d26fab73e sstables: add method to export ancestors
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-05-01 11:09:42 +03:00
Shlomi Livne
5f0c635da7 release: prepare for 1.7.rc3
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-05-01 09:53:20 +03:00
Raphael S. Carvalho
82cc3d7aa5 dtcs: do not compact fully expired sstable which ancestor is not deleted yet
Currently, a fully expired sstable[1] is unconditionally chosen for compaction
by DTCS, but that may lead to a compaction loop under certain conditions.

Consider that an almost-expired sstable is compacted and not yet
deleted, and that the new sstable becomes expired before its ancestor is
deleted.
Because this new sstable is expired, it will be chosen by DTCS, but it will
not be purged, because 'compacted undeleted' sstables are taken into account
by the calculation of the max purgeable timestamp and prevent expired data
from being purged. The problem is that this sequence of events can keep
happening forever, as reported by issue #2260.
NOTE: This problem was easier to reproduce before the improvement to
compaction of expired cells, because a fully expired sstable was being
converted into an sstable full of tombstones, which is also considered
fully expired.

Fixes #2260.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170428233554.13744-1-raphaelsc@scylladb.com>
(cherry picked from commit 687a4bb0c2)
2017-04-30 19:36:00 +03:00
Paweł Dziepak
98d782cfe1 db: make virtual dirty soft limit configurable
Message-Id: <20170428150005.28454-1-pdziepak@scylladb.com>
(cherry picked from commit 24f4dcf9e4)
2017-04-30 19:17:55 +03:00
Avi Kivity
ea0591ad3d Merge "Fix problems with slicing using sstable's promoted index" from Tomasz
"Fixes #2327.
Fixes #2326."

* 'tgrabiec/fix-promoted-index-parsing-1.7' of github.com:cloudius-systems/seastar-dev:
  sstables: Fix incorrect parsing of cell names in promoted index
  sstables: Fix find_disk_ranges() to not miss relevant range tombstones
2017-04-30 14:48:54 +03:00
Paweł Dziepak
7eedd743bf lsa: introduce upper bound on zone size
Attempting to create huge zones may introduce significant latency. This
patch introduces a maximum allowed zone size so that the time spent
trying to allocate and initialise a zone is bounded.

Fixes #2335.

Message-Id: <20170428145916.28093-1-pdziepak@scylladb.com>
(cherry picked from commit f5cf86484e)
2017-04-30 10:58:34 +03:00
Tomasz Grabiec
8a21961ec9 sstables: Fix incorrect parsing of cell names in promoted index
Range tombstones are serialized to cell names in this place:

  _sst.maybe_flush_pi_block(_out, start, {});

Note that the column set is empty. This is correct. A range tombstone
only has a clustering part. The cell name is deserialized by promoted
index reader using mp_row_consumer::column, like this:

   mp_row_consumer::column col(schema, std::move(col_name), api::max_timestamp);
   return std::move(col.clustering);

The problem is that the column constructor assumes that there is always
a component corresponding to a cell name if the table is not dense, and
will pop it from the set of components (the clustering field):

  , cell(!schema.is_dense() ? pop_back(clustering) : (*(schema.regular_begin())).name())

A promoted index block which starts or ends with a range tombstone will
appear as having incorrect bounds. This may result in an incorrect
value being calculated for the data file range start.

Fixes #2327.
2017-04-27 18:30:00 +02:00
Tomasz Grabiec
08698d9030 sstables: Fix find_disk_ranges() to not miss relevant range tombstones
Suppose the promoted index looks like this:

block0: start=1 end=2
block1: start=4 end=5

start and end are cell names of the first and last cell in the block.

If there is a range tombstone covering [2,3], it will be only in
block0, because it is no longer in effect when block1 starts. However,
slicing the index for [3, +inf], which intersects with the tombstone,
will yield block1. That's because the slicing looks for a block with
an end which is greater than or equal to the start of the slice:

 if (!found_range_start) {
    if (!range_start || cmp(range_start->value(), end_ck) <= 0) {
       range_start_pos = ie.position() + offset;

We should take into account that any given block may actually contain
information for anything up to the start of the next block, so instead
of using end_ck, effectively use next block's start_ck (exclusive).

Fixes #2326.
2017-04-27 18:30:00 +02:00
Tomasz Grabiec
df5a291c63 sstables: Fix usage of wrong comparator in find_disk_ranges()
This made a difference if clustering restriction bounds were not full
keys but prefixes.

Fixes #2272.

Message-Id: <1493058357-24156-1-git-send-email-tgrabiec@scylladb.com>
2017-04-24 21:56:07 +03:00
Avi Kivity
1a77312aec Merge "Reduce memory reclamation latency" from Tomasz
"Currently eviction is performed until occupancy of the whole region
drops below the 85% threshold. This may take a while if the region had
high occupancy and is large. We could improve the situation by only
evicting until occupancy of the sparsest segment drops below the
threshold, as is done by this change.

I tested this using a c-s read workload in which the condition
triggers in the cache region, with 1G per shard:

 lsa-timing - Reclamation cycle took 12.934 us.
 lsa-timing - Reclamation cycle took 47.771 us.
 lsa-timing - Reclamation cycle took 125.946 us.
 lsa-timing - Reclamation cycle took 144356 us.
 lsa-timing - Reclamation cycle took 655.765 us.
 lsa-timing - Reclamation cycle took 693.418 us.
 lsa-timing - Reclamation cycle took 509.869 us.
 lsa-timing - Reclamation cycle took 1139.15 us.

The 144ms pause is when large eviction is necessary.

Statistics for reclamation pauses for a read workload over
larger-than-memory data set:

Before:

 avg = 865.796362
 stdev = 10253.498038
 min = 93.891000
 max = 264078.000000
 sum = 574022.988000
 samples = 663

After:

 avg = 513.685650
 stdev = 275.270157
 min = 212.286000
 max = 1089.670000
 sum = 340573.586000
 samples = 663

Refs #1634."

* tag 'tgrabiec/lsa-reduce-reclaim-latency-v3' of github.com:cloudius-systems/seastar-dev:
  lsa: Reduce reclamation latency
  tests: Add test for log_histogram
  log_histogram: Allow non-power-of-two minimum values
  lsa: Use regular compaction threshold in on-idle compaction
  tests: row_cache_test: Induce update failure more reliably
  lsa: Add getter for region's eviction function

(cherry picked from commit fccbf2c51f)

[avi: adjustments for 1.7's heap vs. master's log_histogram]
2017-04-21 22:12:52 +03:00
Duarte Nunes
ea684c9a3e alter_type_statement: Fix signed to unsigned conversion
This could allow us to alter a non-existing field of a UDT.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170419114254.5582-1-duarte@scylladb.com>
(cherry picked from commit e06bafdc6c)
2017-04-19 14:48:27 +03:00
Raphael S. Carvalho
2df7c80c66 compaction_manager: fix crash when dropping a resharding column family
The problem is that the column family field of the task wasn't being set
for resharding, so the column family wasn't being properly removed from
the compaction manager.
In addition to fixing this issue, we'll also interrupt ongoing compactions
when dropping a column family, exactly like we do on shutdown.

Fixes #2291.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170418125807.7712-1-raphaelsc@scylladb.com>
(cherry picked from commit e78db43b79)
2017-04-18 17:40:09 +03:00
Raphael S. Carvalho
193b5d1782 partitioned_sstable_set: fix quadratic space complexity
Streaming generates lots of small sstables with large token ranges,
which triggers O(N^2) space usage in the interval map.
Level 0 sstables will now be stored in a structure that has O(N)
space complexity and which will be included in every read.

Fixes #2287.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170417185509.6633-1-raphaelsc@scylladb.com>
(cherry picked from commit 11b74050a1)
2017-04-18 13:05:00 +03:00
Asias He
6609c9accb gossip: Fix possible use-after-free of entry in endpoint_state_map
We take a reference to an endpoint_state entry in endpoint_state_map. We
access it again after code which defers; the reference can be invalid
after the defer if someone deletes the entry in the meantime.

Fix this by taking the reference again after the deferring code.

I also audited the code to remove unsafe reference to endpoint_state_map entry
as much as possible.

Fixes the following SIGSEGV:

Core was generated by `/usr/bin/scylla --log-to-syslog 1 --log-to-stdout
0 --default-log-level info --'.
Program terminated with signal SIGSEGV, Segmentation fault.
(this=<optimized out>) at /usr/include/c++/5/bits/stl_pair.h:127
127     in /usr/include/c++/5/bits/stl_pair.h
[Current thread is 1 (Thread 0x7f1448f39bc0 (LWP 107308))]

Fixes #2271

Message-Id: <529ec8ede6da884e844bc81d408b93044610afd2.1491960061.git.asias@scylladb.com>
(cherry picked from commit d27b47595b)
2017-04-13 13:18:41 +03:00
Pekka Enberg
2f107d3f61 Update seastar submodule
* seastar 211ab4a...bfa1cb2 (1):
  > resource: reduce default_reserve_memory size to fit low memory environment

Fixes #2186
2017-04-12 08:41:40 +03:00
Takuya ASADA
dd9afa4c93 dist/debian/debian/scylla-server.upstart: export SCYLLA_CONF, SCYLLA_HOME
We are sourcing the sysconfig file on upstart, but forgot to export its
variables into the environment.
So export them.

Fixes #2236

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1491209505-32293-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit b087616a6c)
2017-04-04 11:00:33 +03:00
Pekka Enberg
4021e2befb Update seastar submodule
* seastar f391f9e...211ab4a (1):
  > http: catch and count errors in read and respond

Fixes #2242
2017-04-03 12:02:43 +03:00
Calle Wilund
9b26a57288 commitlog/replayer: Bugfix: minimum rp broken, and cl reader offset too
The previous fix removed the additional insertion of "min rp" per source
shard based on whether we had processed existing CF:s or not (i.e. if
a CF does not exist as an sstable at all, we must tag it as zero-rp, and
make the whole shard for it start at the same zero).

This is bad in itself, because it can cause data loss. It does not cause
crashing however. But it did uncover another, old old lingering bug,
namely the commitlog reader initiating its stream wrongly when reading
from an actual offset (i.e. not processing the whole file).
We opened the file stream from the file offset, then tried
to read the file header and magic number from there -> boom, error.

Also, rp-to-file mapping was potentially suboptimal due to using
bucket iterator instead of actual range.

I.e. three fixes:
* Reinstate min position guarding for unencountered CF:s
* Fix stream creation in the CL reader
* Fix segment map iterator use.

v2:
* Fix typo
Message-Id: <1490611637-12220-1-git-send-email-calle@scylladb.com>

(cherry picked from commit b12b65db92)
2017-03-28 10:35:04 +02:00
Pekka Enberg
31b5ef13c2 release: prepare for 1.7.rc2 2017-03-23 13:22:59 +02:00
Takuya ASADA
4bbee01288 dist/common/scripts/scylla_raid_setup: don't discard blocks at mkfs time
Discarding blocks on a large RAID volume takes too much time; the user may
suspect the script doesn't work correctly. It's better to skip the discard at
mkfs time and discard directly on each volume instead.

Fixes #1896

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1489533460-30127-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit b65d58e90e)
2017-03-23 09:42:51 +02:00
Calle Wilund
3cc03f88fd commitlog_replayer: Do proper const-lookup of min positions for shards
Fixes #2173

Per-shard min positions can be unset if we never collected any
sstable/truncation info for a shard, yet still replay segments of that id.

Wrap the lookups to handle "missing data -> default", which should have been
there in the first place.

Message-Id: <1490185101-12482-1-git-send-email-calle@scylladb.com>
(cherry picked from commit c3a510a08d)
2017-03-22 17:57:30 +02:00
Vlad Zolotarov
4179d8f7c4 Don't report a Tracing session ID unless the current query had a Tracing bit in its flags
Although the current master's behaviour is legal, it's suboptimal, and some
clients are sensitive to that. Let's fix that.

Fixes #2179

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1490115157-4657-1-git-send-email-vladz@scylladb.com>
2017-03-22 14:55:39 +02:00
Pekka Enberg
c20ddaf5af dist/docker: Use Scylla 1.7 RPM repository 2017-03-21 15:07:27 +02:00
Pekka Enberg
29dd48621b dist/docker: Expose Prometheus port by default
This patch exposes Scylla's Prometheus port by default. You can now use
the Scylla Monitoring project with the Docker image:

  https://github.com/scylladb/scylla-grafana-monitoring

To configure the IP addresses, use the 'docker inspect' command to
determine Scylla's IP address (assuming your running container is called
'some-scylla'):

  docker inspect --format='{{ .NetworkSettings.IPAddress }}' some-scylla

and then use that IP address in the prometheus/scylla_servers.yml
configuration file.

Fixes #1827

Message-Id: <1490008357-19627-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit 85a127bc78)
2017-03-20 15:30:15 +02:00
Amos Kong
87de77a5ea scylla_setup: match '-p' option of lsblk with strict pattern
On Ubuntu 14.04, lsblk doesn't have the '-p' option, but
`scylla_setup` tries to get the block list with `lsblk -pnr` and
triggers an error.

The current simple pattern matches anywhere in the help content, so it
might match the wrong options.
  scylla-test@amos-ubuntu-1404:~$ lsblk --help | grep -e -p
   -m, --perms          output info about permissions
   -P, --pairs          use key="value" output format

Let's use a strict pattern that only matches the option at the start of the line. Example:
  scylla-test@amos-ubuntu-1404:~$ lsblk --help | grep -e '^\s*-D'
   -D, --discard        print discard capabilities

Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <4f0f318353a43664e27da8a66855f5831457f061.1489712867.git.amos@scylladb.com>
(cherry picked from commit 468df7dd5f)
2017-03-20 08:11:57 +02:00
Raphael S. Carvalho
66c4dcba8e database: serialize sstable cleanup
We're cleaning up sstables in parallel. That means cleanup may need
almost twice the disk space used by all sstables being cleaned up,
if almost all sstables need cleanup and each one discards only an
insignificant portion of its data.
Given that cleanup is frequently issued when a node is running out of
disk space, we should serialize cleanups on every shard to decrease
the disk space requirement.

Fixes #192.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170317022911.10306-1-raphaelsc@scylladb.com>
(cherry picked from commit 7deeffc953)
2017-03-19 17:16:33 +02:00
Pekka Enberg
7cfdc08af9 cql3: Wire up functions for floating-point types
Fixes #2168
Message-Id: <1489661748-13924-1-git-send-email-penberg@scylladb.com>

(cherry picked from commit 3afd7f39b5)
2017-03-17 11:14:51 +02:00
Pekka Enberg
fdbe5caf41 Update scylla-ami submodule
* dist/ami/files/scylla-ami eedd12f...407e8f3 (1):
  > scylla_create_devices: check block device is exists

Fixes #2171
2017-03-17 11:14:17 +02:00
Tomasz Grabiec
522e62089b lsa: Fix debug-mode compilation error
By moving definitions of setters out of #ifdef

(cherry picked from commit 3609665b19)
2017-03-16 18:24:27 +01:00
Avi Kivity
699648d5a1 Merge "tests: Use allocating_section in lsa_async_eviction_test" from Tomasz
"The test allocates objects in batches (allocation is always under a reclaim
lock) of ~3MiB and assumes that it will always succeed because if we cross the
low water mark for free memory (20MiB) in seastar, reclamation will be
performed between the batches, asynchronously.

Unfortunately that's prevented by can_allocate_more_memory(), which fails
segment allocation when we're below the low water mark. LSA currently doesn't
allow allocating below the low water mark.

The solution which is employed across the code base is to use allocating_section,
so use it here as well.

Exposed by recent consistent failures on branch-1.7."

* 'tgrabiec/fix-lsa-async-eviction-test' of github.com:cloudius-systems/seastar-dev:
  tests: lsa_async_eviction_test: Allocate objects under allocating section
  lsa: Allow adjusting reserves in allocating_section

(cherry picked from commit 434a4fee28)
2017-03-16 12:44:54 +02:00
Calle Wilund
698a4e62d9 commitlog_replayer: Make replay parallel per shard
Fixes #2098

Replay previously did all segments in parallel on shard 0, which
caused heavy memory load. To reduce this and spread the footprint
across shards, instead replay X segments per shard, sequentially within
each shard.

v2:
* Fixed whitespace errors

Message-Id: <1489503382-830-1-git-send-email-calle@scylladb.com>
(cherry picked from commit 078589c508)
2017-03-15 13:07:45 +02:00
Amnon Heiman
63bec22d28 database: requests_blocked_memory metric should be unique
Metrics name should be unique per type.

requests_blocked_memory was registered twice, one as a gauge and one as
derived.

This is not allowed.

Fixes #2165

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <20170314162826.25521-1-amnon@scylladb.com>
(cherry picked from commit 0a2eba1b94)
2017-03-15 12:43:01 +02:00
Amnon Heiman
3d14e6e802 storage_proxy: metrics should have unique name
Metrics should have a unique name. This patch renames the queue-length
metric throttled_writes to current_throttled_writes.

Without it, metrics will be reported twice under the same name, which
may cause errors in the prometheus server.

This could be related to scylladb/seastar#250

Fixes #2163.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <20170314081456.6392-1-amnon@scylladb.com>
(cherry picked from commit 295a981c61)
2017-03-15 12:43:01 +02:00
Glauber Costa
ea4a2dad96 raid script: improve test for mounted filesystem
The current test for whether or not the filesystem is mounted is weak
and will fail if multiple pieces of the hierarchy are mounted.

util-linux ships with a mountpoint command that does exactly that,
so we'll use that instead.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <1488742801-4907-1-git-send-email-glauber@scylladb.com>
(cherry picked from commit 2d620a25fb)
2017-03-13 17:04:58 +02:00
Glauber Costa
655e6197cb setup: support mount points in raid script
By default, behavior is kept the same. There are deployments in which we
would like to mount data and commitlog in different places - much as
we have avoided this until now.

One example is EC2, where users may want to have the commitlog mounted
in the SSD drives for faster writes but keep the data in larger, less
expensive and durable EBS volumes.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <1488258215-2592-1-git-send-email-glauber@scylladb.com>
(cherry picked from commit 9e61a73654)
2017-03-13 16:51:15 +02:00
Asias He
1a1370d33e repair: Fix midpoint is not contained in the split range assertion in split_and_add
We have:

  auto halves = range.split(midpoint, dht::token_comparator());

We saw a case where midpoint == range.start, as a result, range.split
will assert becasue the range.start is marked non-inclusive, so the
midpoint doesn't appear to be contain()ed in the range - hence the
assertion failure.

Fixes #2148

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Signed-off-by: Asias He <asias@scylladb.com>
Message-Id: <93af2697637c28fbca261ddfb8375a790824df65.1489023933.git.asias@scylladb.com>
(cherry picked from commit 39d2e59e7e)
2017-03-09 09:16:57 +01:00
Paweł Dziepak
7f17424a4e Merge "Avoid losing changes to keyspace parameters of system_auth and tracing keyspaces" from Tomek
"If a node is bootstrapped with auto_bootstrap disabled, it will not
wait for schema sync before creating global keyspaces for auth and
tracing. When such schema changes are then reconciled with the schema on
other nodes, they may overwrite changes made by the user before the
node was started, because they will have a higher timestamp.

To prevent that, let's use the minimum timestamp so that the default schema
always loses to manual modifications. This is what Cassandra does.

Fixes #2129."

* tag 'tgrabiec/prevent-keyspace-metadata-loss-v1' of github.com:scylladb/seastar-dev:
  db: Create default auth and tracing keyspaces using lowest timestamp
  migration_manager: Append actual keyspace mutations with schema notifications

(cherry picked from commit 6db6d25f66)
2017-03-08 16:31:41 +02:00
Nadav Har'El
dd56f1bec7 sstable decompression: fix skip() to end of file
The skip() implementation for the compressed file input stream incorrectly
handled the case of skipping to the end of file: In that case we just need
to update the file pointer, but not skip anywhere in the compressed disk
file; In particular, we must NOT call locate() to find the relevant on-disk
compressed chunk, because there is none - locate() can only be called on
actual positions of bytes, not on the one-past-end-of-file position.

Fixes #2143

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20170308100057.23316-1-nyh@scylladb.com>
(cherry picked from commit 506e074ba4)
2017-03-08 12:35:39 +02:00
Pekka Enberg
5df61797d6 release: prepare for 1.7.rc1 2017-03-08 12:25:34 +02:00
Paweł Dziepak
b6db9e3d51 db: make do_apply_counter_update() propagate timeout to db_apply()
db_apply() expects to be given a time point at which the request will
time out. Originally, do_apply_counter_update() passed 0, which meant
that all requests were timed out if do_apply() needed to wait. The
caller of do_apply_counter_update() is already given a correct timeout
time point so the only thing needed to fix this problem it to propagate
it properly inside do_apply_counter_update() to the call to do_apply().

Fixes #2119.
Message-Id: <20170307104405.5843-1-pdziepak@scylladb.com>
2017-03-07 12:44:11 +01:00
Gleb Natapov
f2595bea85 memtable: do not open code logalloc::reclaim_lock use
logalloc::reclaim_lock prevents reclaim from running which may cause
regular allocation to fail although there is enough of free memory.
To solve that there is an allocation_section which acquire reclaim_lock
and if allocation fails it run reclaimer outside of a lock and retries
the allocation. The patch make use of allocation_section instead of
direct use of reclaim_lock in memtable code.

Fixes #2138.

Message-Id: <20170306160050.GC5902@scylladb.com>
(cherry picked from commit d7bdf16a16)
2017-03-07 11:16:15 +02:00
Gleb Natapov
e930ef0ee0 memtable: do not yield while holding reclaim_lock
Holding reclaim_lock while yielding may cause memory allocations to
fail.

Fixes #2139

Message-Id: <20170306153151.GA5902@scylladb.com>
(cherry picked from commit 5c4158daac)
2017-03-06 18:35:46 +02:00
Takuya ASADA
4cf0f88724 dist/redhat: enables discard on CentOS/RHEL RAID0
Since the CentOS/RHEL raid module disables discard by default, we need to
enable it again in order to use it.

Fixes #2033

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1488407037-4795-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 6602221442)
2017-03-06 12:22:17 +02:00
Avi Kivity
372f07b06e Update scylla-ami submodule
* dist/ami/files/scylla-ami d5a4397...eedd12f (3):
  > Rewrite disk discovery to handle EBS and NVMEs.
  > add --developer-mode option
  > trivial cleanup: replace tab in indent
2017-03-04 13:31:08 +02:00
Tomasz Grabiec
0ccc6630a8 db: Fix overflow of gc_clock time point
If query_time is time_point::min(), which is used by
to_data_query_result(), the result of subtraction of
gc_grace_seconds() from query_time will overflow.

I don't think this bug would currently have user-perceivable
effects. This affects which tombstones are dropped, but in case of
to_data_query_result() uses, tombstones are not present in the final
data query result, and mutation_partition::do_compact() takes
tombstones into consideration while compacting before expiring them.

Fixes the following UBSAN report:

  /usr/include/c++/5.3.1/chrono:399:55: runtime error: signed integer overflow: -2147483648 - 604800 cannot be represented in type 'int'

Message-Id: <1488385429-14276-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 4b6e77e97e)
2017-03-01 18:50:19 +02:00
Takuya ASADA
b95a2338be dist/debian/dep: fix broken link of gcc-5, update it to 5.4.1-5
Since gcc-5/stretch=5.4.1-2 was removed from the apt repository, we are no
longer able to build gcc-5.

To avoid the dead link, use the launchpad.net archives instead of apt-get source.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1488189378-5607-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit ba323e2074)
2017-03-01 17:13:42 +02:00
Tomasz Grabiec
f2d0ac9994 query: Fix invalid initialization of _memory_tracker by moving-from-self
Fixes the following UBSAN warning:

  core/semaphore.hh:293:74: runtime error: reference binding to misaligned address 0x0000006c55d7 for type 'struct basic_semaphore', which requires 8 byte alignment

Since the field was not initialized properly, this probably also fixes some
user-visible bug.
Message-Id: <1488368222-32009-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 0c84f00b16)
2017-03-01 11:56:49 +00:00
Gleb Natapov
56725de0db sstable: close sstable_writer's file if writing of sstable fails.
Failing to close a file properly before destroying the file object causes
crashes.

[tgrabiec: fixed typo]

Fixes #2122.

Message-Id: <20170221144858.GG11471@scylladb.com>
(cherry picked from commit 0977f4fdf8)
2017-02-28 11:04:26 +02:00
Avi Kivity
6f479c8999 Update seastar submodule
* seastar b14373b...f391f9e (1):
  > fix append_challenged_posix_file_impl::process_queue() to handle recursion

Fixes #2121.
2017-02-28 10:55:54 +02:00
Calle Wilund
8c0488bce9 messaging_service: Move log printout to actual listen start
Fixes #1845
The log printout happened before we had actually evaluated which endpoint
to create, and thus never included SSL info.
Message-Id: <1487766738-27797-1-git-send-email-calle@scylladb.com>

(cherry picked from commit d5f57bd047)
2017-02-23 13:18:33 +02:00
Avi Kivity
68dd11e275 config: enable new sharding algorithm for new deployments
Set murmur3_partitioner_ignore_msb_bits to 12 (enabling the new sharding
algorithm), but do this in scylla.yaml rather than the built-in defaults.
This avoids changing the configuration for existing clusters, as their
scylla.yaml file will not be updated during the upgrade.
Message-Id: <20170214123253.3933-1-avi@scylladb.com>

(cherry picked from commit 9b113ffd3e)
2017-02-22 11:23:46 +01:00
Tomasz Grabiec
a64c53d05f Update seastar submodule
* seastar fc27cec...b14373b (1):
  > reactor utilization should return the utilization in 0-1 range
2017-02-22 09:38:17 +01:00
Paweł Dziepak
42e7a59cca tests/cql_test_env: wait for storage service initialization
Message-Id: <20170221121130.14064-1-pdziepak@scylladb.com>
(cherry picked from commit 274bcd415a)
2017-02-21 17:06:10 +02:00
Avi Kivity
2cd019ee47 Merge "Fixes for counter cell locking" from Paweł
"This series contains some fixes and a unit test for the logic responsible
for locking counter cells."

* 'pdziepak/cell-locking-fixes/v1' of github.com:cloudius-systems/seastar-dev:
  tests: add test for counter cell locker
  cell_locking: fix schema upgrades
  cell_locker: make locker non-movable
  cell_locking: allow to be included by anyone

(cherry picked from commit b8c4b35b57)
2017-02-15 17:37:38 +02:00
Takuya ASADA
bc8b553bec dist/redhat: stop backporting ninja-build from Fedora, install it from EPEL instead
ninja-build-1.6.0-2.fc23.src.rpm was deleted from the Fedora web site for some
reason, but there is ninja-build-1.7.2-2 on EPEL, so we don't need to
backport from Fedora anymore.

Fixes #2087

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1487155729-13257-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 9c8515eeed)
2017-02-15 12:58:44 +02:00
Avi Kivity
0ba98be899 Update seastar submodule
* seastar bff963a...fc27cec (1):
  > collectd: send double correctly for gauge
2017-02-14 16:09:22 +02:00
Avi Kivity
d6899134a7 Update seastar submodule
* seastar f07f8ed...bff963a (1):
  > prometheus: send one MetricFamily per unique metric name
2017-02-13 11:50:43 +02:00
Avi Kivity
5253031110 seastar: point submodule at scylla-seastar.git
Allows backporting seastar patches independently of master.
2017-02-13 11:49:54 +02:00
Avi Kivity
a203c87f0d Merge "Disallow mixed schemas" from Paweł
"This series makes sure that schemas containing both counter and non-counter
regular or static columns are not allowed."

* 'pdziepak/disallow-mixed-schemas/v1' of github.com:cloudius-systems/seastar-dev:
  schema: verify that there are no both counter and non-counter columns
  test/mutation_source: specify whether to generate counter mutations
  tests/canonical_mutation: don't try to upgrade incompatible schemas

(cherry picked from commit 9e4ae0763d)
2017-02-07 18:04:24 +02:00
Gleb Natapov
37fc0e6840 storage_proxy: use storage_proxy clock instead of explicit lowres_clock
Merge commit 45b6070832 used a butchered version of the storage_proxy
patch to adjust to the rpc timer change instead of the one I sent. This
patch fixes the differences.

Message-Id: <20170206095237.GA7691@scylladb.com>
(cherry picked from commit 3c372525ed)
2017-02-06 12:51:52 +02:00
Avi Kivity
0429e5d8ea cell_locking: work around for missing boost::container::small_vector
small_vector doesn't exist on Ubuntu 14.04's boost, use std::vector
instead.

(cherry picked from commit 6e9e28d5a3)
2017-02-05 20:49:43 +02:00
Avi Kivity
3c147437ac dist: add build dependency on automake
Needed by seastar's c-ares.

(cherry picked from commit 2510b756fc)
2017-02-05 20:17:27 +02:00
Takuya ASADA
e4b3f02286 dist/common/systemd: introduce scylla-housekeeping restart mode
scylla-housekeeping needs to run in 'restart mode' to check the version during
scylla-server restart, which wasn't invoked by any systemd timer, so add it.

The existing scylla-housekeeping.timer is renamed to scylla-housekeeping-daily.timer,
since it runs in 'daily mode'.

Fixes #1953

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1486180031-18093-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit e82932b774)
2017-02-05 11:28:03 +02:00
Avi Kivity
5a8013e155 dist: add libtool build dependency for seastar/c-ares
(cherry picked from commit 4175f40da1)
2017-02-05 11:27:38 +02:00
Pekka Enberg
fdba5b8eac release: prepare for 1.7.rc0 2017-02-04 11:04:32 +02:00
Paweł Dziepak
558a52802a cell_locking: fix partition_entry::equal_compare
The comparator constructor took the schema by value instead of by const l-ref
and, consequently, later tried to access an object that had been destroyed
long ago.
Message-Id: <20170202135853.8190-1-pdziepak@scylladb.com>

(cherry picked from commit 37b0c71f1d)
2017-02-03 21:28:42 +02:00
Avi Kivity
4f416c7272 Merge "Avoid avalanche of tasks after memtable flush" from Tomasz
"Before, the logic for releasing writes blocked on dirty worked like this:

  1) When region group size changes and it is not under pressure and there
     are some requests blocked, then schedule request releasing task

  2) request releasing task, if no pressure, runs one request and if there are
     still blocked requests, schedules next request releasing task

If requests don't change the size of the region group, then either some request
executes or there is a request releasing task scheduled. The amount of scheduled
tasks is at most 1, there is a single releasing thread.

However, if requests themselves would change the size of the group, then each
such change would schedule yet another request releasing thread, growing the task
queue size by one.

The group size can also change when memory is reclaimed from the groups (e.g.
when they contain sparse segments). Compaction may start many request releasing
threads due to group size updates.

Such behavior is detrimental for performance and stability if there are a lot
of blocked requests. This can happen on 1.5 even with modest concurrency
because timed out requests stay in the queue. This is less likely on 1.6 where
they are dropped from the queue.

The releasing of tasks may start to dominate over other processes in the
system. When the amount of scheduled tasks reaches 1000, polling stops and
server becomes unresponsive until all of the released requests are done, which
is either when they start to block on dirty memory again or run out of blocked
requests. It may take a while to reach pressure condition after memtable flush
if it brings virtual dirty much below the threshold, which is currently the
case for workloads with overwrites producing sparse regions.

I saw this happening in a write workload from issue #2021 where the number of
request releasing threads grew into thousands.

Fix by ensuring there is at most one request releasing thread at a time. There
will be one releasing fiber per region group which is woken up when pressure is
lifted. It executes blocked requests until pressure occurs."

* tag 'tgrabiec/lsa-single-threaded-releasing-v2' of github.com:cloudius-systems/seastar-dev:
  tests: lsa: Add test for reclaimer starting and stopping
  tests: lsa: Add request releasing stress test
  lsa: Avoid avalanche releasing of requests
  lsa: Move definitions to .cc
  lsa: Simplify hard pressure notification management
  lsa: Do not start or stop reclaiming on hard pressure
  tests: lsa: Adjust to take into account that reclaimers are run synchronously
  lsa: Document and annotate reclaimer notification callbacks
  tests: lsa: Use with_timeout() in quiesce()

(cherry picked from commit 7a00dd6985)
2017-02-03 09:47:50 +01:00
115 changed files with 3031 additions and 1177 deletions

2
.gitmodules vendored
View File

@@ -1,6 +1,6 @@
[submodule "seastar"]
path = seastar
url = ../seastar
url = ../scylla-seastar
ignore = dirty
[submodule "swagger-ui"]
path = swagger-ui

View File

@@ -1,6 +1,6 @@
#!/bin/sh
VERSION=666.development
VERSION=1.7.2
if test -f version
then

View File

@@ -246,7 +246,8 @@ future<> auth::auth::setup() {
std::map<sstring, sstring> opts;
opts["replication_factor"] = "1";
auto ksm = keyspace_metadata::new_keyspace(AUTH_KS, "org.apache.cassandra.locator.SimpleStrategy", opts, true);
f = service::get_local_migration_manager().announce_new_keyspace(ksm, false);
// We use min_timestamp so that default keyspace metadata will lose to any manual adjustments. See issue #2129.
f = service::get_local_migration_manager().announce_new_keyspace(ksm, api::min_timestamp, false);
}
return f.then([] {

View File

@@ -22,13 +22,28 @@
#pragma once
#include <boost/intrusive/unordered_set.hpp>
#if __has_include(<boost/container/small_vector.hpp>)
#include <boost/container/small_vector.hpp>
template <typename T, size_t N>
using small_vector = boost::container::small_vector<T, N>;
#else
#include <vector>
template <typename T, size_t N>
using small_vector = std::vector<T>;
#endif
#include "fnv1a_hasher.hh"
#include "streamed_mutation.hh"
#include "mutation_partition.hh"
class cells_range {
using ids_vector_type = boost::container::small_vector<column_id, 5>;
using ids_vector_type = small_vector<column_id, 5>;
position_in_partition_view _position;
ids_vector_type _ids;
@@ -147,7 +162,7 @@ class cell_locker {
// temporarily removed from its parent partition_entry.
// Returns true if the cell_entry still exist in the new schema and
// should be reinserted.
bool upgrade(const schema& from, const schema& to, column_kind kind) {
bool upgrade(const schema& from, const schema& to, column_kind kind) noexcept {
auto& old_column_mapping = from.get_column_mapping();
auto& column = old_column_mapping.column_at(kind, _address.id);
auto cdef = to.get_column_definition(column.name());
@@ -170,7 +185,9 @@ class cell_locker {
}
~cell_entry() {
assert(is_linked());
if (!is_linked()) {
return;
}
unlink();
if (!--_parent._cell_count) {
delete &_parent;
@@ -286,10 +303,9 @@ class cell_locker {
};
class equal_compare {
schema_ptr _schema;
dht::decorated_key_equals_comparator _cmp;
public:
explicit equal_compare(const schema s) : _cmp(s) { }
explicit equal_compare(const schema& s) : _cmp(s) { }
bool operator()(const dht::decorated_key& dk, const partition_entry& pe) {
return _cmp(dk, pe._key);
}
@@ -386,22 +402,19 @@ struct cell_locker::locker {
partition_cells_range _range;
partition_cells_range::iterator _current_ck;
cells_range _cells_range;
cells_range::const_iterator _current_cell;
std::vector<locked_cell> _locks;
private:
void update_ck() {
if (!is_done()) {
_cells_range = *_current_ck;
_current_cell = _cells_range.begin();
_current_cell = _current_ck->begin();
}
}
future<> lock_next();
bool is_done() const { return _current_ck == _range.end(); }
std::vector<locked_cell> get() && { return std::move(_locks); }
public:
explicit locker(const ::schema& s, partition_entry& pe, partition_cells_range&& range)
: _hasher(s)
@@ -413,18 +426,22 @@ public:
update_ck();
}
future<std::vector<locked_cell>> lock_all() && {
locker(const locker&) = delete;
locker(locker&&) = delete;
future<> lock_all() {
// Cannot defer before first call to lock_next().
return lock_next().then([this] {
return do_until([this] { return is_done(); }, [this] {
return lock_next();
}).then([&] {
return std::move(*this).get();
});
});
}
std::vector<locked_cell> get() && { return std::move(_locks); }
};
inline
future<std::vector<locked_cell>> cell_locker::lock_cells(const dht::decorated_key& dk, partition_cells_range&& range) {
partition_entry::hasher pe_hash;
partition_entry::equal_compare pe_eq(*_schema);
@@ -460,14 +477,17 @@ future<std::vector<locked_cell>> cell_locker::lock_cells(const dht::decorated_ke
return make_ready_future<std::vector<locked_cell>>(std::move(locks));
}
return do_with(locker(*_schema, *it, std::move(range)), [] (auto& locker) mutable {
return std::move(locker).lock_all();
auto l = std::make_unique<locker>(*_schema, *it, std::move(range));
auto f = l->lock_all();
return f.then([l = std::move(l)] {
return std::move(*l).get();
});
}
inline
future<> cell_locker::locker::lock_next() {
while (!is_done()) {
if (_current_cell == _cells_range.end() || _cells_range.empty()) {
if (_current_cell == _current_ck->end()) {
++_current_ck;
update_ck();
continue;
@@ -475,7 +495,7 @@ future<> cell_locker::locker::lock_next() {
auto cid = *_current_cell++;
cell_address ca { position_in_partition(_cells_range.position()), cid };
cell_address ca { position_in_partition(_current_ck->position()), cid };
auto it = _partition_entry.cells().find(ca, _hasher, _eq_cmp);
if (it != _partition_entry.cells().end()) {
return it->lock().then([this, ce = it->shared_from_this()] () mutable {
@@ -483,27 +503,25 @@ future<> cell_locker::locker::lock_next() {
});
}
auto cell = make_lw_shared<cell_entry>(_partition_entry, position_in_partition(_cells_range.position()), cid);
auto cell = make_lw_shared<cell_entry>(_partition_entry, position_in_partition(_current_ck->position()), cid);
_partition_entry.insert(cell);
_locks.emplace_back(std::move(cell));
}
return make_ready_future<>();
}
inline
bool cell_locker::partition_entry::upgrade(schema_ptr new_schema) {
if (_schema == new_schema) {
return true;
}
auto buckets = std::make_unique<cells_type::bucket_type[]>(initial_bucket_count);
auto buckets = std::make_unique<cells_type::bucket_type[]>(_cells.bucket_count());
auto cells = cells_type(cells_type::bucket_traits(buckets.get(), _cells.bucket_count()),
cell_entry::hasher(*new_schema), cell_entry::equal_compare(*new_schema));
while (!_cells.empty()) {
auto it = _cells.begin();
auto& cell = *it;
_cells.erase(it);
_cells.clear_and_dispose([&] (cell_entry* cell_ptr) noexcept {
auto& cell = *cell_ptr;
auto kind = cell.position().is_static_row() ? column_kind::static_column
: column_kind::regular_column;
auto reinsert = cell.upgrade(*_schema, *new_schema, kind);
@@ -512,9 +530,16 @@ bool cell_locker::partition_entry::upgrade(schema_ptr new_schema) {
} else {
_cell_count--;
}
}
});
// bi::unordered_set move assignment is actually a swap.
// Original _buckets cannot be destroyed before the container using them is
// so we need to explicitly make sure that the original _cells is no more.
_cells = std::move(cells);
auto destroy = [] (auto) { };
destroy(std::move(cells));
_buckets = std::move(buckets);
_schema = new_schema;
return _cell_count;
}
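The `destroy(std::move(cells))` idiom in the hunk above relies on a small C++ fact: moving an object into a by-value parameter of a throwaway callable forces its destructor to run at that call, not at end of scope. A self-contained sketch (the `tracked` type and `destroy_now` name are illustrative, not from the codebase):

```cpp
#include <cassert>
#include <utility>

// Illustration of the `destroy(std::move(cells))` idiom: passing an object by
// value to a throwaway function destroys it on return, so storage it points
// into (like the old bucket array) can then be freed safely afterwards.
struct tracked {
    int* destroyed;                        // counts destructor runs
    explicit tracked(int* d) : destroyed(d) {}
    tracked(tracked&& o) : destroyed(o.destroyed) { o.destroyed = nullptr; }
    ~tracked() { if (destroyed) { ++*destroyed; } }
};

inline void destroy_now(tracked) {}        // parameter's destructor runs here
```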


@@ -788,3 +788,23 @@ commitlog_total_space_in_mb: -1
# By default, Scylla binds all interfaces to the prometheus API
# It is possible to restrict the listening address to a specific one
# prometheus_address: 0.0.0.0
# Distribution of data among cores (shards) within a node
#
# Scylla distributes data within a node among shards, using a round-robin
# strategy:
# [shard0] [shard1] ... [shardN-1] [shard0] [shard1] ... [shardN-1] ...
#
# Scylla versions 1.6 and below used just one repetition of the pattern;
# this interfered with data placement among nodes (vnodes).
#
# Scylla versions 1.7 and above use 4096 repetitions of the pattern; this
# provides for better data distribution.
#
# The value below is log (base 2) of the number of repetitions.
#
# Set to 0 to avoid rewriting all data when upgrading from Scylla 1.6 and
# below.
#
# Keep at 12 for new clusters.
murmur3_partitioner_ignore_msb_bits: 12
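The round-robin scheme the comment describes can be sketched numerically. This is a simplified illustration, not Scylla's exact shard-selection arithmetic; the function name and the exact bit manipulation are assumptions:

```cpp
#include <cassert>
#include <cstdint>

// Simplified sketch (NOT Scylla's exact code): cut the token ring into
// n_shards * 2^msb_bits equal segments and deal them out round-robin.
// With msb_bits = 12 there are 4096 repetitions of the
// [shard0 .. shardN-1] pattern; with msb_bits = 0 (the 1.6 behaviour)
// each shard owns one contiguous slice of the ring.
inline unsigned shard_for_token(uint64_t token, unsigned n_shards, unsigned msb_bits) {
    uint64_t n_segments = uint64_t(n_shards) << msb_bits;        // total segments
    // scale the token down to a segment index in [0, n_segments)
    unsigned seg = unsigned((unsigned __int128)token * n_segments >> 64);
    return seg % n_shards;                                       // round-robin
}
```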


@@ -230,6 +230,7 @@ scylla_tests = [
'tests/virtual_reader_test',
'tests/view_schema_test',
'tests/counter_test',
'tests/cell_locker_test',
]
apps = [
@@ -408,6 +409,7 @@ scylla_core = (['database.cc',
'cql3/selection/selector.cc',
'cql3/restrictions/statement_restrictions.cc',
'cql3/result_set.cc',
'cql3/variable_specifications.cc',
'db/consistency_level.cc',
'db/system_keyspace.cc',
'db/schema_tables.cc',


@@ -139,8 +139,8 @@ stdx::optional<atomic_cell> counter_cell_view::difference(atomic_cell_view a, at
void transform_counter_updates_to_shards(mutation& m, const mutation* current_state, uint64_t clock_offset) {
// FIXME: allow current_state to be frozen_mutation
auto transform_new_row_to_shards = [clock_offset] (auto& cr) {
cr.row().cells().for_each_cell([clock_offset] (auto, atomic_cell_or_collection& ac_o_c) {
auto transform_new_row_to_shards = [clock_offset] (auto& cells) {
cells.for_each_cell([clock_offset] (auto, atomic_cell_or_collection& ac_o_c) {
auto acv = ac_o_c.as_atomic_cell();
if (!acv.is_live()) {
return; // continue -- we are in lambda
@@ -153,32 +153,22 @@ void transform_counter_updates_to_shards(mutation& m, const mutation* current_st
};
if (!current_state) {
transform_new_row_to_shards(m.partition().static_row());
for (auto& cr : m.partition().clustered_rows()) {
transform_new_row_to_shards(cr);
transform_new_row_to_shards(cr.row().cells());
}
return;
}
clustering_key::less_compare cmp(*m.schema());
auto& cstate = current_state->partition();
auto it = cstate.clustered_rows().begin();
auto end = cstate.clustered_rows().end();
for (auto& cr : m.partition().clustered_rows()) {
while (it != end && cmp(it->key(), cr.key())) {
++it;
}
if (it == end || cmp(cr.key(), it->key())) {
transform_new_row_to_shards(cr);
continue;
}
auto transform_row_to_shards = [clock_offset] (auto& transformee, auto& state) {
struct counter_shard_or_tombstone {
stdx::optional<counter_shard> shard;
tombstone tomb;
};
std::deque<std::pair<column_id, counter_shard_or_tombstone>> shards;
it->row().cells().for_each_cell([&] (column_id id, const atomic_cell_or_collection& ac_o_c) {
state.for_each_cell([&] (column_id id, const atomic_cell_or_collection& ac_o_c) {
auto acv = ac_o_c.as_atomic_cell();
if (!acv.is_live()) {
counter_shard_or_tombstone cs_o_t { { },
@@ -194,7 +184,7 @@ void transform_counter_updates_to_shards(mutation& m, const mutation* current_st
shards.emplace_back(std::make_pair(id, counter_shard_or_tombstone { counter_shard(*cs), tombstone() }));
});
cr.row().cells().for_each_cell([&] (column_id id, atomic_cell_or_collection& ac_o_c) {
transformee.for_each_cell([&] (column_id id, atomic_cell_or_collection& ac_o_c) {
auto acv = ac_o_c.as_atomic_cell();
if (!acv.is_live()) {
return; // continue -- we are in lambda
@@ -224,5 +214,22 @@ void transform_counter_updates_to_shards(mutation& m, const mutation* current_st
}
ac_o_c = ccb.build(acv.timestamp());
});
};
transform_row_to_shards(m.partition().static_row(), current_state->partition().static_row());
auto& cstate = current_state->partition();
auto it = cstate.clustered_rows().begin();
auto end = cstate.clustered_rows().end();
for (auto& cr : m.partition().clustered_rows()) {
while (it != end && cmp(it->key(), cr.key())) {
++it;
}
if (it == end || cmp(cr.key(), it->key())) {
transform_new_row_to_shards(cr.row().cells());
continue;
}
transform_row_to_shards(cr.row().cells(), it->row().cells());
}
}
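The rewritten loop above is a classic sorted-merge walk: advance an iterator over current state while its key sorts before the update's key, then branch on whether a matching row exists ("new row" path) or not ("merge with state" path). A generic sketch of that classification, with illustrative names and `int` keys standing in for clustering keys:

```cpp
#include <cassert>
#include <string>
#include <vector>

// Sketch of the merge loop in transform_counter_updates_to_shards: walk two
// key-sorted ranges in lock-step. Keys present only in `update` take the
// "new" path; keys with matching entries in `state` take the "merge" path.
std::vector<std::string> classify(const std::vector<int>& update,
                                  const std::vector<int>& state) {
    std::vector<std::string> out;
    auto it = state.begin();
    for (int key : update) {
        while (it != state.end() && *it < key) {
            ++it;                         // skip state rows not in the update
        }
        if (it == state.end() || key < *it) {
            out.push_back("new");         // no existing state for this key
        } else {
            out.push_back("merge");       // existing state found for this key
        }
    }
    return out;
}
```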


@@ -67,6 +67,14 @@ functions::init() {
declare(aggregate_fcts::make_max_function<int64_t>());
declare(aggregate_fcts::make_min_function<int64_t>());
declare(aggregate_fcts::make_count_function<float>());
declare(aggregate_fcts::make_max_function<float>());
declare(aggregate_fcts::make_min_function<float>());
declare(aggregate_fcts::make_count_function<double>());
declare(aggregate_fcts::make_max_function<double>());
declare(aggregate_fcts::make_min_function<double>());
//FIXME:
//declare(aggregate_fcts::make_count_function<bytes>());
//declare(aggregate_fcts::make_max_function<bytes>());
@@ -78,15 +86,17 @@ functions::init() {
declare(make_blob_as_varchar_fct());
declare(aggregate_fcts::make_sum_function<int32_t>());
declare(aggregate_fcts::make_sum_function<int64_t>());
declare(aggregate_fcts::make_avg_function<int32_t>());
declare(aggregate_fcts::make_avg_function<int64_t>());
declare(aggregate_fcts::make_sum_function<float>());
declare(aggregate_fcts::make_sum_function<double>());
#if 0
declare(AggregateFcts.sumFunctionForFloat);
declare(AggregateFcts.sumFunctionForDouble);
declare(AggregateFcts.sumFunctionForDecimal);
declare(AggregateFcts.sumFunctionForVarint);
declare(AggregateFcts.avgFunctionForFloat);
declare(AggregateFcts.avgFunctionForDouble);
#endif
declare(aggregate_fcts::make_avg_function<int32_t>());
declare(aggregate_fcts::make_avg_function<int64_t>());
declare(aggregate_fcts::make_avg_function<float>());
declare(aggregate_fcts::make_avg_function<double>());
#if 0
declare(AggregateFcts.avgFunctionForVarint);
declare(AggregateFcts.avgFunctionForDecimal);
#endif


@@ -43,6 +43,7 @@
#include "schema_builder.hh"
#include "service/migration_manager.hh"
#include "boost/range/adaptor/map.hpp"
#include "stdx.hh"
namespace cql3 {
@@ -86,14 +87,14 @@ const sstring& alter_type_statement::keyspace() const
return _name.get_keyspace();
}
static int32_t get_idx_of_field(user_type type, shared_ptr<column_identifier> field)
static stdx::optional<uint32_t> get_idx_of_field(user_type type, shared_ptr<column_identifier> field)
{
for (uint32_t i = 0; i < type->field_names().size(); ++i) {
if (field->name() == type->field_names()[i]) {
return i;
return {i};
}
}
return -1;
return {};
}
void alter_type_statement::do_announce_migration(database& db, ::keyspace& ks, bool is_local_only)
@@ -168,7 +169,7 @@ alter_type_statement::add_or_alter::add_or_alter(const ut_name& name, bool is_ad
user_type alter_type_statement::add_or_alter::do_add(database& db, user_type to_update) const
{
if (get_idx_of_field(to_update, _field_name) >= 0) {
if (get_idx_of_field(to_update, _field_name)) {
throw exceptions::invalid_request_exception(sprint("Cannot add new field %s to type %s: a field of the same name already exists", _field_name->name(), _name.to_string()));
}
@@ -185,19 +186,19 @@ user_type alter_type_statement::add_or_alter::do_add(database& db, user_type to_
user_type alter_type_statement::add_or_alter::do_alter(database& db, user_type to_update) const
{
uint32_t idx = get_idx_of_field(to_update, _field_name);
if (idx < 0) {
stdx::optional<uint32_t> idx = get_idx_of_field(to_update, _field_name);
if (!idx) {
throw exceptions::invalid_request_exception(sprint("Unknown field %s in type %s", _field_name->name(), _name.to_string()));
}
auto previous = to_update->field_types()[idx];
auto previous = to_update->field_types()[*idx];
auto new_type = _field_type->prepare(db, keyspace())->get_type();
if (!new_type->is_compatible_with(*previous)) {
throw exceptions::invalid_request_exception(sprint("Type %s is incompatible with previous type %s of field %s in user type %s", _field_type->to_string(), previous->as_cql3_type()->to_string(), _field_name->name(), _name.to_string()));
}
std::vector<data_type> new_types(to_update->field_types());
new_types[idx] = new_type;
new_types[*idx] = new_type;
return user_type_impl::get_instance(to_update->_keyspace, to_update->_name, to_update->field_names(), std::move(new_types));
}
@@ -221,11 +222,11 @@ user_type alter_type_statement::renames::make_updated_type(database& db, user_ty
std::vector<bytes> new_names(to_update->field_names());
for (auto&& rename : _renames) {
auto&& from = rename.first;
int32_t idx = get_idx_of_field(to_update, from);
if (idx < 0) {
stdx::optional<uint32_t> idx = get_idx_of_field(to_update, from);
if (!idx) {
throw exceptions::invalid_request_exception(sprint("Unknown field %s in type %s", from->to_string(), _name.to_string()));
}
new_names[idx] = rename.second->name();
new_names[*idx] = rename.second->name();
}
auto&& updated = user_type_impl::get_instance(to_update->_keyspace, to_update->_name, std::move(new_names), to_update->field_types());
create_type_statement::check_for_duplicate_names(updated);
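The change from `int32_t` with a `-1` sentinel to `stdx::optional<uint32_t>` in `get_idx_of_field` is the general sentinel-to-optional pattern: "not found" can no longer be accidentally used as an index. A minimal sketch with `std::optional` (illustrative names):

```cpp
#include <cassert>
#include <cstdint>
#include <optional>
#include <string>
#include <vector>

// Sentinel-free index lookup: callers must check the optional before
// dereferencing, instead of remembering that -1 means "not found".
std::optional<uint32_t> index_of(const std::vector<std::string>& names,
                                 const std::string& wanted) {
    for (uint32_t i = 0; i < names.size(); ++i) {
        if (names[i] == wanted) {
            return i;                      // found: return the position
        }
    }
    return std::nullopt;                   // not found: no usable index
}
```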


@@ -381,8 +381,18 @@ shared_ptr<prepared_statement>
batch_statement::prepare(database& db, cql_stats& stats) {
auto&& bound_names = get_bound_variables();
stdx::optional<sstring> first_ks;
stdx::optional<sstring> first_cf;
bool have_multiple_cfs = false;
std::vector<shared_ptr<cql3::statements::modification_statement>> statements;
for (auto&& parsed : _parsed_statements) {
if (!first_ks) {
first_ks = parsed->keyspace();
first_cf = parsed->column_family();
} else {
have_multiple_cfs = first_ks.value() != parsed->keyspace() || first_cf.value() != parsed->column_family();
}
statements.push_back(parsed->prepare(db, bound_names, stats));
}
@@ -392,8 +402,13 @@ batch_statement::prepare(database& db, cql_stats& stats) {
cql3::statements::batch_statement batch_statement_(bound_names->size(), _type, std::move(statements), std::move(prep_attrs), stats);
batch_statement_.validate();
std::vector<uint16_t> partition_key_bind_indices;
if (!have_multiple_cfs && batch_statement_.get_statements().size() > 0) {
partition_key_bind_indices = bound_names->get_partition_key_bind_indexes(batch_statement_.get_statements()[0]->s);
}
return ::make_shared<prepared>(make_shared(std::move(batch_statement_)),
bound_names->get_specifications());
bound_names->get_specifications(),
std::move(partition_key_bind_indices));
}
}


@@ -79,6 +79,57 @@ void drop_type_statement::validate(distributed<service::storage_proxy>& proxy, c
throw exceptions::invalid_request_exception(sprint("No user type named %s exists.", _name.to_string()));
}
}
// We don't want to drop a type unless it's not used anymore (mainly because
// if someone drops a type and recreates one with the same name but a different
// definition while the previous name is still in use, things can get messy).
// We have two places to check: 1) other user type that can nest the one
// we drop and 2) existing tables referencing the type (maybe in a nested
// way).
// This code is moved from schema_keyspace (akin to origin) because we cannot
// delay this check until after we've applied the mutations. If a type or
// table references the type we're dropping, we will a) get exceptions when parsing
// (can be translated to invalid_request, but...) and, more importantly, b)
// we will leave those types/tables in a broken state.
// We managed to get through this before because we neither enforced hard
// cross-references between types when loading them, nor did we, in fact,
// probably ever run the scenario of dropping a referenced type and then
// actually using the referent.
//
// Now, this has a giant flaw. We are susceptible to race conditions here,
// since we could have a drop at the same time as a create type that references
// the dropped one, but we complete the check before the create is done,
// yet apply the drop mutations after -> inconsistent data!
// This problem is the same in origin, and I see no good way around it
// as long as the atomicity of schema modifications is based on
// the actual apply of mutations, because unlike other drops, this one isn't
// benevolent.
// I guess this is one case where users need to beware, and not mess with types
// concurrently!
auto&& type = old->second;
auto&& keyspace = type->_keyspace;
auto&& name = type->_name;
for (auto&& ut : all_types | boost::adaptors::map_values) {
if (ut->_keyspace == keyspace && ut->_name == name) {
continue;
}
if (ut->references_user_type(keyspace, name)) {
throw exceptions::invalid_request_exception(sprint("Cannot drop user type %s.%s as it is still used by user type %s", keyspace, type->get_name_as_string(), ut->get_name_as_string()));
}
}
for (auto&& cfm : ks.metadata()->cf_meta_data() | boost::adaptors::map_values) {
for (auto&& col : cfm->all_columns()) {
if (col.second->type->references_user_type(keyspace, name)) {
throw exceptions::invalid_request_exception(sprint("Cannot drop user type %s.%s as it is still used by table %s.%s", keyspace, type->get_name_as_string(), cfm->ks_name(), cfm->cf_name()));
}
}
}
} catch (no_such_keyspace& e) {
throw exceptions::invalid_request_exception(sprint("Cannot drop type in unknown keyspace %s", keyspace()));
}


@@ -597,9 +597,11 @@ namespace raw {
::shared_ptr<prepared_statement>
modification_statement::modification_statement::prepare(database& db, cql_stats& stats) {
schema_ptr schema = validation::validate_column_family(db, keyspace(), column_family());
auto bound_names = get_bound_variables();
auto statement = prepare(db, bound_names, stats);
return ::make_shared<prepared>(std::move(statement), *bound_names);
auto partition_key_bind_indices = bound_names->get_partition_key_bind_indexes(schema);
return ::make_shared<prepared>(std::move(statement), *bound_names, std::move(partition_key_bind_indices));
}
::shared_ptr<cql3::statements::modification_statement>


@@ -67,21 +67,22 @@ bool parsed_statement::uses_function(const sstring& ks_name, const sstring& func
}
prepared_statement::prepared_statement(::shared_ptr<cql_statement> statement_, std::vector<::shared_ptr<column_specification>> bound_names_)
prepared_statement::prepared_statement(::shared_ptr<cql_statement> statement_, std::vector<::shared_ptr<column_specification>> bound_names_, std::vector<uint16_t> partition_key_bind_indices)
: statement(std::move(statement_))
, bound_names(std::move(bound_names_))
, partition_key_bind_indices(std::move(partition_key_bind_indices))
{ }
prepared_statement::prepared_statement(::shared_ptr<cql_statement> statement_, const variable_specifications& names)
: prepared_statement(statement_, names.get_specifications())
prepared_statement::prepared_statement(::shared_ptr<cql_statement> statement_, const variable_specifications& names, const std::vector<uint16_t>& partition_key_bind_indices)
: prepared_statement(statement_, names.get_specifications(), partition_key_bind_indices)
{ }
prepared_statement::prepared_statement(::shared_ptr<cql_statement> statement_, variable_specifications&& names)
: prepared_statement(statement_, std::move(names).get_specifications())
prepared_statement::prepared_statement(::shared_ptr<cql_statement> statement_, variable_specifications&& names, std::vector<uint16_t>&& partition_key_bind_indices)
: prepared_statement(statement_, std::move(names).get_specifications(), std::move(partition_key_bind_indices))
{ }
prepared_statement::prepared_statement(::shared_ptr<cql_statement>&& statement_)
: prepared_statement(statement_, std::vector<::shared_ptr<column_specification>>())
: prepared_statement(statement_, std::vector<::shared_ptr<column_specification>>(), std::vector<uint16_t>())
{ }
}


@@ -60,12 +60,13 @@ public:
sstring raw_cql_statement;
const ::shared_ptr<cql_statement> statement;
const std::vector<::shared_ptr<column_specification>> bound_names;
std::vector<uint16_t> partition_key_bind_indices;
prepared_statement(::shared_ptr<cql_statement> statement_, std::vector<::shared_ptr<column_specification>> bound_names_);
prepared_statement(::shared_ptr<cql_statement> statement_, std::vector<::shared_ptr<column_specification>> bound_names_, std::vector<uint16_t> partition_key_bind_indices);
prepared_statement(::shared_ptr<cql_statement> statement_, const variable_specifications& names);
prepared_statement(::shared_ptr<cql_statement> statement_, const variable_specifications& names, const std::vector<uint16_t>& partition_key_bind_indices);
prepared_statement(::shared_ptr<cql_statement> statement_, variable_specifications&& names);
prepared_statement(::shared_ptr<cql_statement> statement_, variable_specifications&& names, std::vector<uint16_t>&& partition_key_bind_indices);
prepared_statement(::shared_ptr<cql_statement>&& statement_);
};


@@ -445,7 +445,9 @@ select_statement::select_statement(::shared_ptr<cf_name> cf_name,
prepare_limit(db, bound_names),
stats);
return ::make_shared<prepared>(std::move(stmt), std::move(*bound_names));
auto partition_key_bind_indices = bound_names->get_partition_key_bind_indexes(schema);
return ::make_shared<prepared>(std::move(stmt), std::move(*bound_names), std::move(partition_key_bind_indices));
}
::shared_ptr<restrictions::statement_restrictions>


@@ -0,0 +1,98 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* Copyright (C) 2015 ScyllaDB
*
* Modified by ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include "cql3/variable_specifications.hh"
namespace cql3 {
variable_specifications::variable_specifications(const std::vector<::shared_ptr<column_identifier>>& variable_names)
: _variable_names{variable_names}
, _specs{variable_names.size()}
, _target_columns{variable_names.size()}
{ }
::shared_ptr<variable_specifications> variable_specifications::empty() {
return ::make_shared<variable_specifications>(std::vector<::shared_ptr<column_identifier>>{});
}
size_t variable_specifications::size() const {
return _variable_names.size();
}
std::vector<::shared_ptr<column_specification>> variable_specifications::get_specifications() const & {
return std::vector<::shared_ptr<column_specification>>(_specs.begin(), _specs.end());
}
std::vector<::shared_ptr<column_specification>> variable_specifications::get_specifications() && {
return std::move(_specs);
}
std::vector<uint16_t> variable_specifications::get_partition_key_bind_indexes(schema_ptr schema) const {
auto count = schema->partition_key_columns().size();
std::vector<uint16_t> partition_key_positions(count, uint16_t(0));
std::vector<bool> set(count, false);
for (size_t i = 0; i < _target_columns.size(); i++) {
auto& target_column = _target_columns[i];
const auto* cdef = schema->get_column_definition(target_column->name->name());
if (cdef && cdef->is_partition_key()) {
partition_key_positions[cdef->position()] = i;
set[cdef->position()] = true;
}
}
for (bool b : set) {
if (!b) {
return {};
}
}
return partition_key_positions;
}
void variable_specifications::add(int32_t bind_index, ::shared_ptr<column_specification> spec) {
_target_columns[bind_index] = spec;
auto name = _variable_names[bind_index];
// Use the user name, if there is one
if (name) {
spec = ::make_shared<column_specification>(spec->ks_name, spec->cf_name, name, spec->type);
}
_specs[bind_index] = spec;
}
}
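The index-mapping logic in `get_partition_key_bind_indexes` can be sketched with simplified inputs: here each bind marker is described only by the partition-key position it targets (or -1 for a non-key column), so the schema lookup is elided. These types and names are illustrative, not the codebase's:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch of the mapping: produce a vector from partition-key position to the
// bind-marker index that supplies it, or an empty vector if any key component
// has no marker (in which case the fast path cannot be used).
std::vector<uint16_t> partition_key_bind_indexes(
        const std::vector<int>& marker_targets, size_t pk_count) {
    std::vector<uint16_t> positions(pk_count, 0);
    std::vector<bool> set(pk_count, false);
    for (size_t i = 0; i < marker_targets.size(); ++i) {
        int pos = marker_targets[i];
        if (pos >= 0 && size_t(pos) < pk_count) {
            positions[pos] = uint16_t(i);    // key position -> marker index
            set[pos] = true;
        }
    }
    for (bool b : set) {
        if (!b) {
            return {};                       // some key component unbound
        }
    }
    return positions;
}
```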


@@ -53,41 +53,26 @@ class variable_specifications final {
private:
std::vector<shared_ptr<column_identifier>> _variable_names;
std::vector<::shared_ptr<column_specification>> _specs;
std::vector<::shared_ptr<column_specification>> _target_columns;
public:
variable_specifications(const std::vector<::shared_ptr<column_identifier>>& variable_names)
: _variable_names{variable_names}
, _specs{variable_names.size()}
{ }
variable_specifications(const std::vector<::shared_ptr<column_identifier>>& variable_names);
/**
* Returns an empty instance of <code>VariableSpecifications</code>.
* @return an empty instance of <code>VariableSpecifications</code>
*/
static ::shared_ptr<variable_specifications> empty() {
return ::make_shared<variable_specifications>(std::vector<::shared_ptr<column_identifier>>{});
}
static ::shared_ptr<variable_specifications> empty();
size_t size() const {
return _variable_names.size();
}
size_t size() const;
std::vector<::shared_ptr<column_specification>> get_specifications() const & {
return std::vector<::shared_ptr<column_specification>>(_specs.begin(), _specs.end());
}
std::vector<::shared_ptr<column_specification>> get_specifications() const &;
std::vector<::shared_ptr<column_specification>> get_specifications() && {
return std::move(_specs);
}
std::vector<::shared_ptr<column_specification>> get_specifications() &&;
void add(int32_t bind_index, ::shared_ptr<column_specification> spec) {
auto name = _variable_names[bind_index];
// Use the user name, if there is one
if (name) {
spec = ::make_shared<column_specification>(spec->ks_name, spec->cf_name, name, spec->type);
}
_specs[bind_index] = spec;
}
std::vector<uint16_t> get_partition_key_bind_indexes(schema_ptr schema) const;
void add(int32_t bind_index, ::shared_ptr<column_specification> spec);
};
}


@@ -1082,6 +1082,7 @@ column_family::try_flush_memtable_to_sstable(lw_shared_ptr<memtable> old) {
return make_ready_future<stop_iteration>(stop_iteration::yes);
});
} catch (...) {
newtab->mark_for_deletion();
dblog.error("failed to write sstable {}: {}", newtab->get_filename(), std::current_exception());
// If we failed this write we will try the write again and that will create a new flush reader
// that will decrease dirty memory again. So we need to reset the accounting.
@@ -1250,7 +1251,7 @@ void column_family::rebuild_statistics() {
// making the two ranges compatible when compiling with boost 1.55.
// No one is actually moving anything...
std::move(*_sstables->all()))) {
update_stats_for_new_sstable(tab->data_size(), tab->get_shards_for_this_sstable());
update_stats_for_new_sstable(tab->bytes_on_disk(), tab->get_shards_for_this_sstable());
}
}
@@ -1357,7 +1358,7 @@ column_family::compact_sstables(sstables::compaction_descriptor descriptor, bool
}
static bool needs_cleanup(const lw_shared_ptr<sstables::sstable>& sst,
const lw_shared_ptr<dht::token_range_vector>& owned_ranges,
const dht::token_range_vector& owned_ranges,
schema_ptr s) {
auto first = sst->get_first_partition_key();
auto last = sst->get_last_partition_key();
@@ -1366,7 +1367,7 @@ static bool needs_cleanup(const lw_shared_ptr<sstables::sstable>& sst,
dht::token_range sst_token_range = dht::token_range::make(first_token, last_token);
// return true iff sst partition range isn't fully contained in any of the owned ranges.
for (auto& r : *owned_ranges) {
for (auto& r : owned_ranges) {
if (r.contains(sst_token_range, dht::token_comparator())) {
return false;
}
@@ -1376,17 +1377,24 @@ static bool needs_cleanup(const lw_shared_ptr<sstables::sstable>& sst,
future<> column_family::cleanup_sstables(sstables::compaction_descriptor descriptor) {
dht::token_range_vector r = service::get_local_storage_service().get_local_ranges(_schema->ks_name());
auto owned_ranges = make_lw_shared<dht::token_range_vector>(std::move(r));
auto sstables_to_cleanup = make_lw_shared<std::vector<sstables::shared_sstable>>(std::move(descriptor.sstables));
return parallel_for_each(*sstables_to_cleanup, [this, owned_ranges = std::move(owned_ranges), sstables_to_cleanup] (auto& sst) {
if (!owned_ranges->empty() && !needs_cleanup(sst, owned_ranges, _schema)) {
return do_with(std::move(descriptor.sstables), std::move(r), [this] (auto& sstables, auto& owned_ranges) {
return do_for_each(sstables, [this, &owned_ranges] (auto& sst) {
if (!owned_ranges.empty() && !needs_cleanup(sst, owned_ranges, _schema)) {
return make_ready_future<>();
}
std::vector<sstables::shared_sstable> sstable_to_compact({ sst });
return this->compact_sstables(sstables::compaction_descriptor(std::move(sstable_to_compact), sst->get_sstable_level()), true);
// this semaphore ensures that only one cleanup will run per shard.
// That's to prevent the node from running out of space when almost all sstables
// need cleanup; if sstables are cleaned in parallel, we may need almost
// twice the disk space used by those sstables.
static thread_local semaphore sem(1);
return with_semaphore(sem, 1, [this, &sst] {
return this->compact_sstables(sstables::compaction_descriptor({ sst }, sst->get_sstable_level()), true);
});
});
});
}
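The `static thread_local semaphore sem(1)` in the hunk above serializes cleanups within a shard: every cleanup must take the single unit before compacting, so at most one sstable is being rewritten at a time. A minimal single-threaded analogue of that one-unit guard (this is not seastar's semaphore API, just an illustration of the invariant):

```cpp
#include <cassert>

// Single-threaded analogue of with_semaphore(sem, 1, ...): a one-unit guard
// that makes overlapping runs impossible, bounding extra disk usage to
// roughly one sstable rewrite at a time.
struct one_at_a_time {
    int units = 1;       // available units (capacity 1)
    int in_flight = 0;   // currently running tasks
    int max_seen = 0;    // highest observed concurrency

    template <typename Func>
    void run(Func&& f) {
        // a real reactor would wait here if units == 0; in this
        // single-threaded sketch a unit is always free
        --units;
        ++in_flight;
        if (in_flight > max_seen) { max_seen = in_flight; }
        f();
        --in_flight;
        ++units;
    }
};
```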
// FIXME: this is just an example, should be changed to something more general
@@ -1525,16 +1533,19 @@ future<> distributed_loader::open_sstable(distributed<database>& db, sstables::e
return db.invoke_on(column_family::calculate_shard_from_sstable_generation(comps.generation),
[&db, comps = std::move(comps), func = std::move(func)] (database& local) {
auto& cf = local.find_column_family(comps.ks, comps.cf);
auto f = sstables::sstable::load_shared_components(cf.schema(), cf._config.datadir, comps.generation, comps.version, comps.format);
return f.then([&db, comps = std::move(comps), func = std::move(func)] (sstables::sstable_open_info info) {
// shared components loaded, now opening sstable in all shards with shared components
return do_with(std::move(info), [&db, comps = std::move(comps), func = std::move(func)] (auto& info) {
return invoke_all_with_ptr(db, std::move(info.components),
[owners = info.owners, data = info.data.dup(), index = info.index.dup(), comps, func] (database& db, auto components) {
auto& cf = db.find_column_family(comps.ks, comps.cf);
return func(cf, sstables::foreign_sstable_open_info{std::move(components), owners, data, index});
return with_semaphore(local.sstable_load_concurrency_sem(), 1, [&db, &local, comps = std::move(comps), func = std::move(func)] {
auto& cf = local.find_column_family(comps.ks, comps.cf);
auto f = sstables::sstable::load_shared_components(cf.schema(), cf._config.datadir, comps.generation, comps.version, comps.format);
return f.then([&db, comps = std::move(comps), func = std::move(func)] (sstables::sstable_open_info info) {
// shared components loaded, now opening sstable in all shards with shared components
return do_with(std::move(info), [&db, comps = std::move(comps), func = std::move(func)] (auto& info) {
return invoke_all_with_ptr(db, std::move(info.components),
[owners = info.owners, data = info.data.dup(), index = info.index.dup(), comps, func] (database& db, auto components) {
auto& cf = db.find_column_family(comps.ks, comps.cf);
return func(cf, sstables::foreign_sstable_open_info{std::move(components), owners, data, index});
});
});
});
});
@@ -1706,7 +1717,7 @@ future<> distributed_loader::populate_column_family(distributed<database>& db, s
return make_ready_future<>();
});
}).then([verifier, sstdir, descriptor, ks = std::move(ks), cf = std::move(cf)] {
return parallel_for_each(*verifier, [sstdir = std::move(sstdir), ks = std::move(ks), cf = std::move(cf), descriptor] (auto v) {
return do_for_each(*verifier, [sstdir = std::move(sstdir), ks = std::move(ks), cf = std::move(cf), descriptor, verifier] (auto v) {
if (v.second == status::has_temporary_toc_file) {
unsigned long gen = v.first;
assert(descriptor->version);
@@ -1745,9 +1756,9 @@ database::database(const db::config& cfg)
: _stats(make_lw_shared<db_stats>())
, _cfg(std::make_unique<db::config>(cfg))
// Allow system tables a pool of 10 MB memory to write, but never block on other regions.
, _system_dirty_memory_manager(*this, 10 << 20)
, _dirty_memory_manager(*this, memory::stats().total_memory() * 0.45)
, _streaming_dirty_memory_manager(*this, memory::stats().total_memory() * 0.10)
, _system_dirty_memory_manager(*this, 10 << 20, cfg.virtual_dirty_soft_limit())
, _dirty_memory_manager(*this, memory::stats().total_memory() * 0.45, cfg.virtual_dirty_soft_limit())
, _streaming_dirty_memory_manager(*this, memory::stats().total_memory() * 0.10, cfg.virtual_dirty_soft_limit())
, _version(empty_version)
, _enable_incremental_backups(cfg.incremental_backups())
{
@@ -1802,7 +1813,7 @@ database::setup_metrics() {
});
_metrics.add_group("database", {
sm::make_gauge("requests_blocked_memory", [this] { return _dirty_memory_manager.region_group().blocked_requests(); },
sm::make_gauge("requests_blocked_memory_current", [this] { return _dirty_memory_manager.region_group().blocked_requests(); },
sm::description(
seastar::format("Holds the current number of requests blocked due to reaching the memory quota ({}B). "
"Non-zero value indicates that our bottleneck is memory and more specifically - the memory quota allocated for the \"database\" component.", _dirty_memory_manager.throttle_threshold()))),
@@ -2663,7 +2674,7 @@ column_family::apply(const frozen_mutation& m, const schema_ptr& m_schema, const
do_apply(m, m_schema, rp);
}
future<frozen_mutation> database::do_apply_counter_update(column_family& cf, const frozen_mutation& fm, schema_ptr m_schema) {
future<frozen_mutation> database::do_apply_counter_update(column_family& cf, const frozen_mutation& fm, schema_ptr m_schema, timeout_clock::time_point timeout) {
auto m = fm.unfreeze(m_schema);
m.upgrade(cf.schema());
@@ -2689,9 +2700,9 @@ future<frozen_mutation> database::do_apply_counter_update(column_family& cf, con
cql_serialization_format::internal(), query::max_rows);
return do_with(std::move(slice), std::move(m), std::vector<locked_cell>(), stdx::optional<frozen_mutation>(),
[this, &cf] (const query::partition_slice& slice, mutation& m, std::vector<locked_cell>& locks,
[this, &cf, timeout] (const query::partition_slice& slice, mutation& m, std::vector<locked_cell>& locks,
stdx::optional<frozen_mutation>& fm) mutable {
return cf.lock_counter_cells(m).then([&, m_schema = cf.schema(), this] (std::vector<locked_cell> lcs) {
return cf.lock_counter_cells(m).then([&, timeout, m_schema = cf.schema(), this] (std::vector<locked_cell> lcs) {
locks = std::move(lcs);
// Before counter update is applied it needs to be transformed from
@@ -2702,7 +2713,7 @@ future<frozen_mutation> database::do_apply_counter_update(column_family& cf, con
return mutation_query(m_schema, cf.as_mutation_source({}),
dht::partition_range::make_singular(m.decorated_key()),
slice, query::max_rows, query::max_partitions,
gc_clock::now(), { }).then([this, &cf, &m, &fm, m_schema] (auto result) {
gc_clock::now(), { }).then([this, timeout, &cf, &m, &fm, m_schema] (auto result) {
// ...now, that we got existing state of all affected counter
// cells we can look for our shard in each of them, increment
@@ -2714,9 +2725,8 @@ future<frozen_mutation> database::do_apply_counter_update(column_family& cf, con
transform_counter_updates_to_shards(m, mopt ? &*mopt : nullptr, cf.failed_counter_applies_to_memtable());
// FIXME: oh dear, another freeze
// FIXME: timeout
fm = freeze(m);
return this->do_apply(m_schema, *fm, { });
return this->do_apply(m_schema, *fm, timeout);
}).then([&fm] {
return std::move(*fm);
});
@@ -2854,7 +2864,7 @@ future<> dirty_memory_manager::flush_when_needed() {
});
}
void dirty_memory_manager::start_reclaiming() {
void dirty_memory_manager::start_reclaiming() noexcept {
_should_flush.signal();
}
@@ -2876,7 +2886,7 @@ future<frozen_mutation> database::apply_counter_update(schema_ptr s, const froze
}
try {
auto& cf = find_column_family(m.column_family_id());
return do_apply_counter_update(cf, m, s);
return do_apply_counter_update(cf, m, s, timeout);
} catch (no_such_column_family&) {
dblog.error("Attempting to mutate non-existent table {}", m.column_family_id());
throw;
@@ -3103,6 +3113,10 @@ future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_fun
}
return f.then([&cf, truncated_at] {
return cf.discard_sstables(truncated_at).then([&cf, truncated_at](db::replay_position rp) {
// TODO: verify that rp == db::replay_position is because we have no sstables (and no data flushed)
if (rp == db::replay_position()) {
return make_ready_future();
}
// TODO: indexes.
return db::system_keyspace::save_truncation_record(cf, truncated_at, rp);
});


@@ -149,7 +149,7 @@ class dirty_memory_manager: public logalloc::region_group_reclaimer {
std::unordered_map<const logalloc::region*, flush_permit> _flush_manager;
future<> _waiting_flush;
virtual void start_reclaiming() override;
virtual void start_reclaiming() noexcept override;
bool has_pressure() const {
return over_soft_limit();
@@ -193,8 +193,8 @@ public:
//
// We then set the soft limit to 80 % of the virtual dirty hard limit, which is equal to 40 % of
// the user-supplied threshold.
dirty_memory_manager(database& db, size_t threshold)
: logalloc::region_group_reclaimer(threshold / 2, threshold * 0.40)
dirty_memory_manager(database& db, size_t threshold, double soft_limit)
: logalloc::region_group_reclaimer(threshold / 2, threshold * soft_limit / 2)
, _db(&db)
, _region_group(*this)
, _flush_serializer(1)
@@ -1076,6 +1076,7 @@ private:
::cf_stats _cf_stats;
static constexpr size_t max_concurrent_reads() { return 100; }
static constexpr size_t max_system_concurrent_reads() { return 10; }
static constexpr size_t max_concurrent_sstable_loads() { return 3; }
struct db_stats {
uint64_t total_writes = 0;
uint64_t total_writes_failed = 0;
@@ -1101,6 +1102,8 @@ private:
semaphore _system_read_concurrency_sem{max_system_concurrent_reads()};
restricted_mutation_reader_config _system_read_concurrency_config;
semaphore _sstable_load_concurrency_sem{max_concurrent_sstable_loads()};
std::unordered_map<sstring, keyspace> _keyspaces;
std::unordered_map<utils::UUID, lw_shared_ptr<column_family>> _column_families;
std::unordered_map<std::pair<sstring, sstring>, utils::UUID, utils::tuple_hash> _ks_cf_to_uuid;
@@ -1126,7 +1129,7 @@ private:
query::result_memory_limiter _result_memory_limiter;
future<frozen_mutation> do_apply_counter_update(column_family& cf, const frozen_mutation& fm, schema_ptr m_schema);
future<frozen_mutation> do_apply_counter_update(column_family& cf, const frozen_mutation& fm, schema_ptr m_schema, timeout_clock::time_point timeout);
public:
static utils::UUID empty_version;
@@ -1257,6 +1260,9 @@ public:
semaphore& system_keyspace_read_concurrency_sem() {
return _system_read_concurrency_sem;
}
semaphore& sstable_load_concurrency_sem() {
return _sstable_load_concurrency_sem;
}
friend class distributed_loader;
};


@@ -1588,7 +1588,7 @@ db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type
bool failed = false;
work(file f, position_type o = 0)
: f(f), fin(make_file_input_stream(f, o, make_file_input_stream_options())), start_off(o) {
: f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options())), start_off(o) {
}
work(work&&) = default;


@@ -34,48 +34,26 @@
#include "idl/mutation.dist.impl.hh"
#include "idl/commitlog.dist.impl.hh"
commitlog_entry::commitlog_entry(stdx::optional<column_mapping> mapping, frozen_mutation&& mutation)
: _mapping(std::move(mapping))
, _mutation_storage(std::move(mutation))
, _mutation(*_mutation_storage)
{ }
commitlog_entry::commitlog_entry(stdx::optional<column_mapping> mapping, const frozen_mutation& mutation)
: _mapping(std::move(mapping))
, _mutation(mutation)
{ }
commitlog_entry::commitlog_entry(commitlog_entry&& ce)
: _mapping(std::move(ce._mapping))
, _mutation_storage(std::move(ce._mutation_storage))
, _mutation(_mutation_storage ? *_mutation_storage : ce._mutation)
{
}
commitlog_entry& commitlog_entry::operator=(commitlog_entry&& ce)
{
if (this != &ce) {
this->~commitlog_entry();
new (this) commitlog_entry(std::move(ce));
}
return *this;
}
commitlog_entry commitlog_entry_writer::get_entry() const {
if (_with_schema) {
return commitlog_entry(_schema->get_column_mapping(), _mutation);
} else {
return commitlog_entry({}, _mutation);
}
template<typename Output>
void commitlog_entry_writer::serialize(Output& out) const {
[this, wr = ser::writer_of_commitlog_entry<Output>(out)] () mutable {
if (_with_schema) {
return std::move(wr).write_mapping(_schema->get_column_mapping());
} else {
return std::move(wr).skip_mapping();
}
}().write_mutation(_mutation).end_commitlog_entry();
}
void commitlog_entry_writer::compute_size() {
_size = ser::get_sizeof(get_entry());
seastar::measuring_output_stream ms;
serialize(ms);
_size = ms.size();
}
void commitlog_entry_writer::write(data_output& out) const {
seastar::simple_output_stream str(out.reserve(size()), size());
ser::serialize(str, get_entry());
serialize(str);
}
commitlog_entry_reader::commitlog_entry_reader(const temporary_buffer<char>& buffer)


@@ -31,15 +31,10 @@ namespace stdx = std::experimental;
class commitlog_entry {
stdx::optional<column_mapping> _mapping;
stdx::optional<frozen_mutation> _mutation_storage;
const frozen_mutation& _mutation;
frozen_mutation _mutation;
public:
commitlog_entry(stdx::optional<column_mapping> mapping, frozen_mutation&& mutation);
commitlog_entry(stdx::optional<column_mapping> mapping, const frozen_mutation& mutation);
commitlog_entry(commitlog_entry&&);
commitlog_entry(const commitlog_entry&) = delete;
commitlog_entry& operator=(commitlog_entry&&);
commitlog_entry& operator=(const commitlog_entry&) = delete;
commitlog_entry(stdx::optional<column_mapping> mapping, frozen_mutation&& mutation)
: _mapping(std::move(mapping)), _mutation(std::move(mutation)) { }
const stdx::optional<column_mapping>& mapping() const { return _mapping; }
const frozen_mutation& mutation() const { return _mutation; }
};
@@ -50,8 +45,9 @@ class commitlog_entry_writer {
bool _with_schema = true;
size_t _size;
private:
template<typename Output>
void serialize(Output&) const;
void compute_size();
commitlog_entry get_entry() const;
public:
commitlog_entry_writer(schema_ptr s, const frozen_mutation& fm)
: _schema(std::move(s)), _mutation(fm)
@@ -88,4 +84,4 @@ public:
const stdx::optional<column_mapping>& get_column_mapping() const { return _ce.mapping(); }
const frozen_mutation& mutation() const { return _ce.mutation(); }
};
};


@@ -61,13 +61,19 @@
static logging::logger logger("commitlog_replayer");
struct column_mappings {
std::unordered_map<table_schema_version, column_mapping> map;
future<> stop() { return make_ready_future<>(); }
};
class db::commitlog_replayer::impl {
seastar::sharded<column_mappings> _column_mappings;
struct column_mappings {
std::unordered_map<table_schema_version, column_mapping> map;
future<> stop() { return make_ready_future<>(); }
};
// we want the processing methods to be const, since they use
// shard-sharing of data -> read only
// this one is special since it is thread local.
// Should actually make sharded::local a const function (it does
// not modify content), but...
mutable seastar::sharded<column_mappings> _column_mappings;
friend class db::commitlog_replayer;
public:
impl(seastar::sharded<cql3::query_processor>& db);
@@ -94,13 +100,35 @@ public:
}
};
future<> process(stats*, temporary_buffer<char> buf, replay_position rp);
future<stats> recover(sstring file);
// move start/stop of the thread local bookkeep to "top level"
// and also make sure to assert on it actually being started.
future<> start() {
return _column_mappings.start();
}
future<> stop() {
return _column_mappings.stop();
}
future<> process(stats*, temporary_buffer<char> buf, replay_position rp) const;
future<stats> recover(sstring file) const;
typedef std::unordered_map<utils::UUID, replay_position> rp_map;
typedef std::unordered_map<unsigned, rp_map> shard_rpm_map;
typedef std::unordered_map<unsigned, replay_position> shard_rp_map;
replay_position min_pos(unsigned shard) const {
auto i = _min_pos.find(shard);
return i != _min_pos.end() ? i->second : replay_position();
}
replay_position cf_min_pos(const utils::UUID& uuid, unsigned shard) const {
auto i = _rpm.find(shard);
if (i == _rpm.end()) {
return replay_position();
}
auto j = i->second.find(uuid);
return j != i->second.end() ? j->second : replay_position();
}
seastar::sharded<cql3::query_processor>&
_qp;
shard_rpm_map
@@ -175,7 +203,6 @@ future<> db::commitlog_replayer::impl::init() {
}
}
}
for (auto&p : _min_pos) {
logger.debug("minimum position for shard {}: {}", p.first, p.second);
}
@@ -188,9 +215,11 @@ future<> db::commitlog_replayer::impl::init() {
}
future<db::commitlog_replayer::impl::stats>
db::commitlog_replayer::impl::recover(sstring file) {
db::commitlog_replayer::impl::recover(sstring file) const {
assert(_column_mappings.local_is_initialized());
replay_position rp{commitlog::descriptor(file)};
auto gp = _min_pos[rp.shard_id()];
auto gp = min_pos(rp.shard_id());
if (rp.id < gp.id) {
logger.debug("skipping replay of fully-flushed {}", file);
@@ -220,7 +249,7 @@ db::commitlog_replayer::impl::recover(sstring file) {
});
}
future<> db::commitlog_replayer::impl::process(stats* s, temporary_buffer<char> buf, replay_position rp) {
future<> db::commitlog_replayer::impl::process(stats* s, temporary_buffer<char> buf, replay_position rp) const {
try {
commitlog_entry_reader cer(buf);
@@ -238,17 +267,16 @@ future<> db::commitlog_replayer::impl::process(stats* s, temporary_buffer<char>
const column_mapping& src_cm = cm_it->second;
auto shard_id = rp.shard_id();
if (rp < _min_pos[shard_id]) {
if (rp < min_pos(shard_id)) {
logger.trace("entry {} is less than global min position. skipping", rp);
s->skipped_mutations++;
return make_ready_future<>();
}
auto uuid = fm.column_family_id();
auto& map = _rpm[shard_id];
auto i = map.find(uuid);
if (i != map.end() && rp <= i->second) {
logger.trace("entry {} at {} is younger than recorded replay position {}. skipping", fm.column_family_id(), rp, i->second);
auto cf_rp = cf_min_pos(uuid, shard_id);
if (rp <= cf_rp) {
logger.trace("entry {} at {} is younger than recorded replay position {}. skipping", fm.column_family_id(), rp, cf_rp);
s->skipped_mutations++;
return make_ready_future<>();
}
@@ -323,42 +351,55 @@ future<db::commitlog_replayer> db::commitlog_replayer::create_replayer(seastar::
}
future<> db::commitlog_replayer::recover(std::vector<sstring> files) {
return _impl->_column_mappings.start().then([this, files = std::move(files)] {
typedef std::unordered_multimap<unsigned, sstring> shard_file_map;
logger.info("Replaying {}", join(", ", files));
return map_reduce(files, [this](auto f) {
logger.debug("Replaying {}", f);
return _impl->recover(f).then([f](impl::stats stats) {
if (stats.corrupt_bytes != 0) {
logger.warn("Corrupted file: {}. {} bytes skipped.", f, stats.corrupt_bytes);
}
logger.debug("Log replay of {} complete, {} replayed mutations ({} invalid, {} skipped)"
, f
, stats.applied_mutations
, stats.invalid_mutations
, stats.skipped_mutations
// pre-compute work per shard already.
auto map = ::make_lw_shared<shard_file_map>();
for (auto& f : files) {
commitlog::descriptor d(f);
replay_position p = d;
map->emplace(p.shard_id() % smp::count, std::move(f));
}
return _impl->start().then([this, map] {
return map_reduce(smp::all_cpus(), [this, map](unsigned id) {
return smp::submit_to(id, [this, id, map]() {
auto total = ::make_lw_shared<impl::stats>();
// TODO: or something. For now, we do this serialized per shard,
// to reduce mutation congestion. We could probably (says avi)
// do 2 segments in parallel or something, but lets use this first.
auto range = map->equal_range(id);
return do_for_each(range.first, range.second, [this, total](const std::pair<unsigned, sstring>& p) {
auto&f = p.second;
logger.debug("Replaying {}", f);
return _impl->recover(f).then([f, total](impl::stats stats) {
if (stats.corrupt_bytes != 0) {
logger.warn("Corrupted file: {}. {} bytes skipped.", f, stats.corrupt_bytes);
}
logger.debug("Log replay of {} complete, {} replayed mutations ({} invalid, {} skipped)"
, f
, stats.applied_mutations
, stats.invalid_mutations
, stats.skipped_mutations
);
*total += stats;
});
}).then([total] {
return make_ready_future<impl::stats>(*total);
});
});
}, impl::stats(), std::plus<impl::stats>()).then([](impl::stats totals) {
logger.info("Log replay complete, {} replayed mutations ({} invalid, {} skipped)"
, totals.applied_mutations
, totals.invalid_mutations
, totals.skipped_mutations
);
return make_ready_future<impl::stats>(stats);
}).handle_exception([f](auto ep) -> future<impl::stats> {
logger.error("Error recovering {}: {}", f, ep);
try {
std::rethrow_exception(ep);
} catch (std::invalid_argument&) {
logger.error("Scylla cannot process {}. Make sure to fully flush all Cassandra commit log files to sstable before migrating.", f);
throw;
} catch (...) {
throw;
}
});
}, impl::stats(), std::plus<impl::stats>()).then([](impl::stats totals) {
logger.info("Log replay complete, {} replayed mutations ({} invalid, {} skipped)"
, totals.applied_mutations
, totals.invalid_mutations
, totals.skipped_mutations
);
}).finally([this] {
return _impl->_column_mappings.stop();
return _impl->stop();
});
});
}
future<> db::commitlog_replayer::recover(sstring f) {


@@ -739,6 +739,7 @@ public:
val(prometheus_prefix, sstring, "scylla", Used, "Set the prefix of the exported Prometheus metrics. Changing this will break Scylla's dashboard compatibility, do not change unless you know what you are doing.") \
val(abort_on_lsa_bad_alloc, bool, false, Used, "Abort when allocation in LSA region fails") \
val(murmur3_partitioner_ignore_msb_bits, unsigned, 0, Used, "Number of most significant token bits to ignore in murmur3 partitioner; increase for very large clusters") \
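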
val(virtual_dirty_soft_limit, double, 0.6, Used, "Soft limit of virtual dirty memory expressed as a portion of the hard limit") \
/* done! */
#define _make_value_member(name, type, deflt, status, desc, ...) \


@@ -77,6 +77,15 @@ namespace schema_tables {
logging::logger logger("schema_tables");
struct push_back_and_return {
std::vector<mutation> muts;
std::vector<mutation> operator()(mutation&& m) {
muts.emplace_back(std::move(m));
return std::move(muts);
}
};
struct qualified_name {
sstring keyspace_name;
sstring table_name;
@@ -547,6 +556,14 @@ read_schema_partition_for_table(distributed<service::storage_proxy>& proxy, sche
return query_partition_mutation(proxy.local(), std::move(schema), std::move(cmd), std::move(keyspace_key));
}
future<mutation>
read_keyspace_mutation(distributed<service::storage_proxy>& proxy, const sstring& keyspace_name) {
schema_ptr s = keyspaces();
auto key = partition_key::from_singular(*s, keyspace_name);
auto cmd = make_lw_shared<query::read_command>(s->id(), s->version(), query::full_slice);
return query_partition_mutation(proxy.local(), std::move(s), std::move(cmd), std::move(key));
}
static semaphore the_merge_lock {1};
future<> merge_lock() {
@@ -832,39 +849,6 @@ static inline void collect_types(std::set<sstring>& keys, schema_result& result,
}
}
static inline void ensure_type_is_unused(distributed<service::storage_proxy>& proxy, user_type type)
{
// We don't want to drop a type unless it's not used anymore (mainly because
// if someone drops a type and recreates one with the same name but different
// definition with the previous name still in use, things can get messy).
// We have two places to check: 1) other user type that can nest the one
// we drop and 2) existing tables referencing the type (maybe in a nested
// way).
auto&& keyspace = type->_keyspace;
auto&& name = type->_name;
auto&& db = proxy.local().get_db().local();
auto&& ks = db.find_keyspace(type->_keyspace);
for (auto&& ut : ks.metadata()->user_types()->get_all_types() | boost::adaptors::map_values) {
if (ut->_keyspace == keyspace && ut->_name == name) {
continue;
}
if (ut->references_user_type(keyspace, name)) {
throw exceptions::invalid_request_exception(sprint("Cannot drop user type %s.%s as it is still used by user type %s", keyspace, type->get_name_as_string(), ut->get_name_as_string()));
}
}
for (auto&& cfm : ks.metadata()->cf_meta_data() | boost::adaptors::map_values) {
for (auto&& col : cfm->all_columns() | boost::adaptors::map_values) {
if (col->type->references_user_type(keyspace, name)) {
throw exceptions::invalid_request_exception(sprint("Cannot drop user type %s.%s as it is still used by table %s.%s", keyspace, type->get_name_as_string(), cfm->ks_name(), cfm->cf_name()));
}
}
}
}
// see the comments for merge_keyspaces()
static void merge_types(distributed<service::storage_proxy>& proxy, schema_result&& before, schema_result&& after)
{
@@ -898,10 +882,6 @@ static void merge_types(distributed<service::storage_proxy>& proxy, schema_resul
}
}
for (auto&& ut : dropped) {
ensure_type_is_unused(proxy, ut);
}
proxy.local().get_db().invoke_on_all([&created, &dropped, &altered] (database& db) {
return seastar::async([&] {
for (auto&& type : created) {
@@ -1182,19 +1162,18 @@ void add_type_to_schema_mutation(user_type type, api::timestamp_type timestamp,
mutations.emplace_back(std::move(m));
}
std::vector<mutation> make_create_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp)
future<std::vector<mutation>> make_create_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp)
{
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
std::vector<mutation> mutations;
add_type_to_schema_mutation(type, timestamp, mutations);
return mutations;
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
}
std::vector<mutation> make_drop_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp)
future<std::vector<mutation>> make_drop_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp)
{
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
std::vector<mutation> mutations;
schema_ptr s = usertypes();
auto pkey = partition_key::from_singular(*s, type->_keyspace);
auto ckey = clustering_key::from_singular(*s, type->get_name_as_string());
@@ -1202,19 +1181,21 @@ std::vector<mutation> make_drop_type_mutations(lw_shared_ptr<keyspace_metadata>
m.partition().apply_delete(*s, ckey, tombstone(timestamp, gc_clock::now()));
mutations.emplace_back(std::move(m));
return mutations;
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
}
/*
* Table metadata serialization/deserialization.
*/
std::vector<mutation> make_create_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp)
future<std::vector<mutation>> make_create_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp)
{
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
std::vector<mutation> mutations;
add_table_or_view_to_schema_mutation(table, timestamp, true, mutations);
return mutations;
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
}
static schema_mutations make_table_mutations(schema_ptr table, api::timestamp_type timestamp, bool with_columns_and_triggers)
@@ -1347,15 +1328,13 @@ static void make_update_columns_mutations(schema_ptr old_table,
mutations.emplace_back(std::move(columns_mutation));
}
std::vector<mutation> make_update_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace,
future<std::vector<mutation>> make_update_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace,
schema_ptr old_table,
schema_ptr new_table,
api::timestamp_type timestamp,
bool from_thrift)
{
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
std::vector<mutation> mutations;
add_table_or_view_to_schema_mutation(new_table, timestamp, false, mutations);
make_update_columns_mutations(std::move(old_table), std::move(new_table), timestamp, from_thrift, mutations);
@@ -1373,7 +1352,8 @@ std::vector<mutation> make_update_table_mutations(lw_shared_ptr<keyspace_metadat
addTriggerToSchemaMutation(newTable, trigger, timestamp, mutation);
#endif
return mutations;
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
}
static void make_drop_table_or_view_mutations(schema_ptr schema_table,
@@ -1390,10 +1370,9 @@ static void make_drop_table_or_view_mutations(schema_ptr schema_table,
}
}
std::vector<mutation> make_drop_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp)
future<std::vector<mutation>> make_drop_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp)
{
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
std::vector<mutation> mutations;
make_drop_table_or_view_mutations(columnfamilies(), std::move(table), timestamp, mutations);
#if 0
@@ -1405,7 +1384,8 @@ std::vector<mutation> make_drop_table_mutations(lw_shared_ptr<keyspace_metadata>
for (String indexName : Keyspace.open(keyspace.name).getColumnFamilyStore(table.cfName).getBuiltIndexes())
indexCells.addTombstone(indexCells.getComparator().makeCellName(indexName), ldt, timestamp);
#endif
return mutations;
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
}
static future<schema_mutations> read_table_mutations(distributed<service::storage_proxy>& proxy, const qualified_name& table, schema_ptr s)
@@ -1899,37 +1879,39 @@ schema_mutations make_schema_mutations(schema_ptr s, api::timestamp_type timesta
return s->is_view() ? make_view_mutations(view_ptr(s), timestamp, with_columns) : make_table_mutations(s, timestamp, with_columns);
}
std::vector<mutation> make_create_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp)
future<std::vector<mutation>> make_create_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp)
{
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
std::vector<mutation> mutations;
// And also the serialized base table.
auto base = keyspace->cf_meta_data().at(view->view_info()->base_name());
add_table_or_view_to_schema_mutation(base, timestamp, true, mutations);
add_table_or_view_to_schema_mutation(view, timestamp, true, mutations);
return mutations;
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
}
std::vector<mutation> make_update_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace,
future<std::vector<mutation>> make_update_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace,
view_ptr old_view,
view_ptr new_view,
api::timestamp_type timestamp)
{
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
std::vector<mutation> mutations;
// And also the serialized base table.
auto base = keyspace->cf_meta_data().at(new_view->view_info()->base_name());
add_table_or_view_to_schema_mutation(base, timestamp, true, mutations);
add_table_or_view_to_schema_mutation(new_view, timestamp, false, mutations);
make_update_columns_mutations(old_view, new_view, timestamp, false, mutations);
return mutations;
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
}
std::vector<mutation> make_drop_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp) {
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
auto mutations = make_create_keyspace_mutations(keyspace, timestamp, false);
future<std::vector<mutation>> make_drop_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp) {
std::vector<mutation> mutations;
make_drop_table_or_view_mutations(views(), view, timestamp, mutations);
return mutations;
// Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
return read_keyspace_mutation(service::get_storage_proxy(), keyspace->name()).then(push_back_and_return{std::move(mutations)});
}
#if 0


@@ -80,6 +80,7 @@ future<std::vector<frozen_mutation>> convert_schema_to_mutations(distributed<ser
future<schema_result_value_type>
read_schema_partition_for_keyspace(distributed<service::storage_proxy>& proxy, const sstring& schema_table_name, const sstring& keyspace_name);
future<mutation> read_keyspace_mutation(distributed<service::storage_proxy>&, const sstring& keyspace_name);
future<> merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations);
@@ -95,17 +96,17 @@ std::vector<mutation> make_drop_keyspace_mutations(lw_shared_ptr<keyspace_metada
lw_shared_ptr<keyspace_metadata> create_keyspace_from_schema_partition(const schema_result_value_type& partition);
std::vector<mutation> make_create_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp);
future<std::vector<mutation>> make_create_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp);
std::vector<user_type> create_types_from_schema_partition(const schema_result_value_type& result);
std::vector<mutation> make_drop_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp);
future<std::vector<mutation>> make_drop_type_mutations(lw_shared_ptr<keyspace_metadata> keyspace, user_type type, api::timestamp_type timestamp);
void add_type_to_schema_mutation(user_type type, api::timestamp_type timestamp, std::vector<mutation>& mutations);
std::vector<mutation> make_create_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp);
future<std::vector<mutation>> make_create_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp);
std::vector<mutation> make_update_table_mutations(
future<std::vector<mutation>> make_update_table_mutations(
lw_shared_ptr<keyspace_metadata> keyspace,
schema_ptr old_table,
schema_ptr new_table,
@@ -114,7 +115,7 @@ std::vector<mutation> make_update_table_mutations(
future<std::map<sstring, schema_ptr>> create_tables_from_tables_partition(distributed<service::storage_proxy>& proxy, const schema_result::mapped_type& result);
std::vector<mutation> make_drop_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp);
future<std::vector<mutation>> make_drop_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp);
future<schema_ptr> create_table_from_name(distributed<service::storage_proxy>& proxy, const sstring& keyspace, const sstring& table);
@@ -149,11 +150,11 @@ schema_mutations make_schema_mutations(schema_ptr s, api::timestamp_type timesta
void add_table_or_view_to_schema_mutation(schema_ptr view, api::timestamp_type timestamp, bool with_columns, std::vector<mutation>& mutations);
std::vector<mutation> make_create_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp);
future<std::vector<mutation>> make_create_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp);
std::vector<mutation> make_update_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr old_view, view_ptr new_view, api::timestamp_type timestamp);
future<std::vector<mutation>> make_update_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr old_view, view_ptr new_view, api::timestamp_type timestamp);
std::vector<mutation> make_drop_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp);
future<std::vector<mutation>> make_drop_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp);
sstring serialize_kind(column_kind kind);
column_kind deserialize_kind(sstring kind);


@@ -21,6 +21,7 @@
#include "byte_ordered_partitioner.hh"
#include "utils/class_registrator.hh"
#include "utils/div_ceil.hh"
#include <boost/multiprecision/cpp_int.hpp>
#include <boost/multiprecision/cpp_dec_float.hpp>
@@ -162,22 +163,17 @@ byte_ordered_partitioner::shard_of(const token& t) const {
}
token
byte_ordered_partitioner::token_for_next_shard(const token& t) const {
byte_ordered_partitioner::token_for_next_shard(const token& t, shard_id shard, unsigned spans) const {
switch (t._kind) {
case token::kind::before_all_keys:
return token_for_next_shard(token(token::kind::key, managed_bytes{int8_t(0)}));
case token::kind::after_all_keys:
return maximum_token();
case token::kind::before_all_keys:
case token::kind::key:
auto s = shard_of(t) + 1;
if (s == _shard_count) {
auto orig = shard_of(t);
if (shard <= orig || spans != 1) {
return maximum_token();
}
auto e = (s << 8) / _shard_count;
// Division truncates; adjust
while (((e * _shard_count) >> 8) != s) {
++e;
}
auto e = div_ceil(shard << 8, _shard_count);
return token(token::kind::key, managed_bytes({int8_t(e)}));
}
assert(0);


@@ -29,10 +29,9 @@
namespace dht {
class byte_ordered_partitioner final : public i_partitioner {
unsigned _shard_count;
public:
byte_ordered_partitioner(unsigned shard_count = smp::count, unsigned ignore_msb = 0) : _shard_count(shard_count) {}
virtual const sstring name() { return "org.apache.cassandra.dht.ByteOrderedPartitioner"; }
byte_ordered_partitioner(unsigned shard_count = smp::count, unsigned ignore_msb = 0) : i_partitioner(shard_count) {}
virtual const sstring name() const { return "org.apache.cassandra.dht.ByteOrderedPartitioner"; }
virtual token get_token(const schema& s, partition_key_view key) override {
auto&& legacy = key.legacy_form(s);
return token(token::kind::key, bytes(legacy.begin(), legacy.end()));
@@ -75,7 +74,7 @@ public:
}
}
virtual unsigned shard_of(const token& t) const override;
virtual token token_for_next_shard(const token& t) const override;
virtual token token_for_next_shard(const token& t, shard_id shard, unsigned spans) const override;
};
}

View File

@@ -25,6 +25,7 @@
#include "utils/class_registrator.hh"
#include "types.hh"
#include "utils/murmur_hash.hh"
#include "utils/div_ceil.hh"
#include <boost/range/adaptor/map.hpp>
#include <boost/range/irange.hpp>
#include <boost/range/adaptor/transformed.hpp>
@@ -160,7 +161,7 @@ std::ostream& operator<<(std::ostream& out, const decorated_key& dk) {
}
// FIXME: make it per-keyspace
std::unique_ptr<i_partitioner> default_partitioner { new murmur3_partitioner };
std::unique_ptr<i_partitioner> default_partitioner;
void set_global_partitioner(const sstring& class_name, unsigned ignore_msb)
{
@@ -176,6 +177,9 @@ void set_global_partitioner(const sstring& class_name, unsigned ignore_msb)
i_partitioner&
global_partitioner() {
if (!default_partitioner) {
default_partitioner = std::make_unique<murmur3_partitioner>(smp::count, 12);
}
return *default_partitioner;
}
@@ -256,8 +260,9 @@ ring_position_range_sharder::next(const schema& s) {
if (_done) {
return {};
}
auto shard = _range.start() ? shard_of(_range.start()->value().token()) : global_partitioner().shard_of_minimum_token();
auto shard_boundary_token = _partitioner.token_for_next_shard(_range.start() ? _range.start()->value().token() : minimum_token());
auto shard = _range.start() ? _partitioner.shard_of(_range.start()->value().token()) : _partitioner.shard_of_minimum_token();
auto next_shard = shard + 1 < _partitioner.shard_count() ? shard + 1 : 0;
auto shard_boundary_token = _partitioner.token_for_next_shard(_range.start() ? _range.start()->value().token() : minimum_token(), next_shard);
auto shard_boundary = ring_position::starting_at(shard_boundary_token);
if ((!_range.end() || shard_boundary.less_compare(s, _range.end()->value()))
&& shard_boundary_token != maximum_token()) {
@@ -273,6 +278,96 @@ ring_position_range_sharder::next(const schema& s) {
return ring_position_range_and_shard{std::move(_range), shard};
}
ring_position_exponential_sharder::ring_position_exponential_sharder(const i_partitioner& partitioner, partition_range pr)
: _partitioner(partitioner)
, _range(std::move(pr))
, _last_ends(_partitioner.shard_count()) {
if (_range.start()) {
_first_shard = _next_shard = _partitioner.shard_of(_range.start()->value().token());
}
}
ring_position_exponential_sharder::ring_position_exponential_sharder(partition_range pr)
: ring_position_exponential_sharder(global_partitioner(), std::move(pr)) {
}
stdx::optional<ring_position_exponential_sharder_result>
ring_position_exponential_sharder::next(const schema& s) {
auto ret = ring_position_exponential_sharder_result{};
ret.per_shard_ranges.reserve(std::min(_spans_per_iteration, _partitioner.shard_count()));
ret.inorder = _spans_per_iteration <= _partitioner.shard_count();
unsigned spans_to_go = _spans_per_iteration;
auto cmp = ring_position_comparator(s);
auto spans_per_shard = _spans_per_iteration / _partitioner.shard_count();
auto shards_with_extra_span = _spans_per_iteration % _partitioner.shard_count();
auto first_shard = _next_shard;
_next_shard = (_next_shard + _spans_per_iteration) % _partitioner.shard_count();
for (auto i : boost::irange(0u, std::min(_partitioner.shard_count(), _spans_per_iteration))) {
auto shard = (first_shard + i) % _partitioner.shard_count();
if (_last_ends[shard] && *_last_ends[shard] == maximum_token()) {
continue;
}
range_bound<ring_position> this_shard_start = [&] {
if (_last_ends[shard]) {
return range_bound<ring_position>(ring_position::starting_at(*_last_ends[shard]));
} else {
return _range.start().value_or(range_bound<ring_position>(ring_position::starting_at(minimum_token())));
}
}();
// token_for_next_shard() may give us the wrong boundary on the first pass, so add an extra span:
auto extra_span = !_last_ends[shard] && shard != _first_shard;
auto spans = spans_per_shard + unsigned(i < shards_with_extra_span);
auto boundary = _partitioner.token_for_next_shard(this_shard_start.value().token(), shard, spans + extra_span);
auto proposed_range = partition_range(this_shard_start, range_bound<ring_position>(ring_position::starting_at(boundary), false));
auto intersection = _range.intersection(proposed_range, cmp);
if (!intersection) {
continue;
}
spans_to_go -= spans;
auto this_shard_result = ring_position_range_and_shard{std::move(*intersection), shard};
_last_ends[shard] = boundary;
ret.per_shard_ranges.push_back(std::move(this_shard_result));
}
if (ret.per_shard_ranges.empty()) {
return stdx::nullopt;
}
_spans_per_iteration *= 2;
return stdx::make_optional(std::move(ret));
}
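How each `next()` call spreads `_spans_per_iteration` across shards can be sketched in isolation (Python, illustrative only; it mirrors the `spans_per_shard` / `shards_with_extra_span` arithmetic above but omits range intersection and the `_last_ends` bookkeeping):

```python
def distribute(spans_per_iteration, shard_count, first_shard):
    # each of min(shard_count, spans_per_iteration) shards gets
    # spans_per_iteration // shard_count spans; the first
    # spans_per_iteration % shard_count of them get one extra
    per_shard = spans_per_iteration // shard_count
    extra = spans_per_iteration % shard_count
    return [((first_shard + i) % shard_count,
             per_shard + (1 if i < extra else 0))
            for i in range(min(shard_count, spans_per_iteration))]

# _spans_per_iteration doubles every call: 1, 2, 4, 8, ...
assert distribute(1, 4, 0) == [(0, 1)]
assert distribute(2, 4, 1) == [(1, 1), (2, 1)]
assert distribute(4, 4, 3) == [(3, 1), (0, 1), (1, 1), (2, 1)]
assert distribute(8, 4, 3) == [(3, 2), (0, 2), (1, 2), (2, 2)]
```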
ring_position_exponential_vector_sharder::ring_position_exponential_vector_sharder(const std::vector<nonwrapping_range<ring_position>>& ranges)
: _ranges(std::begin(ranges), std::end(ranges)) {
if (!_ranges.empty()) {
_current_sharder.emplace(_ranges.front());
_ranges.pop_front();
++_element;
}
}
stdx::optional<ring_position_exponential_vector_sharder_result>
ring_position_exponential_vector_sharder::next(const schema& s) {
if (!_current_sharder) {
return stdx::nullopt;
}
while (true) { // yuch
auto ret = _current_sharder->next(s);
if (ret) {
auto augmented = ring_position_exponential_vector_sharder_result{std::move(*ret), _element};
return stdx::make_optional(std::move(augmented));
}
if (_ranges.empty()) {
_current_sharder = stdx::nullopt;
return stdx::nullopt;
}
_current_sharder.emplace(_ranges.front());
_ranges.pop_front();
++_element;
}
}
ring_position_range_vector_sharder::ring_position_range_vector_sharder(dht::partition_range_vector ranges)
: _ranges(std::move(ranges))
, _current_range(_ranges.begin()) {
@@ -300,6 +395,33 @@ int ring_position_comparator::operator()(const ring_position& lh, const ring_pos
return lh.tri_compare(s, rh);
}
std::vector<partition_range>
split_range_to_single_shard(const i_partitioner& partitioner, const schema& s, const partition_range& pr, shard_id shard) {
auto cmp = ring_position_comparator(s);
auto ret = std::vector<partition_range>();
auto next_shard = shard + 1 == partitioner.shard_count() ? 0 : shard + 1;
auto start_token = pr.start() ? pr.start()->value().token() : minimum_token();
auto start_shard = partitioner.shard_of(start_token);
auto start_boundary = start_shard == shard ? pr.start() : range_bound<ring_position>(ring_position::starting_at(partitioner.token_for_next_shard(start_token, shard)));
while (pr.overlaps(partition_range(start_boundary, {}), cmp)
&& !(start_boundary && start_boundary->value().token() == maximum_token())) {
auto end_token = partitioner.token_for_next_shard(start_token, next_shard);
auto candidate = partition_range(std::move(start_boundary), range_bound<ring_position>(ring_position::starting_at(end_token), false));
auto intersection = pr.intersection(std::move(candidate), cmp);
if (intersection) {
ret.push_back(std::move(*intersection));
}
start_token = partitioner.token_for_next_shard(end_token, shard);
start_boundary = range_bound<ring_position>(ring_position::starting_at(start_token));
}
return ret;
}
std::vector<partition_range>
split_range_to_single_shard(const schema& s, const partition_range& pr, shard_id shard) {
return split_range_to_single_shard(global_partitioner(), s, pr, shard);
}
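The walk performed by `split_range_to_single_shard` can be sketched in a toy model (Python, illustrative only): tokens are the integers 0..255, `shard_of` is the byte-ordered mapping, and there is no msb-ignore wraparound, so each shard owns a single contiguous interval. These helpers are assumptions of the sketch, not Scylla's API:

```python
def shard_of(t, count):
    # toy byte-ordered model: tokens are 0..255
    return (t * count) >> 8

def token_for_next_shard(t, shard, count):
    # first token of `shard`, or 256 (playing maximum_token()) if not after t
    b = -(-(shard << 8) // count)
    return b if b > t else 256

def split_range_to_single_shard(start, end, shard, count):
    # collect the sub-intervals of [start, end) owned by `shard`
    next_shard = 0 if shard + 1 == count else shard + 1
    pos = start if shard_of(start, count) == shard \
        else token_for_next_shard(start, shard, count)
    ret = []
    while pos < end:
        boundary = token_for_next_shard(pos, next_shard, count)
        ret.append((pos, min(boundary, end)))
        pos = token_for_next_shard(boundary, shard, count)
    return ret

# with 4 shards, shard 1 owns tokens [64, 128)
assert split_range_to_single_shard(0, 256, 1, 4) == [(64, 128)]
assert split_range_to_single_shard(100, 120, 1, 4) == [(100, 120)]
```

In the real code the `spans` machinery lets a shard own many intervals around the ring; in this simplified model the loop terminates after at most one.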
int token_comparator::operator()(const token& t1, const token& t2) const {
return tri_compare(t1, t2);
}

View File

@@ -180,7 +180,10 @@ public:
using decorated_key_opt = std::experimental::optional<decorated_key>;
class i_partitioner {
protected:
unsigned _shard_count;
public:
explicit i_partitioner(unsigned shard_count) : _shard_count(shard_count) {}
virtual ~i_partitioner() {}
/**
@@ -272,7 +275,7 @@ public:
/**
* @return name of partitioner.
*/
virtual const sstring name() = 0;
virtual const sstring name() const = 0;
/**
* Calculates the shard that handles a particular token.
@@ -280,9 +283,17 @@ public:
virtual unsigned shard_of(const token& t) const = 0;
/**
* Gets the first token greater than `t` that is not in the same shard as `t`.
* Gets the first token greater than `t` that is in shard `shard`, and is a shard boundary (its first token).
*
* If the `spans` parameter is greater than zero, the result is the same as if the function
* is called `spans` times, each time applied to its return value, but efficiently. This allows
* selecting ranges that include multiple round trips around the 0..smp::count-1 shard span:
*
 *     token_for_next_shard(t, shard, spans) == token_for_next_shard(token_for_next_shard(t, shard, 1), shard, spans - 1)
*
* On overflow, maximum_token() is returned.
*/
virtual token token_for_next_shard(const token& t) const = 0;
virtual token token_for_next_shard(const token& t, shard_id shard, unsigned spans = 1) const = 0;
/**
* Gets the first shard of the minimum token.
@@ -315,6 +326,13 @@ public:
return tri_compare(t1, t2) < 0;
}
/**
* @return number of shards configured for this partitioner
*/
unsigned shard_count() const {
return _shard_count;
}
friend bool operator==(const token& t1, const token& t2);
friend bool operator<(const token& t1, const token& t2);
friend int tri_compare(const token& t1, const token& t2);
@@ -476,6 +494,44 @@ struct ring_position_range_and_shard_and_element : ring_position_range_and_shard
unsigned element;
};
struct ring_position_exponential_sharder_result {
std::vector<ring_position_range_and_shard> per_shard_ranges;
bool inorder = true;
};
// given a ring_position range, generates exponentially increasing
// sets of per-shard sub-ranges
class ring_position_exponential_sharder {
const i_partitioner& _partitioner;
partition_range _range;
unsigned _spans_per_iteration = 1;
unsigned _first_shard = 0;
unsigned _next_shard = 0;
std::vector<stdx::optional<token>> _last_ends; // index = shard
public:
explicit ring_position_exponential_sharder(partition_range pr);
explicit ring_position_exponential_sharder(const i_partitioner& partitioner, partition_range pr);
stdx::optional<ring_position_exponential_sharder_result> next(const schema& s);
};
struct ring_position_exponential_vector_sharder_result : ring_position_exponential_sharder_result {
ring_position_exponential_vector_sharder_result(ring_position_exponential_sharder_result rpesr, unsigned element)
: ring_position_exponential_sharder_result(std::move(rpesr)), element(element) {}
unsigned element; // range within vector from which this result came
};
// given a vector of sorted, disjoint ring_position ranges, generates exponentially increasing
// sets of per-shard sub-ranges. May be non-exponential when moving from one ring position range to another.
class ring_position_exponential_vector_sharder {
std::deque<nonwrapping_range<ring_position>> _ranges;
stdx::optional<ring_position_exponential_sharder> _current_sharder;
unsigned _element = 0;
public:
explicit ring_position_exponential_vector_sharder(const std::vector<nonwrapping_range<ring_position>>& ranges);
stdx::optional<ring_position_exponential_vector_sharder_result> next(const schema& s);
};
class ring_position_range_vector_sharder {
using vec_type = dht::partition_range_vector;
vec_type _ranges;
@@ -504,6 +560,10 @@ split_range_to_shards(dht::partition_range pr, const schema& s);
std::map<unsigned, dht::partition_range_vector>
split_ranges_to_shards(const dht::token_range_vector& ranges, const schema& s);
// Intersect a partition_range with a shard and return the resulting sub-ranges, in sorted order
std::vector<partition_range> split_range_to_single_shard(const schema& s, const dht::partition_range& pr, shard_id shard);
std::vector<partition_range> split_range_to_single_shard(const i_partitioner& partitioner, const schema& s, const dht::partition_range& pr, shard_id shard);
} // dht
namespace std {

View File

@@ -24,9 +24,40 @@
#include "sstables/key.hh"
#include "utils/class_registrator.hh"
#include <boost/lexical_cast.hpp>
#include <boost/range/irange.hpp>
namespace dht {
inline
unsigned
murmur3_partitioner::zero_based_shard_of(uint64_t token, unsigned shards, unsigned sharding_ignore_msb_bits) {
// This is the master function; the inverses have to match its rounding behavior.
token <<= sharding_ignore_msb_bits;
// Treat "token" as a fraction in the interval [0, 1); compute:
// shard = floor((0.token) * shards)
return (uint128_t(token) * shards) >> 64;
}
std::vector<uint64_t>
murmur3_partitioner::init_zero_based_shard_start(unsigned shards, unsigned sharding_ignore_msb_bits) {
// computes the inverse of zero_based_shard_of(). ret[s] will return the smallest token that belongs to s
if (shards == 1) {
// Avoid the while loops below getting confused finding the "edge" between two nonexistent shards
return std::vector<uint64_t>(1, uint64_t(0));
}
auto ret = std::vector<uint64_t>(shards);
for (auto s : boost::irange<unsigned>(0, shards)) {
uint64_t token = (uint128_t(s) << 64) / shards;
token >>= sharding_ignore_msb_bits; // leftmost bits are ignored by zero_based_shard_of
// token is the start of the next shard, and can be slightly before due to rounding errors; adjust
while (zero_based_shard_of(token, shards, sharding_ignore_msb_bits) != s) {
++token;
}
ret[s] = token;
}
return ret;
}
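The fraction-in-[0, 1) mapping and its truncation-corrected inverse translate directly to Python big-integer arithmetic. A sketch (illustrative only, mirroring `zero_based_shard_of` and `init_zero_based_shard_start` above):

```python
MASK = (1 << 64) - 1

def zero_based_shard_of(token, shards, ignore_msb):
    # treat the 64-bit token as a fraction in [0, 1):
    # shard = floor(0.token * shards)
    return (((token << ignore_msb) & MASK) * shards) >> 64

def init_shard_start(shards, ignore_msb):
    # inverse: smallest zero-based token owned by each shard
    if shards == 1:
        return [0]
    ret = []
    for s in range(shards):
        token = ((s << 64) // shards) >> ignore_msb
        # division truncates, so nudge forward until the shard matches
        while zero_based_shard_of(token, shards, ignore_msb) != s:
            token += 1
        ret.append(token)
    return ret

starts = init_shard_start(8, 2)
for s, t in enumerate(starts):
    assert zero_based_shard_of(t, 8, 2) == s
    # t is the *first* token of shard s
    assert t == 0 or zero_based_shard_of(t - 1, 8, 2) == s - 1
```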
inline
int64_t
murmur3_partitioner::normalize(int64_t in) {
@@ -88,6 +119,16 @@ inline int64_t long_token(const token& t) {
return net::ntoh(*lp);
}
uint64_t
murmur3_partitioner::unbias(const token& t) const {
return uint64_t(long_token(t)) + uint64_t(std::numeric_limits<int64_t>::min());
}
token
murmur3_partitioner::bias(uint64_t n) const {
return get_token(n - uint64_t(std::numeric_limits<int64_t>::min()));
}
sstring murmur3_partitioner::to_sstring(const token& t) const {
return ::to_sstring(long_token(t));
}
@@ -210,46 +251,43 @@ murmur3_partitioner::shard_of(const token& t) const {
case token::kind::after_all_keys:
return _shard_count - 1;
case token::kind::key:
int64_t l = long_token(t);
// treat l as a fraction between 0 and 1 and use 128-bit arithmetic to
// divide that range evenly among shards:
uint64_t adjusted = uint64_t(l) + uint64_t(std::numeric_limits<int64_t>::min());
adjusted <<= _sharding_ignore_msb_bits;
return (__int128(adjusted) * _shard_count) >> 64;
uint64_t adjusted = unbias(t);
return zero_based_shard_of(adjusted, _shard_count, _sharding_ignore_msb_bits);
}
assert(0);
}
token
murmur3_partitioner::token_for_next_shard(const token& t) const {
murmur3_partitioner::token_for_next_shard(const token& t, shard_id shard, unsigned spans) const {
uint64_t n = 0;
switch (t._kind) {
case token::kind::before_all_keys:
return token_for_next_shard(get_token(std::numeric_limits<int64_t>::min() + 1));
break;
case token::kind::after_all_keys:
return maximum_token();
case token::kind::key:
if (long_token(t) == std::numeric_limits<int64_t>::min()) {
return token_for_next_shard(get_token(std::numeric_limits<int64_t>::min() + 1));
}
using uint128 = unsigned __int128;
auto s = shard_of(t) + 1;
s = s < _shard_count ? s : 0;
int64_t l = long_token(t);
// treat l as a fraction between 0 and 1 and use 128-bit arithmetic to
// divide that range evenly among shards:
uint64_t adjusted = uint64_t(l) + uint64_t(std::numeric_limits<int64_t>::min());
auto mul = align_up(uint128(adjusted) * _shard_count + 1, uint128(1) << (64 - _sharding_ignore_msb_bits));
if (mul >> 64 == _shard_count) {
return maximum_token();
}
uint64_t e = mul / _shard_count;
while (((uint128(e << _sharding_ignore_msb_bits) * _shard_count) >> 64) != s) {
// division will round down, so correct for it
++e;
}
return get_token(e + uint64_t(std::numeric_limits<int64_t>::min()));
n = unbias(t);
break;
}
assert(0);
auto s = zero_based_shard_of(n, _shard_count, _sharding_ignore_msb_bits);
if (!_sharding_ignore_msb_bits) {
// This ought to be the same as the else branch, but avoids shifts by 64
n = _shard_start[shard];
if (spans > 1 || shard <= s) {
return maximum_token();
}
} else {
auto left_part = n >> (64 - _sharding_ignore_msb_bits);
left_part += spans - unsigned(shard > s);
if (left_part >= (1u << _sharding_ignore_msb_bits)) {
return maximum_token();
}
left_part <<= (64 - _sharding_ignore_msb_bits);
auto right_part = _shard_start[shard];
n = left_part | right_part;
}
return bias(n);
}
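The `ignore_msb > 0` branch above splits the token into a left part (which of the 2^msb ring wraps we are in) and a right part (the precomputed shard start). A sketch of that composition in the zero-based domain (Python, illustrative; fixed `shards=8`, `ignore_msb=2` are assumptions, and `None` plays the role of `maximum_token()`):

```python
MASK = (1 << 64) - 1
SHARDS, MSB = 8, 2

def zero_based_shard_of(token, shards, ignore_msb):
    return (((token << ignore_msb) & MASK) * shards) >> 64

# zero-based first token of each shard; exact for these parameters,
# so no truncation adjustment is needed here
SHARD_START = [((s << 64) // SHARDS) >> MSB for s in range(SHARDS)]

def token_for_next_shard(n, shard, spans=1):
    s = zero_based_shard_of(n, SHARDS, MSB)
    left = n >> (64 - MSB)
    left += spans - (1 if shard > s else 0)
    if left >= (1 << MSB):
        return None                      # overflowed past the last wrap
    return (left << (64 - MSB)) | SHARD_START[shard]

t = token_for_next_shard(5, 3)           # jump from shard 0 to shard 3
assert zero_based_shard_of(t, SHARDS, MSB) == 3 and t > 5
t2 = token_for_next_shard(5, 3, spans=2) # same shard, one full ring wrap later
assert zero_based_shard_of(t2, SHARDS, MSB) == 3 and t2 > t
```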

View File

@@ -23,20 +23,21 @@
#include "i_partitioner.hh"
#include "bytes.hh"
#include <vector>
namespace dht {
class murmur3_partitioner final : public i_partitioner {
unsigned _shard_count;
unsigned _sharding_ignore_msb_bits;
std::vector<uint64_t> _shard_start = init_zero_based_shard_start(_shard_count, _sharding_ignore_msb_bits);
public:
murmur3_partitioner(unsigned shard_count = smp::count, unsigned sharding_ignore_msb_bits = 0)
: _shard_count(shard_count)
: i_partitioner(shard_count)
// if one shard, ignore sharding_ignore_msb_bits as they will just cause needless
// range breaks
, _sharding_ignore_msb_bits(shard_count > 1 ? sharding_ignore_msb_bits : 0) {
}
virtual const sstring name() { return "org.apache.cassandra.dht.Murmur3Partitioner"; }
virtual const sstring name() const { return "org.apache.cassandra.dht.Murmur3Partitioner"; }
virtual token get_token(const schema& s, partition_key_view key) override;
virtual token get_token(const sstables::key_view& key) override;
virtual token get_random_token() override;
@@ -50,11 +51,16 @@ public:
virtual dht::token from_bytes(bytes_view bytes) const override;
virtual unsigned shard_of(const token& t) const override;
virtual token token_for_next_shard(const token& t) const override;
virtual token token_for_next_shard(const token& t, shard_id shard, unsigned spans) const override;
private:
using uint128_t = unsigned __int128;
static int64_t normalize(int64_t in);
token get_token(bytes_view key);
token get_token(uint64_t value) const;
token bias(uint64_t value) const; // translate from a zero-based range
uint64_t unbias(const token& t) const; // translate to a zero-based range
static unsigned zero_based_shard_of(uint64_t zero_based_token, unsigned shards, unsigned sharding_ignore_msb_bits);
static std::vector<uint64_t> init_zero_based_shard_start(unsigned shards, unsigned sharding_ignore_msb_bits);
};

View File

@@ -22,6 +22,7 @@
#include "md5_hasher.hh"
#include "random_partitioner.hh"
#include "utils/class_registrator.hh"
#include "utils/div_ceil.hh"
#include <boost/multiprecision/cpp_int.hpp>
namespace dht {
@@ -222,21 +223,20 @@ unsigned random_partitioner::shard_of(const token& t) const {
}
token
random_partitioner::token_for_next_shard(const token& t) const {
random_partitioner::token_for_next_shard(const token& t, shard_id shard, unsigned spans) const {
if (_shard_count == 1) {
return maximum_token();
}
switch (t._kind) {
case token::kind::after_all_keys:
return maximum_token();
case token::kind::before_all_keys:
case token::kind::key:
auto s = shard_of(t) + 1;
if (s == _shard_count) {
auto orig = shard_of(t);
if (shard <= orig || spans != 1) {
return maximum_token();
}
auto t = (boost::multiprecision::uint256_t(s) << 127) / _shard_count;
// division truncates, so adjust
while (((t * _shard_count) >> 127) != s) {
++t;
}
auto t = div_ceil(boost::multiprecision::uint256_t(shard) << 127, _shard_count);
return cppint_to_token(t.convert_to<boost::multiprecision::uint128_t>());
}
assert(0);

View File

@@ -29,10 +29,9 @@
namespace dht {
class random_partitioner final : public i_partitioner {
unsigned _shard_count;
public:
random_partitioner(unsigned shard_count = smp::count, unsigned ignore_msb = 0) : _shard_count(shard_count) {}
virtual const sstring name() { return "org.apache.cassandra.dht.RandomPartitioner"; }
random_partitioner(unsigned shard_count = smp::count, unsigned ignore_msb = 0) : i_partitioner(shard_count) {}
virtual const sstring name() const { return "org.apache.cassandra.dht.RandomPartitioner"; }
virtual token get_token(const schema& s, partition_key_view key) override;
virtual token get_token(const sstables::key_view& key) override;
virtual token get_random_token() override;
@@ -46,7 +45,7 @@ public:
virtual dht::token from_sstring(const sstring& t) const override;
virtual dht::token from_bytes(bytes_view bytes) const override;
virtual unsigned shard_of(const token& t) const override;
virtual token token_for_next_shard(const token& t) const override;
virtual token token_for_next_shard(const token& t, shard_id shard, unsigned spans) const override;
private:
token get_token(bytes data);
};

View File

@@ -0,0 +1 @@
options raid0 devices_discard_performance=Y

View File

@@ -27,11 +27,11 @@ if [ -f /usr/bin/node_exporter ]; then
exit 1
fi
version=0.12.0
version=0.14.0
dir=/usr/lib/scylla/Prometheus/node_exporter
mkdir -p $dir
cd $dir
curl -L https://github.com/prometheus/node_exporter/releases/download/$version/node_exporter-$version.linux-amd64.tar.gz -o $dir/node_exporter-$version.linux-amd64.tar.gz
curl -L https://github.com/prometheus/node_exporter/releases/download/v$version/node_exporter-$version.linux-amd64.tar.gz -o $dir/node_exporter-$version.linux-amd64.tar.gz
tar -xvzf $dir/node_exporter-$version.linux-amd64.tar.gz
rm $dir/node_exporter-$version.linux-amd64.tar.gz
ln -s $dir/node_exporter-$version.linux-amd64/node_exporter /usr/bin

View File

@@ -5,15 +5,20 @@
. /usr/lib/scylla/scylla_lib.sh
print_usage() {
echo "scylla-raid-setup --disks /dev/hda,/dev/hdb... --raiddev /dev/md0 --update-fstab"
echo "scylla-raid-setup --disks /dev/hda,/dev/hdb... --raiddev /dev/md0 --update-fstab --root /var/lib/scylla --volume-role [all|data|commitlog]"
echo " --disks specify disks for RAID"
echo " --raiddev MD device name for RAID"
echo " --update-fstab update /etc/fstab for RAID"
echo " --root specify the root of the tree"
echo " --volume-role specify how will this device be used (data, commitlog, or all)"
exit 1
}
RAID=/dev/md0
FSTAB=0
ROOT=/var/lib/scylla
ROLE="all"
while [ $# -gt 0 ]; do
case "$1" in
"--disks")
@@ -29,12 +34,37 @@ while [ $# -gt 0 ]; do
FSTAB=1
shift 1
;;
"--root")
ROOT="$2"
shift 2
;;
"--volume-role")
ROLE="$2"
shift 2
;;
*)
print_usage
;;
esac
done
ROOT=${ROOT%/}
case "$ROLE" in
"all")
MOUNT_AT=$ROOT
;;
"data")
MOUNT_AT="$ROOT/data"
;;
"commitlog")
MOUNT_AT="$ROOT/commitlog"
;;
*)
echo "Invalid role specified ($ROLE)"
print_usage
;;
esac
if [ "$DISKS" = "" ]; then
print_usage
fi
@@ -51,8 +81,8 @@ if [ -e $RAID ]; then
echo "$RAID is already in use"
exit 1
fi
if [ "`mount|grep /var/lib/scylla`" != "" ]; then
echo "/var/lib/scylla is already mounted"
if mountpoint -q $MOUNT_AT; then
echo "$MOUNT_AT is already mounted"
exit 1
fi
@@ -61,18 +91,32 @@ if is_debian_variant; then
else
yum -y install mdadm xfsprogs
fi
mdadm --create --verbose --force --run $RAID --level=0 -c1024 --raid-devices=$NR_DISK $DISKS
mkfs.xfs $RAID -f
echo "DEVICE $DISKS" > /etc/mdadm.conf
mdadm --detail --scan >> /etc/mdadm.conf
if [ "$ID" = "ubuntu" ] && [ "$VERSION_ID" = "14.04" ]; then
mdadm --create --verbose --force --run $RAID --level=0 -c1024 --raid-devices=$NR_DISK $DISKS
mkfs.xfs $RAID -f
else
for dsk in $DISKS; do
blkdiscard $dsk &
done
wait
mdadm --create --verbose --force --run $RAID --level=0 -c1024 --raid-devices=$NR_DISK $DISKS
mkfs.xfs $RAID -f -K
fi
mdadm --detail --scan > /etc/mdadm.conf
mkdir -p "$MOUNT_AT"
mount -t xfs -o noatime $RAID "$MOUNT_AT"
# create this unconditionally so we are more robust about ordering
# if the script is run multiple times. But must do after mount in case
# we are mounting the root
mkdir -p "$ROOT/data"
mkdir -p "$ROOT/commitlog"
mkdir -p "$ROOT/coredump"
chown scylla:scylla "$ROOT"
chown scylla:scylla "$ROOT"/*
if [ $FSTAB -ne 0 ]; then
UUID=`blkid $RAID | awk '{print $2}'`
echo "$UUID /var/lib/scylla xfs noatime 0 0" >> /etc/fstab
echo "$UUID $MOUNT_AT xfs noatime 0 0" >> /etc/fstab
fi
mount -t xfs -o noatime $RAID /var/lib/scylla
mkdir -p /var/lib/scylla/data
mkdir -p /var/lib/scylla/commitlog
mkdir -p /var/lib/scylla/coredump
chown scylla:scylla /var/lib/scylla/*
chown scylla:scylla /var/lib/scylla/

View File

@@ -81,7 +81,7 @@ verify_package() {
}
list_block_devices() {
if lsblk --help | grep -q -e -p; then
if lsblk --help | grep -q -e '^\s*-p'; then
lsblk -pnr | awk '{ print $1 }'
else
ls -1 /dev/sd* /dev/hd* /dev/xvd* /dev/nvme* /dev/mapper/* 2>/dev/null|grep -v control
@@ -218,6 +218,9 @@ while [ $# -gt 0 ]; do
print_usage
shift 1
;;
*)
echo "Invalid option: $@"
print_usage
esac
done
@@ -267,21 +270,24 @@ if [ $ENABLE_SERVICE -eq 1 ]; then
printf "[housekeeping]\ncheck-version: True\n" > /etc/scylla.d/housekeeping.cfg
fi
if is_systemd; then
systemctl unmask scylla-housekeeping.timer
systemctl unmask scylla-housekeeping-daily.timer
systemctl unmask scylla-housekeeping-restart.timer
fi
else
if [ ! -f /etc/scylla.d/housekeeping.cfg ]; then
printf "[housekeeping]\ncheck-version: False\n" > /etc/scylla.d/housekeeping.cfg
fi
if is_systemd; then
systemctl mask scylla-housekeeping.timer
systemctl stop scylla-housekeeping.timer || true
systemctl mask scylla-housekeeping-daily.timer
systemctl mask scylla-housekeeping-restart.timer
systemctl stop scylla-housekeeping-daily.timer || true
systemctl stop scylla-housekeeping-restart.timer || true
fi
fi
fi
CUR_VERSION=`scylla --version` || true
if [ "$CUR_VERSION" != "" ] && [ "$UUID" != "" ]; then
if [ "$CUR_VERSION" != "" ]; then
NEW_VERSION=`sudo -u scylla /usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid version --version $CUR_VERSION --mode i` || true
if [ "$NEW_VERSION" != "" ]; then
echo $NEW_VERSION

View File

@@ -5,7 +5,7 @@ kernel.sched_tunable_scaling = 0
kernel.sched_min_granularity_ns = 500000
# Don't delay unrelated workloads
kernel.sched_wakeup_granularity_ns = 500000
kernel.sched_wakeup_granularity_ns = 450000
# Schedule all tasks in this period
kernel.sched_latency_ns = 1000000

View File

@@ -0,0 +1,12 @@
[Unit]
Description=Scylla Housekeeping daily mode
After=network.target
[Service]
Type=simple
User=scylla
Group=scylla
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid --repo-files '/etc/yum.repos.d/scylla*.repo' -q -c /etc/scylla.d/housekeeping.cfg version --mode d
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,11 @@
[Unit]
Description=Run Scylla Housekeeping daily mode
After=scylla-server.service
BindsTo=scylla-server.service
[Timer]
OnActiveSec=1d
OnUnitActiveSec=1d
[Install]
WantedBy=timers.target

View File

@@ -0,0 +1,12 @@
[Unit]
Description=Scylla Housekeeping restart mode
After=network.target
[Service]
Type=simple
User=scylla
Group=scylla
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q --repo-files '/etc/yum.repos.d/scylla*.repo' -c /etc/scylla.d/housekeeping.cfg version --mode r
[Install]
WantedBy=multi-user.target

View File

@@ -1,12 +1,11 @@
[Unit]
Description=Run Scylla Housekeeping daily
Description=Run Scylla Housekeeping restart mode
After=scylla-server.service
BindsTo=scylla-server.service
[Timer]
# set OnActiveSec to 3 to safely avoid issues/1846
OnActiveSec=3
OnUnitActiveSec=1d
[Install]
WantedBy=timers.target

View File

@@ -1,12 +0,0 @@
[Unit]
Description=Scylla Housekeeping
After=network.target
[Service]
Type=simple
User=scylla
Group=scylla
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg version --mode d
[Install]
WantedBy=multi-user.target

View File

@@ -2,7 +2,8 @@
Description=Scylla Server
After=network.target
Wants=scylla-jmx.service
Wants=scylla-housekeeping.timer
Wants=scylla-housekeeping-restart.timer
Wants=scylla-housekeeping-daily.timer
[Service]
PermissionsStartOnly=true

View File

@@ -7,6 +7,14 @@ print_usage() {
echo " --rebuild-dep rebuild dependency packages"
exit 1
}
install_deps() {
echo Y | sudo mk-build-deps
DEB_FILE=`ls *-build-deps*.deb`
sudo gdebi -n $DEB_FILE
sudo rm -f $DEB_FILE
sudo dpkg -P ${DEB_FILE%%_*.deb}
}
REBUILD=0
DIST=0
while [ $# -gt 0 ]; do
@@ -54,6 +62,9 @@ fi
if [ ! -f /usr/bin/lsb_release ]; then
sudo apt-get -y install lsb-release
fi
if [ ! -f /usr/bin/gdebi ]; then
sudo apt-get -y install gdebi-core
fi
DISTRIBUTION=`lsb_release -i|awk '{print $3}'`
CODENAME=`lsb_release -c|awk '{print $2}'`
@@ -84,7 +95,8 @@ if [ "$DISTRIBUTION" = "Debian" ]; then
sed -i -e "s/@@COMPILER@@/g++-5/g" debian/rules
sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++-5, libunwind-dev/g" debian/control
sed -i -e "s#@@INSTALL@@##g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER@@#dist/common/systemd/scylla-housekeeping.timer /lib/systemd/system#g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER_D@@#dist/common/systemd/scylla-housekeeping-daily.timer /lib/systemd/system#g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER_R@@#dist/common/systemd/scylla-housekeeping-restart.timer /lib/systemd/system#g" debian/scylla-server.install
sed -i -e "s#@@SYSCTL@@#dist/debian/sysctl.d/99-scylla.conf etc/sysctl.d#g" debian/scylla-server.install
elif [ "$VERSION_ID" = "14.04" ]; then
sed -i -e "s/@@REVISION@@/0ubuntu1/g" debian/changelog
@@ -92,7 +104,8 @@ elif [ "$VERSION_ID" = "14.04" ]; then
sed -i -e "s/@@COMPILER@@/g++-5/g" debian/rules
sed -i -e "s/@@BUILD_DEPENDS@@/g++-5, libunwind8-dev/g" debian/control
sed -i -e "s#@@INSTALL@@#dist/debian/sudoers.d/scylla etc/sudoers.d#g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER@@##g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER_D@@##g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER_R@@##g" debian/scylla-server.install
sed -i -e "s#@@SYSCTL@@#dist/debian/sysctl.d/99-scylla.conf etc/sysctl.d#g" debian/scylla-server.install
else
sed -i -e "s/@@REVISION@@/0ubuntu1/g" debian/changelog
@@ -100,7 +113,8 @@ else
sed -i -e "s/@@COMPILER@@/g++/g" debian/rules
sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++, libunwind-dev/g" debian/control
sed -i -e "s#@@INSTALL@@##g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER@@#dist/common/systemd/scylla-housekeeping.timer /lib/systemd/system#g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER_D@@#dist/common/systemd/scylla-housekeeping-daily.timer /lib/systemd/system#g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER_R@@#dist/common/systemd/scylla-housekeeping-restart.timer /lib/systemd/system#g" debian/scylla-server.install
sed -i -e "s#@@SYSCTL@@##g" debian/scylla-server.install
fi
if [ $DIST -gt 0 ]; then
@@ -116,7 +130,8 @@ fi
cp dist/common/systemd/scylla-server.service.in debian/scylla-server.service
sed -i -e "s#@@SYSCONFDIR@@#/etc/default#g" debian/scylla-server.service
cp dist/common/systemd/scylla-housekeeping.service debian/scylla-server.scylla-housekeeping.service
cp dist/common/systemd/scylla-housekeeping-daily.service debian/scylla-server.scylla-housekeeping-daily.service
cp dist/common/systemd/scylla-housekeeping-restart.service debian/scylla-server.scylla-housekeeping-restart.service
cp dist/common/systemd/node-exporter.service debian/scylla-server.node-exporter.service
if [ "$VERSION_ID" = "14.04" ] && [ $REBUILD -eq 0 ]; then
@@ -140,5 +155,5 @@ else
sudo apt-get install g++
fi
echo Y | sudo mk-build-deps -i -r
install_deps
debuild -r fakeroot -us -uc

View File

@@ -4,7 +4,7 @@ Homepage: http://scylladb.com
Section: database
Priority: optional
Standards-Version: 3.9.5
Build-Depends: debhelper (>= 9), libyaml-cpp-dev, liblz4-dev, libsnappy-dev, libcrypto++-dev, libjsoncpp-dev, libaio-dev, libthrift-dev, thrift-compiler, antlr3, antlr3-c++-dev, ragel, ninja-build, git, libboost-program-options1.55-dev | libboost-program-options-dev, libboost-filesystem1.55-dev | libboost-filesystem-dev, libboost-system1.55-dev | libboost-system-dev, libboost-thread1.55-dev | libboost-thread-dev, libboost-test1.55-dev | libboost-test-dev, libgnutls28-dev, libhwloc-dev, libnuma-dev, libpciaccess-dev, xfslibs-dev, python3-pyparsing, libxml2-dev, libsctp-dev, python-urwid, pciutils, libprotobuf-dev, protobuf-compiler, systemtap-sdt-dev, @@BUILD_DEPENDS@@
Build-Depends: debhelper (>= 9), libyaml-cpp-dev, liblz4-dev, libsnappy-dev, libcrypto++-dev, libjsoncpp-dev, libaio-dev, libthrift-dev, thrift-compiler, antlr3, antlr3-c++-dev, ragel, ninja-build, git, libboost-program-options1.55-dev | libboost-program-options-dev, libboost-filesystem1.55-dev | libboost-filesystem-dev, libboost-system1.55-dev | libboost-system-dev, libboost-thread1.55-dev | libboost-thread-dev, libboost-test1.55-dev | libboost-test-dev, libgnutls28-dev, libhwloc-dev, libnuma-dev, libpciaccess-dev, xfslibs-dev, python3-pyparsing, libxml2-dev, libsctp-dev, python-urwid, pciutils, libprotobuf-dev, protobuf-compiler, systemtap-sdt-dev, libtool, automake, @@BUILD_DEPENDS@@
Package: scylla-conf
Architecture: any

View File

@@ -29,10 +29,10 @@ setgid scylla
script
# make sure scylla is up before checking for the version
sleep 5
/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -c /etc/scylla.d/housekeeping.cfg -q version --mode r || true
/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid --repo-files '/etc/apt/sources.list.d/scylla*.list' -c /etc/scylla.d/housekeeping.cfg -q version --mode r || true
while [ 1 ]
do
sleep 1d
/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -c /etc/scylla.d/housekeeping.cfg -q version --mode d || true
/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid --repo-files '/etc/apt/sources.list.d/scylla*.list' -c /etc/scylla.d/housekeeping.cfg -q version --mode d || true
done
end script

View File

@@ -41,6 +41,7 @@ script
fi
. "$i"
done
export SCYLLA_CONF SCYLLA_HOME
exec /usr/bin/scylla $SCYLLA_ARGS $SEASTAR_IO $DEV_MODE $CPUSET
end script

View File

@@ -1,7 +1,25 @@
#!/bin/bash -e
. /etc/os-release
install_deps() {
echo Y | sudo mk-build-deps
DEB_FILE=`ls *-build-deps*.deb`
sudo gdebi -n $DEB_FILE
sudo rm -f $DEB_FILE
sudo dpkg -P ${DEB_FILE%%_*.deb}
}
DISTRIBUTION=`lsb_release -i|awk '{print $3}'`
CODENAME=`lsb_release -c|awk '{print $2}'`
# workaround fix for #2444
if [ "$CODENAME" = "jessie" ]; then
if [ ! -e /etc/apt/sources.list.d/jessie-backports.list ]; then
sudo sh -c 'echo deb "http://httpredir.debian.org/debian jessie-backports main" > /etc/apt/sources.list.d/jessie-backports.list'
fi
sudo apt-get -y update
sudo apt-get install -t jessie-backports -y texlive
fi
sudo apt-get install -y gdebi-core
if [ "$VERSION_ID" = "14.04" ] || [ "$DISTRIBUTION" = "Debian" ]; then
@@ -11,7 +29,7 @@ if [ "$VERSION_ID" = "14.04" ] || [ "$DISTRIBUTION" = "Debian" ]; then
cp -a dist/debian/dep/antlr3-3.5.2/* build/antlr3-3.5.2
cd build/antlr3-3.5.2
wget -nv http://www.antlr3.org/download/antlr-3.5.2-complete-no-st3.jar
echo Y | sudo mk-build-deps -i -r
install_deps
debuild -r fakeroot --no-tgz-check -us -uc
cd -
fi
@@ -39,7 +57,7 @@ if [ "$VERSION_ID" = "14.04" ] || [ "$DISTRIBUTION" = "Debian" ]; then
cd -
cd build/gdb-7.11
patch -p0 < ../../dist/debian/dep/gdb.diff
echo Y | sudo mk-build-deps -i -r
install_deps
debuild -r fakeroot --no-tgz-check -us -uc
cd -
fi
@@ -56,7 +74,7 @@ if [ ! -f build/antlr3-c++-dev_*.deb ]; then
cd -
cp -a dist/debian/dep/antlr3-c++-dev-3.5.2/debian build/antlr3-c++-dev-3.5.2
cd build/antlr3-c++-dev-3.5.2
echo Y | sudo mk-build-deps -i -r
install_deps
debuild -r fakeroot --no-tgz-check -us -uc
cd -
fi
@@ -70,17 +88,18 @@ if [ ! -f build/libthrift0_*.deb ]; then
tar xpf thrift-0.9.3.tar.gz
cd thrift-0.9.3
patch -p0 < ../../dist/debian/dep/thrift.diff
echo Y | sudo mk-build-deps -i -r
install_deps
debuild -r fakeroot --no-tgz-check -us -uc
cd ../..
fi
if [ "$DISTRIBUTION" = "Debian" ] && [ "$VERSION_ID" = "8" ]; then
if [ ! -f build/gcc-5_*.deb ]; then
sudo cp dist/debian/dep/debian-stretch-source.list /etc/apt/sources.list.d/
sudo apt-get update
cd build
apt-get source gcc-5/stretch=5.4.1-2
wget https://launchpad.net/debian/+archive/primary/+files/gcc-5_5.4.1-5.dsc
wget https://launchpad.net/debian/+archive/primary/+files/gcc-5_5.4.1.orig.tar.gz
wget https://launchpad.net/debian/+archive/primary/+files/gcc-5_5.4.1-5.diff.gz
dpkg-source -x gcc-5_5.4.1-5.dsc
cd gcc-5-5.4.1
# resolve build-time dependencies manually, since mk-build-deps doesn't work for the gcc package
sudo apt-get install -y g++-multilib libc6-dev-i386 lib32gcc1 libc6-dev-x32 libx32gcc1 libc6-dbg m4 libtool autoconf2.64 autogen gawk zlib1g-dev systemtap-sdt-dev gperf bison flex gdb texinfo locales sharutils libantlr-java libffi-dev gnat-4.9 libisl-dev libmpc-dev libmpfr-dev libgmp-dev dejagnu realpath chrpath quilt doxygen graphviz ghostscript texlive-latex-base xsltproc libxml2-utils docbook-xsl-ns

View File

@@ -1,6 +1,5 @@
diff -Nur debian/rules.conf /home/syuu/gcc-5-5.4.1/debian/rules.conf
--- debian/rules.conf 2016-10-14 04:54:21.000000000 +0000
+++ /home/syuu/gcc-5-5.4.1/debian/rules.conf 2016-10-12 17:28:54.138711378 +0000
--- debian/rules.conf 2017-02-24 19:02:52.000000000 +0000
+++ /home/syuu/gcc-5.5/gcc-5-5.4.1/debian/rules.conf 2017-02-24 18:13:59.000000000 +0000
@@ -206,7 +206,7 @@
ifneq (,$(filter $(distrelease),vivid))
BINUTILSBDV = 2.25-3~
@@ -10,14 +9,16 @@ diff -Nur debian/rules.conf /home/syuu/gcc-5-5.4.1/debian/rules.conf
else ifneq (,$(filter $(distrelease),sid stretch xenial))
BINUTILSBDV = 2.26.1
endif
@@ -387,9 +387,9 @@
@@ -386,10 +386,10 @@
MPFR_BUILD_DEP = libmpfr-dev (>= 3.0.0-9~),
endif
ISL_BUILD_DEP = libisl-dev,
-ifneq (,$(filter $(distrelease),jessie sid experimental))
-ISL_BUILD_DEP = libisl-dev,
-ifneq (,$(filter $(distrelease),jessie stretch sid experimental))
- ISL_BUILD_DEP = libisl-dev (>= 0.14),
-endif
+#ifneq (,$(filter $(distrelease),jessie sid experimental))
+#ISL_BUILD_DEP = libisl-dev,
+#ifneq (,$(filter $(distrelease),jessie stretch sid experimental))
+# ISL_BUILD_DEP = libisl-dev (>= 0.14),
+#endif
@@ -37,9 +38,8 @@ diff -Nur debian/rules.conf /home/syuu/gcc-5-5.4.1/debian/rules.conf
ifneq ($(DEB_CROSS),yes)
# all archs for which to create b-d's
any_archs = alpha amd64 armel armhf arm64 i386 mips mipsel mips64 mips64el powerpc ppc64 ppc64el m68k sh4 sparc64 s390x x32
diff -Nur debian/rules.defs /home/syuu/gcc-5-5.4.1/debian/rules.defs
--- debian/rules.defs 2016-10-14 04:54:21.000000000 +0000
+++ /home/syuu/gcc-5-5.4.1/debian/rules.defs 2016-10-13 10:18:51.647631508 +0000
--- debian/rules.defs 2017-02-24 19:02:52.000000000 +0000
+++ /home/syuu/gcc-5.5/gcc-5-5.4.1/debian/rules.defs 2017-02-24 18:13:59.000000000 +0000
@@ -412,7 +412,7 @@
# gcc versions (fixincludes, libgcj-common) ...
#with_common_pkgs := yes

View File

@@ -1,2 +0,0 @@
deb-src http://httpredir.debian.org/debian stretch main
deb-src http://httpredir.debian.org/debian stretch-updates main

View File

@@ -11,7 +11,8 @@ override_dh_auto_clean:
override_dh_installinit:
dh_installinit --no-start @@DH_INSTALLINIT@@
dh_installinit --no-start --name scylla-housekeeping @@DH_INSTALLINIT@@
dh_installinit --no-start --name scylla-housekeeping-daily @@DH_INSTALLINIT@@
dh_installinit --no-start --name scylla-housekeeping-restart @@DH_INSTALLINIT@@
dh_installinit --no-start --name node-exporter @@DH_INSTALLINIT@@
override_dh_strip:

View File

@@ -15,6 +15,7 @@ build/release/iotune usr/bin
dist/common/bin/scyllatop usr/bin
dist/common/sbin/* usr/sbin
@@ADDHKCFG@@
@@HKDOTTIMER@@
@@HKDOTTIMER_D@@
@@HKDOTTIMER_R@@
@@INSTALL@@
@@SYSCTL@@

View File

@@ -7,7 +7,7 @@ ENV container docker
VOLUME [ "/sys/fs/cgroup" ]
#install scylla
RUN curl http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo -o /etc/yum.repos.d/scylla.repo
RUN curl http://downloads.scylladb.com/rpm/centos/scylla-1.7.repo -o /etc/yum.repos.d/scylla.repo
RUN yum -y install epel-release
RUN yum -y clean expire-cache
RUN yum -y update
@@ -38,6 +38,6 @@ ADD commandlineparser.py /commandlineparser.py
ADD docker-entrypoint.py /docker-entrypoint.py
ENTRYPOINT ["/docker-entrypoint.py"]
EXPOSE 10000 9042 9160 7000 7001
EXPOSE 10000 9042 9160 9180 7000 7001
VOLUME [ "/var/lib/scylla" ]
RUN chown -R scylla.scylla /var/lib/scylla

View File

@@ -28,10 +28,6 @@ if [ ! -f boost-1.58.0-11.fc23.src.rpm ]; then
wget -nv https://kojipkgs.fedoraproject.org//packages/boost/1.58.0/11.fc23/src/boost-1.58.0-11.fc23.src.rpm
fi
if [ ! -f ninja-build-1.6.0-2.fc23.src.rpm ]; then
wget -nv https://kojipkgs.fedoraproject.org//packages/ninja-build/1.6.0/2.fc23/src/ninja-build-1.6.0-2.fc23.src.rpm
fi
if [ ! -f ragel-6.8-5.fc23.src.rpm ]; then
wget -nv https://kojipkgs.fedoraproject.org//packages/ragel/6.8/5.fc23/src/ragel-6.8-5.fc23.src.rpm
fi
@@ -94,13 +90,6 @@ if [ ! -f $RPMBUILD/RPMS/x86_64/scylla-boost-1.58.0-11.el7*.x86_64.rpm ]; then
fi
do_install scylla-boost*
if [ ! -f $RPMBUILD/RPMS/x86_64/scylla-ninja-build-1.6.0-2.el7*.x86_64.rpm ]; then
rpm --define "_topdir $RPMBUILD" -ivh build/srpms/ninja-build-1.6.0-2.fc23.src.rpm
patch $RPMBUILD/SPECS/ninja-build.spec < dist/redhat/centos_dep/ninja-build.diff
rpmbuild --define "_topdir $RPMBUILD" -ba $RPMBUILD/SPECS/ninja-build.spec
fi
do_install scylla-ninja-build-1.6.0-2.el7*.x86_64.rpm
if [ ! -f $RPMBUILD/RPMS/x86_64/scylla-ragel-6.8-5.el7*.x86_64.rpm ]; then
rpm --define "_topdir $RPMBUILD" -ivh build/srpms/ragel-6.8-5.fc23.src.rpm
patch $RPMBUILD/SPECS/ragel.spec < dist/redhat/centos_dep/ragel.diff

View File

@@ -1,56 +0,0 @@
--- ninja-build.spec.orig 2016-01-20 14:41:16.892802134 +0000
+++ ninja-build.spec 2016-01-20 14:44:42.453227192 +0000
@@ -1,19 +1,18 @@
-Name: ninja-build
+Name: scylla-ninja-build
Version: 1.6.0
Release: 2%{?dist}
Summary: A small build system with a focus on speed
License: ASL 2.0
URL: http://martine.github.com/ninja/
Source0: https://github.com/martine/ninja/archive/v%{version}.tar.gz#/ninja-%{version}.tar.gz
-Source1: ninja.vim
# Rename mentions of the executable name to be ninja-build.
Patch1000: ninja-1.6.0-binary-rename.patch
+Requires: scylla-env
BuildRequires: asciidoc
BuildRequires: gtest-devel
BuildRequires: python2-devel
-BuildRequires: re2c >= 0.11.3
-Requires: emacs-filesystem
-Requires: vim-filesystem
+#BuildRequires: scylla-re2c >= 0.11.3
+%define _prefix /opt/scylladb
%description
Ninja is a small build system with a focus on speed. It differs from other
@@ -32,15 +31,8 @@
./ninja -v ninja_test
%install
-# TODO: Install ninja_syntax.py?
-mkdir -p %{buildroot}/{%{_bindir},%{_datadir}/bash-completion/completions,%{_datadir}/emacs/site-lisp,%{_datadir}/vim/vimfiles/syntax,%{_datadir}/vim/vimfiles/ftdetect,%{_datadir}/zsh/site-functions}
-
+mkdir -p %{buildroot}/opt/scylladb/bin
install -pm755 ninja %{buildroot}%{_bindir}/ninja-build
-install -pm644 misc/bash-completion %{buildroot}%{_datadir}/bash-completion/completions/ninja-bash-completion
-install -pm644 misc/ninja-mode.el %{buildroot}%{_datadir}/emacs/site-lisp/ninja-mode.el
-install -pm644 misc/ninja.vim %{buildroot}%{_datadir}/vim/vimfiles/syntax/ninja.vim
-install -pm644 %{SOURCE1} %{buildroot}%{_datadir}/vim/vimfiles/ftdetect/ninja.vim
-install -pm644 misc/zsh-completion %{buildroot}%{_datadir}/zsh/site-functions/_ninja
%check
# workaround possible too low default limits
@@ -50,12 +42,6 @@
%files
%doc COPYING HACKING.md README doc/manual.html
%{_bindir}/ninja-build
-%{_datadir}/bash-completion/completions/ninja-bash-completion
-%{_datadir}/emacs/site-lisp/ninja-mode.el
-%{_datadir}/vim/vimfiles/syntax/ninja.vim
-%{_datadir}/vim/vimfiles/ftdetect/ninja.vim
-# zsh does not have a -filesystem package
-%{_datadir}/zsh/
%changelog
* Mon Nov 16 2015 Ben Boeckel <mathstuf@gmail.com> - 1.6.0-2

View File

@@ -27,9 +27,9 @@ Group: Applications/Databases
Summary: The Scylla database server
License: AGPLv3
URL: http://www.scylladb.com/
BuildRequires: libaio-devel libstdc++-devel cryptopp-devel hwloc-devel numactl-devel libpciaccess-devel libxml2-devel zlib-devel thrift-devel yaml-cpp-devel lz4-devel snappy-devel jsoncpp-devel systemd-devel xz-devel pcre-devel elfutils-libelf-devel bzip2-devel keyutils-libs-devel xfsprogs-devel make gnutls-devel systemd-devel lksctp-tools-devel protobuf-devel protobuf-compiler libunwind-devel systemtap-sdt-devel
%{?fedora:BuildRequires: boost-devel ninja-build ragel antlr3-tool antlr3-C++-devel python3 gcc-c++ libasan libubsan python3-pyparsing dnf-yum}
%{?rhel:BuildRequires: scylla-libstdc++-static scylla-boost-devel scylla-boost-static scylla-ninja-build scylla-ragel scylla-antlr3-tool scylla-antlr3-C++-devel python34 scylla-gcc-c++ >= 5.1.1, python34-pyparsing}
BuildRequires: libaio-devel libstdc++-devel cryptopp-devel hwloc-devel numactl-devel libpciaccess-devel libxml2-devel zlib-devel thrift-devel yaml-cpp-devel lz4-devel snappy-devel jsoncpp-devel systemd-devel xz-devel pcre-devel elfutils-libelf-devel bzip2-devel keyutils-libs-devel xfsprogs-devel make gnutls-devel systemd-devel lksctp-tools-devel protobuf-devel protobuf-compiler libunwind-devel systemtap-sdt-devel libtool automake ninja-build
%{?fedora:BuildRequires: boost-devel ragel antlr3-tool antlr3-C++-devel python3 gcc-c++ libasan libubsan python3-pyparsing dnf-yum}
%{?rhel:BuildRequires: scylla-libstdc++-static scylla-boost-devel scylla-boost-static scylla-ragel scylla-antlr3-tool scylla-antlr3-C++-devel python34 scylla-gcc-c++ >= 5.1.1, python34-pyparsing}
Requires: scylla-conf systemd-libs hwloc collectd PyYAML python-urwid pciutils pyparsing python-requests curl util-linux python-setuptools pciutils
%{?rhel:Requires: python34 python34-PyYAML}
Conflicts: abrt
@@ -63,6 +63,9 @@ mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/security/limits.d/
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/collectd.d/
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/scylla.d/
%if 0%{?rhel}
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/
%endif
mkdir -p $RPM_BUILD_ROOT%{_sysctldir}/
mkdir -p $RPM_BUILD_ROOT%{_docdir}/scylla/
mkdir -p $RPM_BUILD_ROOT%{_unitdir}
@@ -73,6 +76,9 @@ install -m644 dist/common/limits.d/scylla.conf $RPM_BUILD_ROOT%{_sysconfdir}/sec
install -m644 dist/common/collectd.d/scylla.conf $RPM_BUILD_ROOT%{_sysconfdir}/collectd.d/
install -m644 dist/common/scylla.d/*.conf $RPM_BUILD_ROOT%{_sysconfdir}/scylla.d/
install -m644 dist/common/sysctl.d/*.conf $RPM_BUILD_ROOT%{_sysctldir}/
%if 0%{?rhel}
install -m644 dist/common/modprobe.d/*.conf $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/
%endif
install -d -m755 $RPM_BUILD_ROOT%{_sysconfdir}/scylla
install -m644 conf/scylla.yaml $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
install -m644 conf/cassandra-rackdc.properties $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
@@ -151,10 +157,8 @@ rm -rf $RPM_BUILD_ROOT
%{_docdir}/scylla/NOTICE.txt
%{_docdir}/scylla/ORIGIN
%{_docdir}/scylla/licenses/
%{_unitdir}/scylla-server.service
%{_unitdir}/scylla-housekeeping.service
%{_unitdir}/scylla-housekeeping.timer
%{_unitdir}/node-exporter.service
%{_unitdir}/*.service
%{_unitdir}/*.timer
%{_bindir}/scylla
%{_bindir}/iotune
%{_bindir}/scyllatop
@@ -228,6 +232,7 @@ Group: Applications/Databases
Summary: Scylla configuration package for the Linux kernel
License: AGPLv3
URL: http://www.scylladb.com/
Requires: kmod
%description kernel-conf
This package contains Linux kernel configuration changes for the Scylla database. Install this package
@@ -237,9 +242,18 @@ if Scylla is the main application on your server and you wish to optimize its la
# We cannot use the sysctl_apply rpm macro because it is not present in 7.0
# following is a "manual" expansion
/usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
# Write modprobe.d params when module already loaded
%if 0%{?rhel}
if [ -e /sys/module/raid0/parameters/devices_discard_performance ]; then
echo Y > /sys/module/raid0/parameters/devices_discard_performance
fi
%endif
%files kernel-conf
%defattr(-,root,root)
%if 0%{?rhel}
%config(noreplace) %{_sysconfdir}/modprobe.d/*.conf
%endif
%{_sysctldir}/*.conf
%changelog

View File

@@ -50,6 +50,12 @@ public:
// for real time waits.
};
// Returns a time point which is earlier from t by d, or minimum time point if it cannot be represented.
template<typename Clock, typename Duration, typename Rep, typename Period>
inline
auto saturating_subtract(std::chrono::time_point<Clock, Duration> t, std::chrono::duration<Rep, Period> d) -> decltype(t) {
return std::max(t, decltype(t)::min() + d) - d;
}
using expiry_opt = std::experimental::optional<gc_clock::time_point>;
using ttl_opt = std::experimental::optional<gc_clock::duration>;
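The new helper above clamps to `decltype(t)::min()` instead of letting the subtraction wrap around. A self-contained sketch of the same function using only standard `<chrono>` types (the surrounding `gc_clock` context is omitted):

```cpp
#include <algorithm>
#include <cassert>
#include <chrono>

// Returns a time point which is earlier from t by d, or the minimum time
// point if that cannot be represented (clamp instead of wrap-around).
template<typename Clock, typename Duration, typename Rep, typename Period>
inline
auto saturating_subtract(std::chrono::time_point<Clock, Duration> t,
                         std::chrono::duration<Rep, Period> d) -> decltype(t) {
    return std::max(t, decltype(t)::min() + d) - d;
}
```

Without the clamp, an expression like `query_time - s.gc_grace_seconds()` can underflow when `query_time` is near the clock's minimum, which is what the later `gc_before` hunks switch away from.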

View File

@@ -1135,6 +1135,15 @@ void gossiper::mark_alive(inet_address addr, endpoint_state& local_state) {
// real_mark_alive(addr, local_state);
// return;
// }
auto inserted = _pending_mark_alive_endpoints.insert(addr).second;
if (inserted) {
// The node is not in the _pending_mark_alive_endpoints
logger.debug("Mark Node {} alive with EchoMessage", addr);
} else {
// We are in the process of marking this node alive
logger.debug("Node {} is being marked as up, ignoring duplicated mark alive operation", addr);
return;
}
local_state.mark_dead();
msg_addr id = get_msg_addr(addr);
@@ -1143,10 +1152,22 @@ void gossiper::mark_alive(inet_address addr, endpoint_state& local_state) {
ms().send_gossip_echo(id).get();
logger.trace("Got EchoMessage Reply");
set_last_processed_message_at();
real_mark_alive(id.addr, local_state);
// After sending the echo message, the node might no longer be in
// endpoint_state_map; using the earlier local_state reference
// could cause a use-after-free
auto it = endpoint_state_map.find(addr);
if (it == endpoint_state_map.end()) {
logger.info("Node {} is not in endpoint_state_map anymore", addr);
} else {
endpoint_state& state = it->second;
logger.debug("Mark Node {} alive after EchoMessage", addr);
real_mark_alive(addr, state);
}
} catch(...) {
logger.warn("Fail to send EchoMessage to {}: {}", id, std::current_exception());
}
_pending_mark_alive_endpoints.erase(addr);
}
// Runs inside seastar::async context
@@ -1188,10 +1209,7 @@ void gossiper::mark_dead(inet_address addr, endpoint_state& local_state) {
// Runs inside seastar::async context
void gossiper::handle_major_state_change(inet_address ep, const endpoint_state& eps) {
std::experimental::optional<endpoint_state> local_ep_state;
if (endpoint_state_map.count(ep) > 0) {
local_ep_state = endpoint_state_map.at(ep);
}
auto eps_old = get_endpoint_state_for_endpoint(ep);
if (!is_dead_state(eps) && !_in_shadow_round) {
if (endpoint_state_map.count(ep)) {
logger.debug("Node {} has restarted, now UP, status = {}", ep, get_gossip_status(eps));
@@ -1202,24 +1220,37 @@ void gossiper::handle_major_state_change(inet_address ep, const endpoint_state&
logger.trace("Adding endpoint state for {}, status = {}", ep, get_gossip_status(eps));
endpoint_state_map[ep] = eps;
auto& ep_state = endpoint_state_map.at(ep);
if (_in_shadow_round) {
// In shadow round, we are only interested in the peer's endpoint_state,
// e.g., gossip features, host_id, tokens. No need to call the
// on_restart or on_join callbacks or to go through the mark alive
// procedure with EchoMessage gossip message. We will do them during
// normal gossip runs anyway.
logger.debug("In shadow round addr={}, eps={}", ep, eps);
return;
}
if (local_ep_state) {
if (eps_old) {
// the node restarted: it is up to the subscriber to take whatever action is necessary
_subscribers.for_each([ep, local_ep_state] (auto& subscriber) {
subscriber->on_restart(ep, *local_ep_state);
_subscribers.for_each([ep, eps_old] (auto& subscriber) {
subscriber->on_restart(ep, *eps_old);
});
}
auto& ep_state = endpoint_state_map.at(ep);
if (!is_dead_state(ep_state)) {
mark_alive(ep, ep_state);
} else {
logger.debug("Not marking {} alive due to dead state {}", ep, get_gossip_status(eps));
mark_dead(ep, ep_state);
}
_subscribers.for_each([ep, ep_state] (auto& subscriber) {
subscriber->on_join(ep, ep_state);
});
auto eps_new = get_endpoint_state_for_endpoint(ep);
if (eps_new) {
_subscribers.for_each([ep, eps_new] (auto& subscriber) {
subscriber->on_join(ep, *eps_new);
});
}
// check this at the end so nodes will learn about the endpoint
if (is_shutdown(ep)) {
mark_as_shutdown(ep);
@@ -1394,9 +1425,11 @@ future<> gossiper::start_gossiping(int generation_nbr, std::map<application_stat
local_state.add_application_state(entry.first, entry.second);
}
auto generation = local_state.get_heart_beat_state().get_generation();
//notify snitches that Gossiper is about to start
return locator::i_endpoint_snitch::get_local_snitch_ptr()->gossiper_starting().then([this, &local_state] {
logger.trace("gossip started with generation {}", local_state.get_heart_beat_state().get_generation());
return locator::i_endpoint_snitch::get_local_snitch_ptr()->gossiper_starting().then([this, generation] {
logger.trace("gossip started with generation {}", generation);
_enabled = true;
_nr_run = 0;
_scheduled_gossip_task.arm(INTERVAL);
@@ -1493,16 +1526,19 @@ future<> gossiper::add_local_application_state(application_state state, versione
logger.error(err.c_str());
throw std::runtime_error(err);
}
endpoint_state& ep_state = gossiper.endpoint_state_map.at(ep_addr);
endpoint_state ep_state_before = gossiper.endpoint_state_map.at(ep_addr);
// Fire "before change" notifications:
gossiper.do_before_change_notifications(ep_addr, ep_state, state, value);
gossiper.do_before_change_notifications(ep_addr, ep_state_before, state, value);
// Notifications may have taken some time, so preventively raise the version
// of the new value, otherwise it could be ignored by the remote node
// if another value with a newer version was received in the meantime:
value = storage_service_value_factory().clone_with_higher_version(value);
// Add to local application state and fire "on change" notifications:
ep_state.add_application_state(state, value);
gossiper.do_on_change_notifications(ep_addr, state, value);
if (gossiper.endpoint_state_map.count(ep_addr)) {
auto& ep_state = gossiper.endpoint_state_map.at(ep_addr);
ep_state.add_application_state(state, value);
gossiper.do_on_change_notifications(ep_addr, state, value);
}
}).handle_exception([] (auto ep) {
logger.warn("Fail to apply application_state: {}", ep);
});
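The fixes in this file all follow one pattern: after any cross-fiber gap (an RPC `.get()`, a deferred continuation), re-look the endpoint up in `endpoint_state_map` instead of reusing a reference captured before the gap. A minimal stand-alone sketch of that pattern, assuming a plain `std::unordered_map` and a hypothetical `suspend` callback in place of Seastar's machinery:

```cpp
#include <cassert>
#include <functional>
#include <string>
#include <unordered_map>

struct endpoint_state { bool alive = false; };

using state_map = std::unordered_map<std::string, endpoint_state>;

// Safe variant, mirroring the diff: while `suspend` runs (standing in for
// send_gossip_echo().get()), other fibers may erase entries from the map,
// so a reference taken before it would dangle. Re-find the entry after the
// suspension and act only if it is still present.
bool mark_alive_safe(state_map& m, const std::string& addr,
                     const std::function<void()>& suspend) {
    suspend();                  // cross-fiber gap: the map may change here
    auto it = m.find(addr);     // re-lookup instead of reusing an old reference
    if (it == m.end()) {
        return false;           // node vanished while we waited
    }
    it->second.alive = true;
    return true;
}
```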

View File

@@ -187,6 +187,9 @@ private:
std::set<inet_address> _live_endpoints;
std::list<inet_address> _live_endpoints_just_added;
/* nodes are being marked as alive */
std::unordered_set<inet_address> _pending_mark_alive_endpoints;
/* unreachable member set */
std::map<inet_address, clk::time_point> _unreachable_endpoints;

View File

@@ -277,6 +277,9 @@ def is_optional(lst):
created_writers = set()
def get_member_name(name):
return name if not name.endswith('()') else name[:-2]
def get_members(cls):
return [p for p in cls["members"] if not is_class(p) and not is_enum(p)]
@@ -456,18 +459,19 @@ def add_param_writer_object(name, base_state, typ, var_type = "", var_index = No
def add_param_write(current, base_state, vector = False, root_node = False):
typ = current["type"]
res = ""
name = get_member_name(current["name"])
if is_basic_type(typ):
res = res + add_param_writer_basic_type(current["name"], base_state, typ)
res = res + add_param_writer_basic_type(name, base_state, typ)
elif is_optional(typ):
res = res + Template(reindent(4, """
after_${basestate}__$name<Output> skip_$name() && {
serialize(_out, false);
return { _out, std::move(_state) };
}""")).substitute({'type': param_type(typ), 'name': current["name"], 'basestate' : base_state})
}""")).substitute({'type': param_type(typ), 'name': name, 'basestate' : base_state})
if is_basic_type(typ[1][0]):
res = res + add_param_writer_basic_type(current["name"], base_state, typ[1][0], "", "true")
res = res + add_param_writer_basic_type(name, base_state, typ[1][0], "", "true")
elif is_local_type(typ[1][0]):
res = res + add_param_writer_object(current["name"], base_state[0][1], typ, "", "true")
res = res + add_param_writer_object(name, base_state[0][1], typ, "", "true")
else:
print("non supported optional type ", type[0][1])
elif is_vector(typ):
@@ -482,18 +486,18 @@ def add_param_write(current, base_state, vector = False, root_node = False):
$set
return { _out, std::move(_state) };
}
""").substitute({'type': param_type(typ), 'name': current["name"], 'basestate' : base_state, 'set' : set_size})
""").substitute({'type': param_type(typ), 'name': name, 'basestate' : base_state, 'set' : set_size})
elif is_local_type(typ):
res = res + add_param_writer_object(current["name"], base_state, typ)
res = res + add_param_writer_object(name, base_state, typ)
elif is_variant(typ):
for idx, p in enumerate(typ[1]):
if is_basic_type(p):
varient_type = param_type(p)
res = res + add_param_writer_basic_type(current["name"], base_state, varient_type,"_" + varient_type, idx, root_node)
res = res + add_param_writer_basic_type(name, base_state, varient_type,"_" + varient_type, idx, root_node)
elif is_variant(p):
res = res + add_param_writer_object(current["name"], base_state, p, '_' + "variant", idx, root_node)
res = res + add_param_writer_object(name, base_state, p, '_' + "variant", idx, root_node)
elif is_local_type(p):
res = res + add_param_writer_object(current["name"], base_state, p, '_' + param_type(p), idx, root_node)
res = res + add_param_writer_object(name, base_state, p, '_' + param_type(p), idx, root_node)
else:
print ("something is wrong with type", typ)
return res;
@@ -658,7 +662,7 @@ def handle_visitors_nodes(info, hout, variant_node = False, clases = []):
if not members:
add_node(hout, base_state_name, None, base_state_name, prefix, parents, add_end_method(parents, current_name, variant_node, clases), False, is_final(cls))
return
add_node(hout, base_state_name + "__" + members[-1]["name"], members[-1]["type"], base_state_name, "after_", base_state_name, add_end_method(parents, current_name, variant_node, clases))
add_node(hout, base_state_name + "__" + get_member_name(members[-1]["name"]), members[-1]["type"], base_state_name, "after_", base_state_name, add_end_method(parents, current_name, variant_node, clases))
# Create writer and reader for include class
if not variant_node:
for member in get_dependency(cls):
@@ -666,9 +670,9 @@ def handle_visitors_nodes(info, hout, variant_node = False, clases = []):
for ind in reversed(range(1, len(members))):
member = members[ind]
add_nodes_when_needed(hout, info, member, base_state_name, parents, member_classes)
variant_state = base_state_name + "__" + member["name"] if is_variant(member["type"]) else base_state_name
variant_state = base_state_name + "__" + get_member_name(member["name"]) if is_variant(member["type"]) else base_state_name
is_param_vector = is_vector(member["type"]) and is_basic_type(member["type"][1][0])
add_node(hout, base_state_name + "__" + members[ind - 1]["name"], member["type"], variant_state, "after_", base_state_name, add_param_write(member, base_state_name), False)
add_node(hout, base_state_name + "__" + get_member_name(members[ind - 1]["name"]), member["type"], variant_state, "after_", base_state_name, add_param_write(member, base_state_name), False)
member = members[0]
is_param_vector = is_vector(member["type"]) and is_basic_type(member["type"][1][0])
add_nodes_when_needed(hout, info, member, base_state_name, parents, member_classes)
@@ -790,7 +794,7 @@ def add_view(hout, info):
return deserialize(in, boost::type<$type>());
});
}
""")).substitute({'name' : m["name"], 'type' : full_type, 'skip' : skip}))
""")).substitute({'name' : get_member_name(m["name"]), 'type' : full_type, 'skip' : skip}))
skip = skip + Template("\n ser::skip(in, boost::type<${type}>());").substitute({'type': full_type})

View File

@@ -19,7 +19,7 @@
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
class commitlog_entry {
class commitlog_entry [[writable]] {
std::experimental::optional<column_mapping> mapping();
frozen_mutation mutation();
};
};

View File

@@ -65,17 +65,15 @@ future<> memtable::clear_gently() noexcept {
auto t = std::make_unique<seastar::thread>(attr, [this] {
auto& alloc = allocator();
// entries can no longer be moved after unlink_leftmost_without_rebalance()
// so need to disable compaction.
logalloc::reclaim_lock rl(*this);
auto p = std::move(partitions);
while (!p.empty()) {
auto batch_size = std::min<size_t>(p.size(), 32);
auto dirty_before = dirty_size();
with_allocator(alloc, [&] () noexcept {
while (batch_size--) {
alloc.destroy(p.unlink_leftmost_without_rebalance());
p.erase_and_dispose(p.begin(), [&] (auto e) {
alloc.destroy(e);
});
}
});
remove_flushed_memory(dirty_before - dirty_size());
@@ -205,19 +203,23 @@ protected:
, _range(&range)
{ }
memtable_entry* fetch_next_entry() {
memtable_entry* fetch_entry() {
update_iterators();
if (_i == _end) {
return nullptr;
} else {
memtable_entry& e = *_i;
++_i;
_last = e.key();
_memtable->upgrade_entry(e);
return &e;
}
}
void advance() {
memtable_entry& e = *_i;
_last = e.key();
++_i;
}
logalloc::allocating_section& read_section() {
return _memtable->_read_section;
}
@@ -287,14 +289,18 @@ public:
return _delegate();
}
logalloc::reclaim_lock _(region());
managed_bytes::linearization_context_guard lcg;
memtable_entry* e = fetch_next_entry();
if (!e) {
return make_ready_future<streamed_mutation_opt>(stdx::nullopt);
} else {
return make_ready_future<streamed_mutation_opt>(e->read(mtbl(), schema(), _slice));
}
return read_section()(region(), [&] {
return with_linearized_managed_bytes([&] {
memtable_entry* e = fetch_entry();
if (!e) {
return make_ready_future<streamed_mutation_opt>(stdx::nullopt);
} else {
auto ret = make_ready_future<streamed_mutation_opt>(e->read(mtbl(), schema(), _slice));
advance();
return ret;
}
});
});
}
};
@@ -391,19 +397,24 @@ public:
flush_reader& operator=(const flush_reader&) = delete;
virtual future<streamed_mutation_opt> operator()() override {
logalloc::reclaim_lock _(region());
managed_bytes::linearization_context_guard lcg;
memtable_entry* e = fetch_next_entry();
if (!e) {
return make_ready_future<streamed_mutation_opt>(stdx::nullopt);
} else {
auto cr = query::clustering_key_filter_ranges::get_ranges(*schema(), query::full_slice, e->key().key());
auto snp = e->partition().read(schema());
auto mpsr = make_partition_snapshot_reader<partition_snapshot_accounter>(schema(), e->key(), std::move(cr), snp, region(), read_section(), mtbl(), _flushed_memory);
_flushed_memory.account_component(*e);
_flushed_memory.account_component(*snp);
return make_ready_future<streamed_mutation_opt>(std::move(mpsr));
}
return read_section()(region(), [&] {
return with_linearized_managed_bytes([&] {
memtable_entry* e = fetch_entry();
if (!e) {
return make_ready_future<streamed_mutation_opt>(stdx::nullopt);
} else {
auto cr = query::clustering_key_filter_ranges::get_ranges(*schema(), query::full_slice, e->key().key());
auto snp = e->partition().read(schema());
auto mpsr = make_partition_snapshot_reader<partition_snapshot_accounter>(schema(), e->key(), std::move(cr),
snp, region(), read_section(), mtbl(), _flushed_memory);
_flushed_memory.account_component(*e);
_flushed_memory.account_component(*snp);
auto ret = make_ready_future<streamed_mutation_opt>(std::move(mpsr));
advance();
return ret;
}
});
});
}
};
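The `clear_gently` hunk above replaces unlinking and destroying one node at a time with `erase_and_dispose` over batches of up to 32 entries. The bounded-batch idea, reduced to a standard-library sketch (a `std::list` stand-in; the real code disposes entries through the LSA allocator and yields to the reactor between batches):

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <iterator>
#include <list>

// Clear a container in bounded batches so a single clear cannot stall the
// thread; returns how many batches (i.e. yield points) were needed.
template<typename T>
std::size_t clear_in_batches(std::list<T>& p, std::size_t batch_size = 32) {
    std::size_t batches = 0;
    while (!p.empty()) {
        auto n = std::min<std::size_t>(p.size(), batch_size);
        auto last = std::next(p.begin(), n);
        p.erase(p.begin(), last);   // stands in for erase_and_dispose(...)
        ++batches;                  // the real code yields to the scheduler here
    }
    return batches;
}
```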

View File

@@ -274,7 +274,13 @@ void messaging_service::start_listen() {
if (listen_to_bc) {
_server_tls[1] = listen(utils::fb_utilities::get_broadcast_address());
}
}
// Do this on just cpu 0, to avoid duplicate logs.
if (engine().cpu_id() == 0) {
if (_server_tls[0]) {
logger.info("Starting Encrypted Messaging Service on SSL port {}", _ssl_port);
}
logger.info("Starting Messaging Service on port {}", _port);
}
}
@@ -308,14 +314,6 @@ messaging_service::messaging_service(gms::inet_address ip
if (listen_now) {
start_listen();
}
// Do this on just cpu 0, to avoid duplicate logs.
if (engine().cpu_id() == 0) {
if (_server_tls[0]) {
logger.info("Starting Encrypted Messaging Service on SSL port {}", _ssl_port);
}
logger.info("Starting Messaging Service on port {}", _port);
}
}
msg_addr messaging_service::get_source(const rpc::client_info& cinfo) {

View File

@@ -123,7 +123,7 @@ public:
uint32_t partition_limit, CompactedMutationsConsumer consumer)
: _schema(s)
, _query_time(query_time)
, _gc_before(query_time - s.gc_grace_seconds())
, _gc_before(saturating_subtract(query_time, s.gc_grace_seconds()))
, _can_gc(always_gc)
, _slice(slice)
, _row_limit(limit)
@@ -139,7 +139,7 @@ public:
std::function<api::timestamp_type(const dht::decorated_key&)> get_max_purgeable)
: _schema(s)
, _query_time(compaction_time)
, _gc_before(_query_time - s.gc_grace_seconds())
, _gc_before(saturating_subtract(_query_time, s.gc_grace_seconds()))
, _get_max_purgeable(std::move(get_max_purgeable))
, _can_gc([this] (tombstone t) { return can_gc(t); })
, _slice(query::full_slice)

@@ -1183,7 +1183,7 @@ uint32_t mutation_partition::do_compact(const schema& s,
{
assert(row_limit > 0);
auto gc_before = query_time - s.gc_grace_seconds();
auto gc_before = saturating_subtract(query_time, s.gc_grace_seconds());
auto should_purge_tombstone = [&] (const tombstone& t) {
return t.deletion_time < gc_before && can_gc(t);
@@ -1526,12 +1526,19 @@ bool row::compact_and_expire(const schema& s, column_kind kind, tombstone tomb,
const column_definition& def = s.column_at(kind, id);
if (def.is_atomic()) {
atomic_cell_view cell = c.as_atomic_cell();
auto can_erase_cell = [&] {
return cell.deletion_time() < gc_before && can_gc(tombstone(cell.timestamp(), cell.deletion_time()));
};
if (cell.is_covered_by(tomb, def.is_counter())) {
erase = true;
} else if (cell.has_expired(query_time)) {
c = atomic_cell::make_dead(cell.timestamp(), cell.deletion_time());
erase = can_erase_cell();
if (!erase) {
c = atomic_cell::make_dead(cell.timestamp(), cell.deletion_time());
}
} else if (!cell.is_live()) {
erase = cell.deletion_time() < gc_before && can_gc(tombstone(cell.timestamp(), cell.deletion_time()));
erase = can_erase_cell();
} else {
any_live |= true;
}

@@ -345,7 +345,7 @@ public:
: _w(std::move(w))
, _row_count(c)
, _short_read(sr)
, _memory_tracker(std::move(_memory_tracker))
, _memory_tracker(std::move(memory_tracker))
, _partition_count(pc)
{
w.reduce_chunk_count();

@@ -601,13 +601,13 @@ private:
struct built_in_ : std_ {};
template<typename Range, typename LessComparator,
typename = decltype(&std::remove_reference<Range>::type::lower_bound)>
typename = decltype(std::declval<Range>().lower_bound(std::declval<T>(), std::declval<LessComparator>()))>
typename std::remove_reference<Range>::type::const_iterator do_lower_bound(const T& value, Range&& r, LessComparator&& cmp, built_in_) const {
return r.lower_bound(value, std::forward<LessComparator>(cmp));
}
template<typename Range, typename LessComparator,
typename = decltype(&std::remove_reference<Range>::type::upper_bound)>
typename = decltype(std::declval<Range>().upper_bound(std::declval<T>(), std::declval<LessComparator>()))>
typename std::remove_reference<Range>::type::const_iterator do_upper_bound(const T& value, Range&& r, LessComparator&& cmp, built_in_) const {
return r.upper_bound(value, std::forward<LessComparator>(cmp));
}
@@ -649,6 +649,21 @@ public:
return boost::make_iterator_range(lower_bound(range, cmp), upper_bound(range, cmp));
}
// Returns the intersection between this range and other.
template<typename Comparator>
stdx::optional<nonwrapping_range> intersection(const nonwrapping_range& other, Comparator&& cmp) const {
auto p = std::minmax(_range, other._range, [&cmp] (auto&& a, auto&& b) {
return wrapping_range<T>::less_than(a.start_bound(), b.start_bound(), cmp);
});
if (wrapping_range<T>::greater_than_or_equal(p.first.end_bound(), p.second.start_bound(), cmp)) {
auto end = std::min(p.first.end_bound(), p.second.end_bound(), [&cmp] (auto&& a, auto&& b) {
return !wrapping_range<T>::greater_than_or_equal(a, b, cmp);
});
return nonwrapping_range(p.second.start(), end.b);
}
return {};
}
template<typename U>
friend std::ostream& operator<<(std::ostream& out, const nonwrapping_range<U>& r);
};

repair/range_split.hh (new file, 76 lines)

@@ -0,0 +1,76 @@
/*
* Copyright (C) 2017 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <stack>
#include "dht/i_partitioner.hh"
// range_splitter(r, N, K) is a helper for splitting a given token_range r of
// estimated size N into many small ranges of size K, and later iterating
// over those small ranges once with the has_next() and next() methods.
// This implementation assumes only the availability of a range::midpoint()
// operation, and as a result creates ranges with sizes between K/2 and K.
// Moreover, its memory requirement is log(N). With more general arithmetic
// support over tokens, we could get exactly K and O(1) memory.
class range_splitter {
std::stack<std::pair<::dht::token_range, float>> _stack;
uint64_t _desired;
public:
range_splitter(::dht::token_range r, uint64_t N, uint64_t K) {
_stack.push({r, N});
_desired = K;
}
bool has_next() const {
return !_stack.empty();
}
::dht::token_range next() {
// If the head range's estimated size is small enough, return it.
// Otherwise split it to two halves, push the second half on the
// stack, and repeat with the first half. May need to do this more
// than once (up to log(N/K) times) until we have one range small
// enough to return.
assert(!_stack.empty());
auto range = _stack.top().first;
auto size = _stack.top().second;
_stack.pop();
while (size > _desired) {
// The use of minimum_token() here twice is not a typo - because wrap-
// around token ranges are supported by midpoint(), the beyond-maximum
// token can also be represented by minimum_token().
auto midpoint = dht::global_partitioner().midpoint(
range.start() ? range.start()->value() : dht::minimum_token(),
range.end() ? range.end()->value() : dht::minimum_token());
// This shouldn't happen, but if the range included just one token, we
// can't split it further (split() may actually fail with an assertion failure).
if ((range.start() && midpoint == range.start()->value()) ||
(range.end() && midpoint == range.end()->value())) {
return range;
}
auto halves = range.split(midpoint, dht::token_comparator());
_stack.push({halves.second, size / 2.0});
range = halves.first;
size /= 2.0;
}
return range;
}
};

@@ -20,6 +20,7 @@
*/
#include "repair.hh"
#include "range_split.hh"
#include "streaming/stream_plan.hh"
#include "streaming/stream_state.hh"
@@ -55,12 +56,19 @@ public:
std::vector<sstring> data_centers;
std::vector<sstring> hosts;
std::vector<failed_range> failed_ranges;
streaming::stream_plan sp_in;
streaming::stream_plan sp_out;
// Map of peer -> <cf, ranges>
std::unordered_map<gms::inet_address, std::unordered_map<sstring, dht::token_range_vector>> ranges_need_repair_in;
std::unordered_map<gms::inet_address, std::unordered_map<sstring, dht::token_range_vector>> ranges_need_repair_out;
// FIXME: this "100" needs to be a parameter.
uint64_t target_partitions = 100;
// FIXME: this "10 * 1024 * 1024" needs to be a parameter.
size_t sub_ranges_max = 10 * 1024 * 1024;
// This affects how many ranges we put in a stream plan. The more ranges,
// the more memory we use to store them. However, larger batches can reduce
// the total number of stream_plans we use for the repair.
size_t sub_ranges_to_stream = 1 * 1024;
size_t sp_index = 0;
size_t current_sub_ranges_nr_in = 0;
size_t current_sub_ranges_nr_out = 0;
int ranges_index = 0;
public:
repair_info(seastar::sharded<database>& db_,
const sstring& keyspace_,
@@ -75,14 +83,45 @@ public:
, cfs(cfs_)
, id(id_)
, data_centers(data_centers_)
, hosts(hosts_)
, sp_in(streaming::stream_plan(sprint("repair-in-%d", id)))
, sp_out(streaming::stream_plan(sprint("repair-out-%d", id))) {
, hosts(hosts_) {
}
future<> do_streaming() {
return sp_in.execute().discard_result().then([this] {
return sp_out.execute().discard_result();
size_t ranges_in = 0;
size_t ranges_out = 0;
auto sp_in = make_lw_shared<streaming::stream_plan>(sprint("repair-in-%d-index-%d", id, sp_index));
auto sp_out = make_lw_shared<streaming::stream_plan>(sprint("repair-out-%d-index-%d", id, sp_index));
for (auto& x : ranges_need_repair_in) {
auto& peer = x.first;
for (auto& y : x.second) {
auto& cf = y.first;
auto& stream_ranges = y.second;
ranges_in += stream_ranges.size();
sp_in->request_ranges(peer, keyspace, std::move(stream_ranges), {cf});
}
}
ranges_need_repair_in.clear();
current_sub_ranges_nr_in = 0;
for (auto& x : ranges_need_repair_out) {
auto& peer = x.first;
for (auto& y : x.second) {
auto& cf = y.first;
auto& stream_ranges = y.second;
ranges_out += stream_ranges.size();
sp_out->transfer_ranges(peer, keyspace, std::move(stream_ranges), {cf});
}
}
ranges_need_repair_out.clear();
current_sub_ranges_nr_out = 0;
if (ranges_in || ranges_out) {
logger.info("Start streaming for repair {} index {}, ranges_in={}, ranges_out={}", id, sp_index, ranges_in, ranges_out);
}
sp_index++;
return sp_in->execute().discard_result().then([sp_in, sp_out] {
return sp_out->execute().discard_result();
}).handle_exception([] (auto ep) {
logger.warn("repair's stream failed: {}", ep);
return make_exception_future(ep);
@@ -93,23 +132,29 @@ public:
logger.info("repair {} completed successfully", id);
return true;
} else {
for (auto& frange: failed_ranges) {
logger.debug("repair cf {} range {} failed", frange.cf, frange.range);
}
logger.info("repair {} failed - {} ranges failed", id, failed_ranges.size());
for (auto& frange: failed_ranges) {
logger.info("repair cf {} range {} failed", frange.cf, frange.range);
}
return false;
}
}
void request_transfer_ranges(const sstring& cf,
future<> request_transfer_ranges(const sstring& cf,
const ::dht::token_range& range,
const std::vector<gms::inet_address>& neighbors_in,
const std::vector<gms::inet_address>& neighbors_out) {
for (const auto& peer : neighbors_in) {
sp_in.request_ranges(peer, keyspace, {range}, {cf});
ranges_need_repair_in[peer][cf].emplace_back(range);
current_sub_ranges_nr_in++;
}
for (const auto& peer : neighbors_out) {
sp_out.transfer_ranges(peer, keyspace, {range}, {cf});
ranges_need_repair_out[peer][cf].emplace_back(range);
current_sub_ranges_nr_out++;
}
if (current_sub_ranges_nr_in >= sub_ranges_to_stream || current_sub_ranges_nr_out >= sub_ranges_to_stream) {
return do_streaming();
}
return make_ready_future<>();
}
};
@@ -478,24 +523,6 @@ future<partition_checksum> checksum_range(seastar::sharded<database> &db,
});
}
static void split_and_add(std::vector<::dht::token_range>& ranges,
const dht::token_range& range,
uint64_t estimated_partitions, uint64_t target_partitions) {
if (estimated_partitions < target_partitions) {
// We're done, the range is small enough to not be split further
ranges.push_back(range);
return;
}
// The use of minimum_token() here twice is not a typo - because wrap-
// around token ranges are supported by midpoint(), the beyond-maximum
// token can also be represented by minimum_token().
auto midpoint = dht::global_partitioner().midpoint(
range.start() ? range.start()->value() : dht::minimum_token(),
range.end() ? range.end()->value() : dht::minimum_token());
auto halves = range.split(midpoint, dht::token_comparator());
ranges.push_back(halves.first);
ranges.push_back(halves.second);
}
// We don't need to wait for one checksum to finish before we start the
// next, but doing too many of these operations in parallel also doesn't
// make sense, so we limit the number of concurrent ongoing checksum
@@ -512,52 +539,46 @@ static void split_and_add(std::vector<::dht::token_range>& ranges,
constexpr int parallelism = 100;
static thread_local semaphore parallelism_semaphore(parallelism);
static future<uint64_t> estimate_partitions(seastar::sharded<database>& db, const sstring& keyspace,
const sstring& cf, const dht::token_range& range) {
return db.map_reduce0(
[keyspace, cf, range] (auto& db) {
// FIXME: column_family should have a method to estimate the number of
// partitions (and of course it should use cardinality estimation bitmaps,
// not trivial sum). We shouldn't have this ugly code here...
// FIXME: If sstables are shared, they will be accounted more than
// once. However, shared sstables should exist for a short-time only.
auto sstables = db.find_column_family(keyspace, cf).get_sstables();
return boost::accumulate(*sstables, uint64_t(0),
[&range] (uint64_t x, auto&& sst) { return x + sst->estimated_keys_for_range(range); });
},
uint64_t(0),
std::plus<uint64_t>()
);
}
// Repair a single cf in a single local range.
// Comparable to RepairJob in Origin.
static future<> repair_cf_range(repair_info& ri,
sstring cf, ::dht::token_range range,
const std::vector<gms::inet_address>& neighbors) {
ri.ranges_index++;
if (neighbors.empty()) {
// Nothing to do in this case...
return make_ready_future<>();
}
std::vector<::dht::token_range> ranges;
ranges.push_back(range);
// Additionally, we want to break up large ranges so they will have
// (approximately) a desired number of rows each.
// FIXME: column_family should have a method to estimate the number of
// partitions (and of course it should use cardinality estimation bitmaps,
// not trivial sum). We shouldn't have this ugly code here...
auto sstables = ri.db.local().find_column_family(ri.keyspace, cf).get_sstables();
uint64_t estimated_partitions = 0;
for (auto sst : *sstables) {
estimated_partitions += sst->estimated_keys_for_range(range);
}
// FIXME: we should have an on-the-fly iterator generator here, not
// fill a vector in advance.
std::vector<::dht::token_range> tosplit;
while (estimated_partitions > ri.target_partitions) {
tosplit.clear();
ranges.swap(tosplit);
for (const auto& range : tosplit) {
split_and_add(ranges, range, estimated_partitions, ri.target_partitions);
}
estimated_partitions /= 2;
if (ranges.size() >= ri.sub_ranges_max) {
break;
}
}
logger.debug("target_partitions={}, estimated_partitions={}, ranges.size={}, range={} -> ranges={}",
ri.target_partitions, estimated_partitions, ranges.size(), range, ranges);
return estimate_partitions(ri.db, ri.keyspace, cf, range).then([&ri, cf, range, &neighbors] (uint64_t estimated_partitions) {
range_splitter ranges(range, estimated_partitions, ri.target_partitions);
logger.info("Repair {} out of {} ranges, id={}, keyspace={}, cf={}, range={}, target_partitions={}, estimated_partitions={}",
ri.ranges_index, ri.ranges.size(), ri.id, ri.keyspace, cf, range, ri.target_partitions, estimated_partitions);
return do_with(seastar::gate(), true, std::move(cf), std::move(ranges),
[&ri, &neighbors] (auto& completion, auto& success, const auto& cf, auto& ranges) {
return do_for_each(ranges, [&ri, &completion, &success, &neighbors, &cf] (const auto& range) {
return do_until([&ranges] () { return !ranges.has_next(); },
[&ranges, &ri, &completion, &success, &neighbors, &cf] () {
auto range = ranges.next();
check_in_shutdown();
return parallelism_semaphore.wait(1).then([&ri, &completion, &success, &neighbors, &cf, &range] {
return parallelism_semaphore.wait(1).then([&ri, &completion, &success, &neighbors, &cf, range] {
auto checksum_type = service::get_local_storage_service().cluster_supports_large_partitions()
? repair_checksum::streamed : repair_checksum::legacy;
@@ -575,7 +596,7 @@ static future<> repair_cf_range(repair_info& ri,
completion.enter();
when_all(checksums.begin(), checksums.end()).then(
[&ri, &cf, &range, &neighbors, &success]
[&ri, &cf, range, &neighbors, &success]
(std::vector<future<partition_checksum>> checksums) {
// If only some of the replicas of this range are alive,
// we set success=false so repair will fail, but we can
@@ -694,13 +715,12 @@ static future<> repair_cf_range(repair_info& ri,
}
}
if (!(live_neighbors_in.empty() && live_neighbors_out.empty())) {
logger.info("Found differing range {} on nodes {}, in = {}, out = {}", range,
logger.debug("Found differing range {} on nodes {}, in = {}, out = {}", range,
live_neighbors, live_neighbors_in, live_neighbors_out);
ri.request_transfer_ranges(cf, range, live_neighbors_in, live_neighbors_out);
return make_ready_future<>();
return ri.request_transfer_ranges(cf, range, live_neighbors_in, live_neighbors_out);
}
return make_ready_future<>();
}).handle_exception([&ri, &success, &cf, &range] (std::exception_ptr eptr) {
}).handle_exception([&ri, &success, &cf, range] (std::exception_ptr eptr) {
// Something above (e.g., request_transfer_ranges) failed. We could
// stop the repair immediately, or let it continue with
// other ranges (at the moment, we do the latter). But in
@@ -727,6 +747,7 @@ static future<> repair_cf_range(repair_info& ri,
});
});
});
});
}
// Repair a single local range, multiple column families.
@@ -945,7 +966,7 @@ private:
// same nodes as replicas.
static future<> repair_ranges(repair_info ri) {
return do_with(std::move(ri), [] (auto& ri) {
#if 1
#if 0
// repair all the ranges in parallel
return parallel_for_each(ri.ranges, [&ri] (auto&& range) {
#else
@@ -955,6 +976,8 @@ static future<> repair_ranges(repair_info ri) {
check_in_shutdown();
return repair_range(ri, range);
}).then([&ri] {
// Do streaming for the remaining ranges we did not stream in
// repair_cf_range.
return ri.do_streaming();
}).then([&ri] {
repair_tracker.done(ri.id, ri.check_failed_ranges());

@@ -1023,12 +1023,13 @@ future<streamed_mutation_opt> cache_entry::read_wide(row_cache& rc, schema_ptr s
: _range(std::move(pr))
, _reader(rc._underlying(s, _range, slice, pc))
{ }
range_and_underlyig_reader(range_and_underlyig_reader&&) = delete;
};
rc._tracker.on_uncached_wide_partition();
auto pr = dht::partition_range::make_singular(_key);
return do_with(range_and_underlyig_reader(rc, s, std::move(pr), slice, pc), [] (auto& r_a_ur) {
return r_a_ur._reader();
});
auto rd_ptr = std::make_unique<range_and_underlyig_reader>(rc, s, std::move(pr), slice, pc);
auto& r_a_ur = *rd_ptr;
return r_a_ur._reader().finally([rd_ptr = std::move(rd_ptr)] {});
}
streamed_mutation cache_entry::read(row_cache& rc, const schema_ptr& s) {

@@ -145,6 +145,20 @@ void schema::rebuild() {
thrift()._compound = is_compound();
thrift()._is_dynamic = clustering_key_size() > 0;
if (default_validator()->is_counter()) {
for (auto&& cdef : boost::range::join(static_columns(), regular_columns())) {
if (!cdef.type->is_counter()) {
throw exceptions::configuration_exception(sprint("Cannot add a non counter column (%s) in a counter column family", cdef.name_as_text()));
}
}
} else {
for (auto&& cdef : all_columns()) {
if (cdef.second->type->is_counter()) {
throw exceptions::configuration_exception(sprint("Cannot add a counter column (%s) in a non counter column family", cdef.second->name_as_text()));
}
}
}
}
const column_mapping& schema::get_column_mapping() const {

@@ -31,6 +31,8 @@ import os
import sys
import subprocess
import uuid
import re
import glob
from pkg_resources import parse_version
VERSION = "1.0"
@@ -69,6 +71,20 @@ def create_uuid_file(fl):
with open(args.uuid_file, 'w') as myfile:
myfile.write(str(uuid.uuid1()) + "\n")
def get_repo_file(dir):
files = glob.glob(dir)
files.sort(key=os.path.getmtime, reverse=True)
for name in files:
with open(name, 'r') as myfile:
for line in myfile:
match = re.search(".*http.?://.*/scylladb/([^/\s]+)/deb/([^/\s]+)\s.*", line)
if match:
return match.group(2), match.group(1)
match = re.search(".*http.?://.*/scylladb/([^/]+)/rpm/[^/]+/([^/\s]+)/.*", line)
if match:
return match.group(2), match.group(1)
return None, None
def check_version(ar):
if config and (not config.has_option("housekeeping", "check-version") or not config.getboolean("housekeeping", "check-version")):
return
@@ -87,6 +103,10 @@ def check_version(ar):
params = params + "&sts=" + ar.mode
if uid:
params = params + "&uu=" + uid
if repo_id:
params = params + "&rid=" + repo_id
if repo_type:
params = params + "&rtype=" + repo_type
latest_version = get_json_from_url(version_url + params)["version"]
except:
traceln("Unable to retrieve version information")
@@ -99,6 +119,7 @@ parser.add_argument('-q', '--quiet', action='store_true', default=False, help='Q
parser.add_argument('-c', '--config', default="", help='An optional config file. Specifying a missing file will terminate the script')
parser.add_argument('--uuid', default="", help='A uuid for the requests')
parser.add_argument('--uuid-file', default="", help='A uuid file for the requests')
parser.add_argument('--repo-files', default="", help='The repository files used for private repositories')
subparsers = parser.add_subparsers(help='Available commands')
parser_help = subparsers.add_parser('help', help='Display help information')
@@ -111,6 +132,9 @@ parser_system.set_defaults(func=check_version)
args = parser.parse_args()
quiet = args.quiet
config = None
repo_id = None
repo_type = None
if args.config != "":
if not os.path.isfile(args.config):
traceln("Config file ", args.config, " is missing, terminating")
@@ -125,4 +149,6 @@ if args.uuid_file != "":
create_uuid_file(args.uuid_file)
with open(args.uuid_file, 'r') as myfile:
uid = myfile.read().replace('\n', '')
if args.repo_files != "":
repo_type, repo_id = get_repo_file(args.repo_files)
args.func(args)

Submodule seastar updated: f07f8ed68d...8e2f62968f

@@ -115,4 +115,42 @@ inline frame<seastar::measuring_output_stream> start_frame(seastar::measuring_ou
return { };
}
template<>
class place_holder<seastar::simple_output_stream> {
seastar::simple_output_stream _substream;
public:
place_holder(seastar::simple_output_stream substream)
: _substream(substream) { }
void set(seastar::simple_output_stream& out, size_type v) {
serialize(_substream, v);
}
};
template<>
class frame<seastar::simple_output_stream> : public place_holder<seastar::simple_output_stream> {
char* _start;
public:
frame(seastar::simple_output_stream ph, char* start)
: place_holder(ph), _start(start) { }
void end(seastar::simple_output_stream& out) {
set(out, out.begin() - _start);
}
};
inline place_holder<seastar::simple_output_stream> start_place_holder(seastar::simple_output_stream& out) {
return { out.write_substream(sizeof(size_type)) };
}
inline frame<seastar::simple_output_stream> start_frame(seastar::simple_output_stream& out) {
auto start = out.begin();
auto substream = out.write_substream(sizeof(size_type));
{
auto sstr = substream;
serialize(sstr, size_type(0));
}
return frame<seastar::simple_output_stream>(substream, start);
}
}

@@ -481,8 +481,10 @@ future<> migration_manager::announce_new_column_family(schema_ptr cfm, bool anno
throw exceptions::already_exists_exception(cfm->ks_name(), cfm->cf_name());
}
logger.info("Create new ColumnFamily: {}", cfm);
auto mutations = db::schema_tables::make_create_table_mutations(keyspace.metadata(), cfm, api::new_timestamp());
return announce(std::move(mutations), announce_locally);
return db::schema_tables::make_create_table_mutations(keyspace.metadata(), cfm, api::new_timestamp())
.then([announce_locally, this] (auto&& mutations) {
return announce(std::move(mutations), announce_locally);
});
} catch (const no_such_keyspace& e) {
throw exceptions::configuration_exception(sprint("Cannot add table '%s' to non existing keyspace '%s'.", cfm->cf_name(), cfm->ks_name()));
}
@@ -501,8 +503,10 @@ future<> migration_manager::announce_column_family_update(schema_ptr cfm, bool f
#endif
logger.info("Update table '{}.{}' From {} To {}", cfm->ks_name(), cfm->cf_name(), *old_schema, *cfm);
auto&& keyspace = db.find_keyspace(cfm->ks_name());
auto mutations = db::schema_tables::make_update_table_mutations(keyspace.metadata(), old_schema, cfm, api::new_timestamp(), from_thrift);
return announce(std::move(mutations), announce_locally);
return db::schema_tables::make_update_table_mutations(keyspace.metadata(), old_schema, cfm, api::new_timestamp(), from_thrift)
.then([announce_locally] (auto&& mutations) {
return announce(std::move(mutations), announce_locally);
});
} catch (const no_such_column_family& e) {
throw exceptions::configuration_exception(sprint("Cannot update non existing table '%s' in keyspace '%s'.",
cfm->cf_name(), cfm->ks_name()));
@@ -512,8 +516,10 @@ future<> migration_manager::announce_column_family_update(schema_ptr cfm, bool f
static future<> do_announce_new_type(user_type new_type, bool announce_locally) {
auto& db = get_local_storage_proxy().get_db().local();
auto&& keyspace = db.find_keyspace(new_type->_keyspace);
auto mutations = db::schema_tables::make_create_type_mutations(keyspace.metadata(), new_type, api::new_timestamp());
return migration_manager::announce(std::move(mutations), announce_locally);
return db::schema_tables::make_create_type_mutations(keyspace.metadata(), new_type, api::new_timestamp())
.then([announce_locally] (auto&& mutations) {
return migration_manager::announce(std::move(mutations), announce_locally);
});
}
future<> migration_manager::announce_new_type(user_type new_type, bool announce_locally) {
@@ -609,8 +615,10 @@ future<> migration_manager::announce_column_family_drop(const sstring& ks_name,
ks_name, ::join(", ", views | boost::adaptors::transformed([](auto&& v) { return v->cf_name(); }))));
}
logger.info("Drop table '{}.{}'", schema->ks_name(), schema->cf_name());
auto mutations = db::schema_tables::make_drop_table_mutations(db.find_keyspace(ks_name).metadata(), schema, api::new_timestamp());
return announce(std::move(mutations), announce_locally);
return db::schema_tables::make_drop_table_mutations(db.find_keyspace(ks_name).metadata(), schema, api::new_timestamp())
.then([announce_locally] (auto&& mutations) {
return announce(std::move(mutations), announce_locally);
});
} catch (const no_such_column_family& e) {
throw exceptions::configuration_exception(sprint("Cannot drop non existing table '%s' in keyspace '%s'.", cf_name, ks_name));
}
@@ -621,8 +629,10 @@ future<> migration_manager::announce_type_drop(user_type dropped_type, bool anno
auto& db = get_local_storage_proxy().get_db().local();
auto&& keyspace = db.find_keyspace(dropped_type->_keyspace);
logger.info("Drop User Type: {}", dropped_type->get_name_as_string());
auto mutations = db::schema_tables::make_drop_type_mutations(keyspace.metadata(), dropped_type, api::new_timestamp());
return announce(std::move(mutations), announce_locally);
return db::schema_tables::make_drop_type_mutations(keyspace.metadata(), dropped_type, api::new_timestamp())
.then([announce_locally] (auto&& mutations) {
return announce(std::move(mutations), announce_locally);
});
}
future<> migration_manager::announce_new_view(view_ptr view, bool announce_locally)
@@ -637,8 +647,10 @@ future<> migration_manager::announce_new_view(view_ptr view, bool announce_local
throw exceptions::already_exists_exception(view->ks_name(), view->cf_name());
}
logger.info("Create new view: {}", view);
auto mutations = db::schema_tables::make_create_view_mutations(keyspace, std::move(view), api::new_timestamp());
return announce(std::move(mutations), announce_locally);
return db::schema_tables::make_create_view_mutations(keyspace, std::move(view), api::new_timestamp())
.then([announce_locally] (auto&& mutations) {
return announce(std::move(mutations), announce_locally);
});
} catch (const no_such_keyspace& e) {
throw exceptions::configuration_exception(sprint("Cannot add view '%s' to non existing keyspace '%s'.", view->cf_name(), view->ks_name()));
}
@@ -660,8 +672,10 @@ future<> migration_manager::announce_view_update(view_ptr view, bool announce_lo
oldCfm.validateCompatility(cfm);
#endif
logger.info("Update view '{}.{}' From {} To {}", view->ks_name(), view->cf_name(), *old_view, *view);
auto mutations = db::schema_tables::make_update_view_mutations(std::move(keyspace), view_ptr(old_view), std::move(view), api::new_timestamp());
return announce(std::move(mutations), announce_locally);
return db::schema_tables::make_update_view_mutations(std::move(keyspace), view_ptr(old_view), std::move(view), api::new_timestamp())
.then([announce_locally] (auto&& mutations) {
return announce(std::move(mutations), announce_locally);
});
} catch (const std::out_of_range& e) {
throw exceptions::configuration_exception(sprint("Cannot update non existing materialized view '%s' in keyspace '%s'.",
view->cf_name(), view->ks_name()));
@@ -680,8 +694,10 @@ future<> migration_manager::announce_view_drop(const sstring& ks_name,
}
auto keyspace = db.find_keyspace(ks_name).metadata();
logger.info("Drop view '{}.{}'", view->ks_name(), view->cf_name());
auto mutations = db::schema_tables::make_drop_view_mutations(std::move(keyspace), view_ptr(std::move(view)), api::new_timestamp());
return announce(std::move(mutations), announce_locally);
return db::schema_tables::make_drop_view_mutations(std::move(keyspace), view_ptr(std::move(view)), api::new_timestamp())
.then([announce_locally] (auto&& mutations) {
return announce(std::move(mutations), announce_locally);
});
} catch (const no_such_column_family& e) {
throw exceptions::configuration_exception(sprint("Cannot drop non existing materialized view '%s' in keyspace '%s'.",
cf_name, ks_name));

@@ -478,7 +478,6 @@ inline uint64_t& storage_proxy::split_stats::get_ep_stat(gms::inet_address ep) {
storage_proxy::~storage_proxy() {}
storage_proxy::storage_proxy(distributed<database>& db) : _db(db) {
namespace sm = seastar::metrics;
_metrics.add_group(COORDINATOR_STATS_CATEGORY, {
sm::make_queue_length("foreground_writes", [this] { return _stats.writes - _stats.background_writes; },
sm::description("number of currently pending foreground write requests")),
@@ -486,7 +485,7 @@ storage_proxy::storage_proxy(distributed<database>& db) : _db(db) {
sm::make_queue_length("background_writes", [this] { return _stats.background_writes; },
sm::description("number of currently pending background write requests")),
sm::make_queue_length("throttled_writes", [this] { return _throttled_writes.size(); },
sm::make_queue_length("current_throttled_writes", [this] { return _throttled_writes.size(); },
sm::description("number of currently throttled write requests")),
sm::make_total_operations("throttled_writes", [this] { return _stats.throttled_writes; },
@@ -1733,14 +1732,14 @@ protected:
size_t _targets_count;
promise<> _done_promise; // all target responded
bool _timedout = false; // will be true if request timeouts
timer<lowres_clock> _timeout;
timer<storage_proxy::clock_type> _timeout;
size_t _responses = 0;
schema_ptr _schema;
virtual void on_timeout() {}
virtual size_t response_count() const = 0;
public:
abstract_read_resolver(schema_ptr schema, db::consistency_level cl, size_t target_count, lowres_clock::time_point timeout)
abstract_read_resolver(schema_ptr schema, db::consistency_level cl, size_t target_count, storage_proxy::clock_type::time_point timeout)
: _cl(cl)
, _targets_count(target_count)
, _schema(std::move(schema))
@@ -1796,7 +1795,7 @@ class digest_read_resolver : public abstract_read_resolver {
return _digest_results.size();
}
public:
digest_read_resolver(schema_ptr schema, db::consistency_level cl, size_t block_for, lowres_clock::time_point timeout) : abstract_read_resolver(std::move(schema), cl, 0, timeout), _block_for(block_for) {}
digest_read_resolver(schema_ptr schema, db::consistency_level cl, size_t block_for, storage_proxy::clock_type::time_point timeout) : abstract_read_resolver(std::move(schema), cl, 0, timeout), _block_for(block_for) {}
void add_data(gms::inet_address from, foreign_ptr<lw_shared_ptr<query::result>> result) {
if (!_timedout) {
// if only one target was queried digest_check() will be skipped so we can also skip digest calculation
@@ -2143,7 +2142,7 @@ private:
return false;
}
public:
data_read_resolver(schema_ptr schema, db::consistency_level cl, size_t targets_count, lowres_clock::time_point timeout) : abstract_read_resolver(std::move(schema), cl, targets_count, timeout) {
data_read_resolver(schema_ptr schema, db::consistency_level cl, size_t targets_count, storage_proxy::clock_type::time_point timeout) : abstract_read_resolver(std::move(schema), cl, targets_count, timeout) {
_data_results.reserve(targets_count);
}
void add_mutate_data(gms::inet_address from, foreign_ptr<lw_shared_ptr<reconcilable_result>> result) {
@@ -2330,7 +2329,7 @@ protected:
using targets_iterator = std::vector<gms::inet_address>::iterator;
using digest_resolver_ptr = ::shared_ptr<digest_read_resolver>;
using data_resolver_ptr = ::shared_ptr<data_read_resolver>;
using clock_type = lowres_clock;
using clock_type = storage_proxy::clock_type;
schema_ptr _schema;
shared_ptr<storage_proxy> _proxy;
@@ -2454,7 +2453,7 @@ protected:
uint32_t original_partition_limit() const {
return _cmd->partition_limit;
}
void reconcile(db::consistency_level cl, lowres_clock::time_point timeout, lw_shared_ptr<query::read_command> cmd) {
void reconcile(db::consistency_level cl, storage_proxy::clock_type::time_point timeout, lw_shared_ptr<query::read_command> cmd) {
data_resolver_ptr data_resolver = ::make_shared<data_read_resolver>(_schema, cl, _targets.size(), timeout);
auto exec = shared_from_this();
@@ -2529,12 +2528,12 @@ protected:
}
});
}
void reconcile(db::consistency_level cl, lowres_clock::time_point timeout) {
void reconcile(db::consistency_level cl, storage_proxy::clock_type::time_point timeout) {
reconcile(cl, timeout, _cmd);
}
public:
virtual future<foreign_ptr<lw_shared_ptr<query::result>>> execute(lowres_clock::time_point timeout) {
virtual future<foreign_ptr<lw_shared_ptr<query::result>>> execute(storage_proxy::clock_type::time_point timeout) {
digest_resolver_ptr digest_resolver = ::make_shared<digest_read_resolver>(_schema, _cl, _block_for, timeout);
auto exec = shared_from_this();
@@ -2604,7 +2603,7 @@ public:
class always_speculating_read_executor : public abstract_read_executor {
public:
using abstract_read_executor::abstract_read_executor;
virtual future<> make_requests(digest_resolver_ptr resolver, lowres_clock::time_point timeout) {
virtual future<> make_requests(digest_resolver_ptr resolver, storage_proxy::clock_type::time_point timeout) {
resolver->add_wait_targets(_targets.size());
// FIXME: consider disabling for CL=*ONE
bool want_digest = true;
@@ -2615,10 +2614,10 @@ public:
// this executor sends request to an additional replica after some time below timeout
class speculating_read_executor : public abstract_read_executor {
timer<> _speculate_timer;
timer<storage_proxy::clock_type> _speculate_timer;
public:
using abstract_read_executor::abstract_read_executor;
virtual future<> make_requests(digest_resolver_ptr resolver, lowres_clock::time_point timeout) {
virtual future<> make_requests(digest_resolver_ptr resolver, storage_proxy::clock_type::time_point timeout) {
_speculate_timer.set_callback([this, resolver, timeout] {
if (!resolver->is_completed()) { // at the time the callback runs request may be completed already
resolver->add_wait_targets(1); // we send one more request so wait for it too
@@ -2664,7 +2663,7 @@ class range_slice_read_executor : public abstract_read_executor {
public:
range_slice_read_executor(schema_ptr s, shared_ptr<storage_proxy> proxy, lw_shared_ptr<query::read_command> cmd, dht::partition_range pr, db::consistency_level cl, std::vector<gms::inet_address> targets, tracing::trace_state_ptr trace_state) :
abstract_read_executor(std::move(s), std::move(proxy), std::move(cmd), std::move(pr), cl, targets.size(), std::move(targets), std::move(trace_state)) {}
virtual future<foreign_ptr<lw_shared_ptr<query::result>>> execute(lowres_clock::time_point timeout) override {
virtual future<foreign_ptr<lw_shared_ptr<query::result>>> execute(storage_proxy::clock_type::time_point timeout) override {
reconcile(_cl, timeout);
return _result_promise.get_future();
}
@@ -2795,7 +2794,7 @@ future<foreign_ptr<lw_shared_ptr<query::result>>>
storage_proxy::query_singular(lw_shared_ptr<query::read_command> cmd, dht::partition_range_vector&& partition_ranges, db::consistency_level cl, tracing::trace_state_ptr trace_state) {
std::vector<::shared_ptr<abstract_read_executor>> exec;
exec.reserve(partition_ranges.size());
auto timeout = lowres_clock::now() + std::chrono::milliseconds(_db.local().get_config().read_request_timeout_in_ms());
auto timeout = storage_proxy::clock_type::now() + std::chrono::milliseconds(_db.local().get_config().read_request_timeout_in_ms());
for (auto&& pr: partition_ranges) {
if (!pr.is_singular()) {
@@ -2819,7 +2818,7 @@ storage_proxy::query_singular(lw_shared_ptr<query::read_command> cmd, dht::parti
}
future<std::vector<foreign_ptr<lw_shared_ptr<query::result>>>>
storage_proxy::query_partition_key_range_concurrent(lowres_clock::time_point timeout, std::vector<foreign_ptr<lw_shared_ptr<query::result>>>&& results,
storage_proxy::query_partition_key_range_concurrent(storage_proxy::clock_type::time_point timeout, std::vector<foreign_ptr<lw_shared_ptr<query::result>>>&& results,
lw_shared_ptr<query::read_command> cmd, db::consistency_level cl, dht::partition_range_vector::iterator&& i,
dht::partition_range_vector&& ranges, int concurrency_factor, tracing::trace_state_ptr trace_state,
uint32_t remaining_row_count, uint32_t remaining_partition_count) {
@@ -2923,7 +2922,7 @@ storage_proxy::query_partition_key_range(lw_shared_ptr<query::read_command> cmd,
schema_ptr schema = local_schema_registry().get(cmd->schema_version);
keyspace& ks = _db.local().find_keyspace(schema->ks_name());
dht::partition_range_vector ranges;
auto timeout = lowres_clock::now() + std::chrono::milliseconds(_db.local().get_config().read_request_timeout_in_ms());
auto timeout = storage_proxy::clock_type::now() + std::chrono::milliseconds(_db.local().get_config().read_request_timeout_in_ms());
// when dealing with LocalStrategy keyspaces, we can skip the range splitting and merging (which can be
// expensive in clusters with vnodes)
@@ -3957,24 +3956,22 @@ storage_proxy::query_nonsingular_mutations_locally(schema_ptr s, lw_shared_ptr<q
auto shard_cmd = make_lw_shared<query::read_command>(*cmd);
return do_with(cmd,
shard_cmd,
1u,
0u,
false,
static_cast<unsigned>(prs.size()),
std::unordered_map<element_and_shard, partition_range_and_sort_key>{},
mutation_result_merger{s, cmd},
dht::ring_position_range_vector_sharder{prs},
dht::ring_position_exponential_vector_sharder{prs},
global_schema_ptr(s),
tracing::global_trace_state_ptr(std::move(trace_state)),
[this, s, max_size] (lw_shared_ptr<query::read_command>& cmd,
lw_shared_ptr<query::read_command>& shard_cmd,
unsigned& shards_in_parallel,
unsigned& mutation_result_merger_key,
bool& no_more_ranges,
unsigned& partition_range_count,
std::unordered_map<element_and_shard, partition_range_and_sort_key>& shards_for_this_iteration,
mutation_result_merger& mrm,
dht::ring_position_range_vector_sharder& rprs,
dht::ring_position_exponential_vector_sharder& rpevs,
global_schema_ptr& gs,
tracing::global_trace_state_ptr& gt) {
return _db.local().get_result_memory_limiter().new_mutation_read(max_size).then([&, s] (query::result_memory_accounter ma) {
@@ -3985,36 +3982,32 @@ storage_proxy::query_nonsingular_mutations_locally(schema_ptr s, lw_shared_ptr<q
// because we'll throw away most of the results. So we'll exponentially increase
// concurrency starting at 1, so we won't waste on dense tables and at most
// `log(nr_shards) + ignore_msb_bits` latency multiplier for near-empty tables.
//
// We use the ring_position_exponential_vector_sharder to give us subranges that follow
// this scheme.
shards_for_this_iteration.clear();
// If we're reading from less than smp::count shards, then we can just append
// each shard in order without sorting. If we're reading from more, then
// we'll read from some shards at least twice, so the partitions within will be
// out-of-order wrt. other shards
auto this_iteration_subranges = rpevs.next(*s);
auto retain_shard_order = true;
for (auto i = 0u; i < shards_in_parallel; ++i) {
auto now = rprs.next(*s);
if (!now) {
no_more_ranges = true;
break;
}
// Let's see if this is a new shard, or if we can expand an existing range
auto&& rng_ok = shards_for_this_iteration.emplace(element_and_shard{now->element, now->shard}, partition_range_and_sort_key{now->ring_range, i});
if (!rng_ok.second) {
// We saw this shard already, enlarge the range (we know now->ring_range came from the same partition range;
// otherwise it would have had a unique now->element).
auto& rng = rng_ok.first->second.pr;
rng = nonwrapping_range<dht::ring_position>(std::move(rng.start()), std::move(now->ring_range.end()));
// This range is no longer ordered with respect to the others, so:
retain_shard_order = false;
no_more_ranges = true;
if (this_iteration_subranges) {
no_more_ranges = false;
retain_shard_order = this_iteration_subranges->inorder;
auto sort_key = 0u;
for (auto&& now : this_iteration_subranges->per_shard_ranges) {
shards_for_this_iteration.emplace(element_and_shard{this_iteration_subranges->element, now.shard}, partition_range_and_sort_key{now.ring_range, sort_key++});
}
}
auto key_base = mutation_result_merger_key;
// prepare for next iteration
// Each iteration uses a merger key that is either i in the loop above (so in the range [0, shards_in_parallel),
// or, the element index in prs (so in the range [0, partition_range_count). Make room for sufficient keys.
mutation_result_merger_key += std::max(shards_in_parallel, partition_range_count);
shards_in_parallel *= 2;
mutation_result_merger_key += std::max(smp::count, partition_range_count);
shard_cmd->partition_limit = cmd->partition_limit - mrm.partition_count();
shard_cmd->row_limit = cmd->row_limit - mrm.row_count();


@@ -71,7 +71,7 @@ public:
private:
struct rh_entry {
::shared_ptr<abstract_write_response_handler> handler;
timer<lowres_clock> expire_timer;
timer<clock_type> expire_timer;
rh_entry(::shared_ptr<abstract_write_response_handler>&& h, std::function<void()>&& cb);
};
@@ -253,7 +253,7 @@ private:
dht::partition_range_vector get_restricted_ranges(keyspace& ks, const schema& s, dht::partition_range range);
float estimate_result_rows_per_range(lw_shared_ptr<query::read_command> cmd, keyspace& ks);
static std::vector<gms::inet_address> intersection(const std::vector<gms::inet_address>& l1, const std::vector<gms::inet_address>& l2);
future<std::vector<foreign_ptr<lw_shared_ptr<query::result>>>> query_partition_key_range_concurrent(lowres_clock::time_point timeout,
future<std::vector<foreign_ptr<lw_shared_ptr<query::result>>>> query_partition_key_range_concurrent(clock_type::time_point timeout,
std::vector<foreign_ptr<lw_shared_ptr<query::result>>>&& results, lw_shared_ptr<query::read_command> cmd, db::consistency_level cl, dht::partition_range_vector::iterator&& i,
dht::partition_range_vector&& ranges, int concurrency_factor, tracing::trace_state_ptr trace_state,
uint32_t remaining_row_count, uint32_t remaining_partition_count);


@@ -47,6 +47,7 @@
#include <boost/range/algorithm.hpp>
#include <boost/range/adaptors.hpp>
#include <boost/range/join.hpp>
#include <boost/algorithm/cxx11/any_of.hpp>
#include "core/future-util.hh"
#include "core/pipe.hh"
@@ -382,11 +383,22 @@ get_fully_expired_sstables(column_family& cf, std::vector<sstables::shared_sstab
}
}
auto compacted_undeleted_gens = boost::copy_range<std::unordered_set<int64_t>>(cf.compacted_undeleted_sstables()
| boost::adaptors::transformed(std::mem_fn(&sstables::sstable::generation)));
auto has_undeleted_ancestor = [&compacted_undeleted_gens] (auto& candidate) {
return boost::algorithm::any_of(candidate->ancestors(), [&compacted_undeleted_gens] (auto gen) {
return compacted_undeleted_gens.count(gen);
});
};
// SSTables that do not contain live data are added to the list of possibly expired sstables.
for (auto& candidate : compacting) {
logger.debug("Checking if candidate of generation {} and max_deletion_time {} is expired, gc_before is {}",
candidate->generation(), candidate->get_stats_metadata().max_local_deletion_time, gc_before);
if (candidate->get_stats_metadata().max_local_deletion_time < gc_before) {
// A fully expired sstable which has an ancestor undeleted shouldn't be compacted because
// expired data won't be purged because undeleted sstables are taken into account when
// calculating max purgeable timestamp, and not doing it could lead to a compaction loop.
if (candidate->get_stats_metadata().max_local_deletion_time < gc_before && !has_undeleted_ancestor(candidate)) {
logger.debug("Adding candidate of generation {} to list of possibly expired sstables", candidate->generation());
candidates.push_back(candidate);
} else {


@@ -242,11 +242,12 @@ void compaction_manager::submit_sstable_rewrite(column_family* cf, sstables::sha
// sstable we are planning to work on:
_compacting_sstables.insert(sst);
auto task = make_lw_shared<compaction_manager::task>();
task->compacting_cf = cf;
_tasks.push_back(task);
task->compaction_done = with_semaphore(sem, 1, [this, cf, sst] {
task->compaction_done = with_semaphore(sem, 1, [this, task, cf, sst] {
_stats.active_tasks++;
if (_stopped) {
return make_ready_future<>();;
if (!can_proceed(task)) {
return make_ready_future<>();
}
return cf->compact_sstables(sstables::compaction_descriptor(
std::vector<sstables::shared_sstable>{sst},
@@ -462,6 +463,14 @@ future<> compaction_manager::perform_cleanup(column_family* cf) {
}
future<> compaction_manager::remove(column_family* cf) {
// FIXME: better way to iterate through compaction info for a given column family,
// although this path isn't performance sensitive.
for (auto& info : _compactions) {
if (cf->schema()->ks_name() == info->ks && cf->schema()->cf_name() == info->cf) {
info->stop("column family removal");
}
}
// We need to guarantee that a task being stopped will not retry to compact
// a column family being removed.
auto tasks_to_stop = make_lw_shared<std::vector<lw_shared_ptr<task>>>();


@@ -191,7 +191,8 @@ class partitioned_sstable_set : public sstable_set_impl {
using map_iterator = interval_map_type::const_iterator;
private:
schema_ptr _schema;
interval_map_type _sstables;
std::vector<shared_sstable> _unleveled_sstables;
interval_map_type _leveled_sstables;
private:
static interval_type make_interval(const schema& s, const dht::partition_range& range) {
return interval_type::closed(
@@ -207,16 +208,16 @@ private:
}
std::pair<map_iterator, map_iterator> query(const dht::partition_range& range) const {
if (range.start() && range.end()) {
return _sstables.equal_range(make_interval(range));
return _leveled_sstables.equal_range(make_interval(range));
}
else if (range.start() && !range.end()) {
auto start = singular(range.start()->value());
return { _sstables.lower_bound(start), _sstables.end() };
return { _leveled_sstables.lower_bound(start), _leveled_sstables.end() };
} else if (!range.start() && range.end()) {
auto end = singular(range.end()->value());
return { _sstables.begin(), _sstables.upper_bound(end) };
return { _leveled_sstables.begin(), _leveled_sstables.upper_bound(end) };
} else {
return { _sstables.begin(), _sstables.end() };
return { _leveled_sstables.begin(), _leveled_sstables.end() };
}
}
public:
@@ -234,29 +235,39 @@ public:
while (b != e) {
boost::copy(b++->second, std::inserter(result, result.end()));
}
return std::vector<shared_sstable>(result.begin(), result.end());
auto r = _unleveled_sstables;
r.insert(r.end(), result.begin(), result.end());
return r;
}
virtual void insert(shared_sstable sst) override {
auto first = sst->get_first_decorated_key().token();
auto last = sst->get_last_decorated_key().token();
using bound = dht::partition_range::bound;
_sstables.add({
make_interval(
dht::partition_range(
bound(dht::ring_position::starting_at(first)),
bound(dht::ring_position::ending_at(last)))),
value_set({sst})});
if (sst->get_sstable_level() == 0) {
_unleveled_sstables.push_back(std::move(sst));
} else {
auto first = sst->get_first_decorated_key().token();
auto last = sst->get_last_decorated_key().token();
using bound = dht::partition_range::bound;
_leveled_sstables.add({
make_interval(
dht::partition_range(
bound(dht::ring_position::starting_at(first)),
bound(dht::ring_position::ending_at(last)))),
value_set({sst})});
}
}
virtual void erase(shared_sstable sst) override {
auto first = sst->get_first_decorated_key().token();
auto last = sst->get_last_decorated_key().token();
using bound = dht::partition_range::bound;
_sstables.subtract({
make_interval(
dht::partition_range(
bound(dht::ring_position::starting_at(first)),
bound(dht::ring_position::ending_at(last)))),
value_set({sst})});
if (sst->get_sstable_level() == 0) {
_unleveled_sstables.erase(std::remove(_unleveled_sstables.begin(), _unleveled_sstables.end(), sst), _unleveled_sstables.end());
} else {
auto first = sst->get_first_decorated_key().token();
auto last = sst->get_last_decorated_key().token();
using bound = dht::partition_range::bound;
_leveled_sstables.subtract({
make_interval(
dht::partition_range(
bound(dht::ring_position::starting_at(first)),
bound(dht::ring_position::ending_at(last)))),
value_set({sst})});
}
}
virtual std::unique_ptr<incremental_selector_impl> make_incremental_selector() const override;
class incremental_selector;
@@ -264,6 +275,7 @@ public:
class partitioned_sstable_set::incremental_selector : public incremental_selector_impl {
schema_ptr _schema;
const std::vector<shared_sstable>& _unleveled_sstables;
map_iterator _it;
const map_iterator _end;
private:
@@ -272,32 +284,35 @@ private:
{i.upper().token(), boost::icl::is_right_closed(i.bounds())});
}
public:
incremental_selector(schema_ptr schema, const interval_map_type& sstables)
incremental_selector(schema_ptr schema, const std::vector<shared_sstable>& unleveled_sstables, const interval_map_type& leveled_sstables)
: _schema(std::move(schema))
, _it(sstables.begin())
, _end(sstables.end()) {
, _unleveled_sstables(unleveled_sstables)
, _it(leveled_sstables.begin())
, _end(leveled_sstables.end()) {
}
virtual std::pair<dht::token_range, std::vector<shared_sstable>> select(const dht::token& token) override {
auto pr = dht::partition_range::make(dht::ring_position::starting_at(token), dht::ring_position::ending_at(token));
auto interval = make_interval(*_schema, std::move(pr));
auto ssts = _unleveled_sstables;
while (_it != _end) {
if (boost::icl::contains(_it->first, interval)) {
return std::make_pair(to_token_range(_it->first), std::vector<shared_sstable>(_it->second.begin(), _it->second.end()));
ssts.insert(ssts.end(), _it->second.begin(), _it->second.end());
return std::make_pair(to_token_range(_it->first), std::move(ssts));
}
// we don't want to skip current interval if token lies before it.
if (boost::icl::lower_less(interval, _it->first)) {
return std::make_pair(dht::token_range::make({token, true}, {_it->first.lower().token(), false}),
std::vector<shared_sstable>());
std::move(ssts));
}
_it++;
}
return std::make_pair(dht::token_range::make_open_ended_both_sides(), std::vector<shared_sstable>());
return std::make_pair(dht::token_range::make_open_ended_both_sides(), std::move(ssts));
}
};
std::unique_ptr<incremental_selector_impl> partitioned_sstable_set::make_incremental_selector() const {
return std::make_unique<incremental_selector>(_schema, _sstables);
return std::make_unique<incremental_selector>(_schema, _unleveled_sstables, _leveled_sstables);
}
class compaction_strategy_impl {


@@ -71,6 +71,12 @@ void compression::set_compressor(compressor c) {
}
}
// locate() takes a byte position in the uncompressed stream, and finds
// the location of the compressed chunk on disk which contains it, and the
// offset in this chunk.
// locate() may only be used for offsets of actual bytes, and in particular
// the end-of-file position (one past the last byte) MUST not be used. If the
// caller wants to read from the end of file, it should simply read nothing.
compression::chunk_and_offset
compression::locate(uint64_t position) const {
auto ucl = uncompressed_chunk_length();
@@ -310,6 +316,9 @@ public:
virtual future<temporary_buffer<char>> skip(uint64_t n) override {
_pos += n;
assert(_pos <= _end_pos);
if (_pos == _end_pos) {
return make_ready_future<temporary_buffer<char>>();
}
auto addr = _compression_metadata->locate(_pos);
auto underlying_n = addr.chunk_start - _underlying_pos;
_underlying_pos = addr.chunk_start;


@@ -44,13 +44,7 @@ future<> sstable::read_filter(const io_priority_class& pc) {
large_bitset bs(filter.buckets.elements.size() * 64);
bs.load(filter.buckets.elements.begin(), filter.buckets.elements.end());
_components->filter = utils::filter::create_filter(filter.hashes, std::move(bs));
}).then([this] {
return io_check([&] {
return engine().file_size(this->filename(sstable::component_type::Filter));
});
});
}).then([this] (auto size) {
_filter_file_size = size;
});
}


@@ -880,10 +880,12 @@ static inline bytes_view consume_bytes(bytes_view& p, size_t len) {
return ret;
}
static inline clustering_key_prefix get_clustering_key(
const schema& schema, bytes_view col_name) {
mp_row_consumer::column col(schema, std::move(col_name), api::max_timestamp);
return std::move(col.clustering);
static inline clustering_key_prefix get_clustering_key(const schema& s, composite_view col_name) {
auto components = col_name.explode();
if (components.size() > s.clustering_key_size()) {
components.resize(s.clustering_key_size());
}
return clustering_key_prefix(std::move(components));
}
static bool has_static_columns(const schema& schema, index_entry &ie) {
@@ -955,9 +957,10 @@ sstables::sstable::find_disk_ranges(
auto& range_start = ck_ranges.begin()->start();
bool found_range_start = false;
uint64_t range_start_pos;
uint64_t prev_pos = 0;
auto& range_end = ck_ranges.begin()->end();
auto cmp = clustering_key_prefix::tri_compare(*schema);
auto cmp = clustering_key_prefix::prefix_equal_tri_compare(*schema);
while (num_blocks--) {
if (data.size() < 2) {
// When we break out of this loop, we give up on
@@ -976,7 +979,7 @@ sstables::sstable::find_disk_ranges(
// But we only need to match the clustering key, because
// we got a clustering key range to search for.
auto start_ck = get_clustering_key(*schema,
consume_bytes(data, len));
composite_view(consume_bytes(data, len), schema->is_compound()));
if (data.size() < 2) {
break;
}
@@ -985,49 +988,50 @@ sstables::sstable::find_disk_ranges(
break;
}
auto end_ck = get_clustering_key(*schema,
consume_bytes(data, len));
composite_view(consume_bytes(data, len), schema->is_compound()));
if (data.size() < 16) {
break;
}
uint64_t offset = consume_be<uint64_t>(data);
uint64_t width = consume_be<uint64_t>(data);
if (!found_range_start) {
if (!range_start || cmp(range_start->value(), end_ck) <= 0) {
range_start_pos = ie.position() + offset;
found_range_start = true;
}
}
uint64_t cur_pos = ie.position() + offset;
bool found_range_end = false;
uint64_t range_end_pos;
if (range_end) {
if (cmp(range_end->value(), start_ck) < 0) {
// this block is already past the range_end
found_range_end = true;
range_end_pos = ie.position() + offset;
range_end_pos = cur_pos;
} else if (cmp(range_end->value(), end_ck) < 0 || num_blocks == 0) {
// range_end is in the middle of this block.
// Note the strict inequality above is important:
// if range_end==end_ck the next block may contain
// still more items matching range_end.
found_range_end = true;
range_end_pos = ie.position() + offset + width;
range_end_pos = cur_pos + width;
}
} else if (num_blocks == 0) {
// When !range_end, read until the last block.
// In this case we could have also found the end of
// the partition using the index.
found_range_end = true;
range_end_pos = ie.position() + offset + width;
range_end_pos = cur_pos + width;
}
if (found_range_end) {
if (!found_range_start) {
// return empty range
range_start_pos = range_end_pos = 0;
if (!found_range_start) {
if (!range_start || cmp(range_start->value(), start_ck) <= 0) {
range_start_pos = prev_pos ? prev_pos : cur_pos;
found_range_start = true;
} else if (found_range_end || num_blocks == 0) {
range_start_pos = cur_pos;
found_range_start = true;
}
}
if (found_range_end) { // found_range_end implies found_range_start
return make_ready_future<disk_read_range>(
disk_read_range(range_start_pos, range_end_pos,
key, deltime));
}
prev_pos = cur_pos;
}
}
// Else, if more than one clustering-key range needs to be read,


@@ -100,35 +100,48 @@ future<> await_background_jobs_on_all_shards() {
}
class random_access_reader {
input_stream<char> _in;
std::unique_ptr<input_stream<char>> _in;
seastar::gate _close_gate;
protected:
virtual input_stream<char> open_at(uint64_t pos) = 0;
public:
future<temporary_buffer<char>> read_exactly(size_t n) {
return _in.read_exactly(n);
return _in->read_exactly(n);
}
void seek(uint64_t pos) {
_in = open_at(pos);
if (_in) {
seastar::with_gate(_close_gate, [in = std::move(_in)] () mutable {
auto fut = in->close();
return fut.then([in = std::move(in)] {});
});
}
_in = std::make_unique<input_stream<char>>(open_at(pos));
}
bool eof() { return _in.eof(); }
bool eof() { return _in->eof(); }
virtual future<> close() {
return _in.close();
return _close_gate.close().then([this] {
return _in->close();
});
}
virtual ~random_access_reader() { }
};
class file_random_access_reader : public random_access_reader {
file _file;
uint64_t _file_size;
size_t _buffer_size;
unsigned _read_ahead;
public:
virtual input_stream<char> open_at(uint64_t pos) override {
auto len = _file_size - pos;
file_input_stream_options options;
options.buffer_size = _buffer_size;
options.read_ahead = _read_ahead;
return make_file_input_stream(_file, pos, std::move(options));
return make_file_input_stream(_file, pos, len, std::move(options));
}
explicit file_random_access_reader(file f, size_t buffer_size = 8192)
: _file(std::move(f)), _buffer_size(buffer_size)
explicit file_random_access_reader(file f, uint64_t file_size, size_t buffer_size = 8192, unsigned read_ahead = 4)
: _file(std::move(f)), _file_size(file_size), _buffer_size(buffer_size), _read_ahead(read_ahead)
{
seek(0);
}
@@ -968,12 +981,15 @@ future<> sstable::read_simple(T& component, const io_priority_class& pc) {
auto file_path = filename(Type);
sstlog.debug(("Reading " + _component_map[Type] + " file {} ").c_str(), file_path);
return open_file_dma(file_path, open_flags::ro).then([this, &component] (file fi) {
auto f = make_checked_file(_read_error_handler, fi);
auto r = make_lw_shared<file_random_access_reader>(std::move(f), sstable_buffer_size);
auto fut = parse(*r, component);
return fut.finally([r = std::move(r)] {
return r->close();
}).then([r] {});
auto fut = fi.size();
return fut.then([this, &component, fi = std::move(fi)] (uint64_t size) {
auto f = make_checked_file(_read_error_handler, fi);
auto r = make_lw_shared<file_random_access_reader>(std::move(f), size, sstable_buffer_size);
auto fut = parse(*r, component);
return fut.finally([r = std::move(r)] {
return r->close();
}).then([r] {});
});
}).then_wrapped([this, file_path] (future<> f) {
try {
f.get();
@@ -1163,6 +1179,15 @@ future<> sstable::update_info_for_opened_data() {
return _index_file.size().then([this] (auto size) {
_index_file_size = size;
});
}).then([this] {
if (this->has_component(sstable::component_type::Filter)) {
return io_check([&] {
return engine().file_size(this->filename(sstable::component_type::Filter));
}).then([this] (auto size) {
_filter_file_size = size;
});
}
return make_ready_future<>();
}).then([this] {
this->set_clustering_components_ranges();
this->set_first_and_last_keys();
@@ -1199,19 +1224,16 @@ future<> sstable::create_data() {
// No need to set tunable priorities for it.
future<> sstable::load() {
return read_toc().then([this] {
return read_statistics(default_priority_class());
}).then([this] {
validate_min_max_metadata();
set_clustering_components_ranges();
return read_compression(default_priority_class());
}).then([this] {
return read_scylla_metadata(default_priority_class());
}).then([this] {
return read_filter(default_priority_class());
}).then([this] {;
return read_summary(default_priority_class());
}).then([this] {
return open_data();
return seastar::when_all_succeed(
read_statistics(default_priority_class()),
read_compression(default_priority_class()),
read_scylla_metadata(default_priority_class()),
read_filter(default_priority_class()),
read_summary(default_priority_class())).then([this] {
validate_min_max_metadata();
set_clustering_components_ranges();
return open_data();
});
});
}
@@ -1690,24 +1712,21 @@ populate_statistics_offsets(statistics& s) {
static
sharding_metadata
create_sharding_metadata(schema_ptr schema, const dht::decorated_key& first_key, const dht::decorated_key& last_key) {
auto range = dht::partition_range::make(dht::ring_position(first_key), dht::ring_position(last_key));
auto sharder = dht::ring_position_range_sharder(std::move(range));
auto prange = dht::partition_range::make(dht::ring_position(first_key), dht::ring_position(last_key));
auto sm = sharding_metadata();
auto rpras = sharder.next(*schema);
while (rpras) {
if (rpras->shard == engine().cpu_id()) {
for (auto&& range : dht::split_range_to_single_shard(*schema, prange, engine().cpu_id())) {
if (true) { // keep indentation
// we know left/right are not infinite
auto&& left = rpras->ring_range.start()->value();
auto&& right = rpras->ring_range.end()->value();
auto&& left = range.start()->value();
auto&& right = range.end()->value();
auto&& left_token = left.token();
auto left_exclusive = !left.has_key() && left.bound() == dht::ring_position::token_bound::end;
auto&& right_token = right.token();
auto right_exclusive = !right.has_key() && right.bound() == dht::ring_position::token_bound::start;
sm.token_ranges.elements.push_back({
sm.token_ranges.elements.push_back(disk_token_range{
{left_exclusive, to_bytes(bytes_view(left_token._data))},
{right_exclusive, to_bytes(bytes_view(right_token._data))}});
}
rpras = sharder.next(*schema);
}
return sm;
}
@@ -1951,19 +1970,20 @@ void sstable_writer::prepare_file_writer()
options.write_behind = 10;
if (!_compression_enabled) {
_writer = make_shared<checksummed_file_writer>(std::move(_sst._data_file), std::move(options), true);
_writer = std::make_unique<checksummed_file_writer>(std::move(_sst._data_file), std::move(options), true);
} else {
prepare_compression(_sst._components->compression, _schema);
_writer = make_shared<file_writer>(make_compressed_file_output_stream(std::move(_sst._data_file), std::move(options), &_sst._components->compression));
_writer = std::make_unique<file_writer>(make_compressed_file_output_stream(std::move(_sst._data_file), std::move(options), &_sst._components->compression));
}
}
void sstable_writer::finish_file_writer()
{
_writer->close().get();
auto writer = std::move(_writer);
writer->close().get();
if (!_compression_enabled) {
auto chksum_wr = static_pointer_cast<checksummed_file_writer>(_writer);
auto chksum_wr = static_cast<checksummed_file_writer*>(writer.get());
write_digest(_sst._write_error_handler, _sst.filename(sstable::component_type::Digest), chksum_wr->full_checksum());
write_crc(_sst._write_error_handler, _sst.filename(sstable::component_type::CRC), chksum_wr->finalize_checksum());
} else {
@@ -1971,6 +1991,16 @@ void sstable_writer::finish_file_writer()
}
}
sstable_writer::~sstable_writer() {
if (_writer) {
try {
_writer->close().get();
} catch (...) {
sstlog.error("sstable_writer failed to close file: {}", std::current_exception());
}
}
}
sstable_writer::sstable_writer(sstable& sst, const schema& s, uint64_t estimated_partitions,
uint64_t max_sstable_size, bool backup, bool leave_unsealed, const io_priority_class& pc)
: _sst(sst)
@@ -2324,6 +2354,11 @@ double sstable::get_compression_ratio() const {
}
}
std::unordered_set<uint64_t> sstable::ancestors() const {
const compaction_metadata& cm = get_compaction_metadata();
return boost::copy_range<std::unordered_set<uint64_t>>(cm.ancestors.elements);
}
void sstable::set_sstable_level(uint32_t new_level) {
auto entry = _components->statistics.contents.find(metadata_type::Stats);
if (entry == _components->statistics.contents.end()) {


@@ -325,6 +325,8 @@ public:
_collector.add_ancestor(generation);
}
std::unordered_set<uint64_t> ancestors() const;
// Returns true iff this sstable contains data which belongs to many shards.
bool is_shared() const {
return _shared;
@@ -803,7 +805,7 @@ class sstable_writer {
bool _backup;
bool _leave_unsealed;
bool _compression_enabled;
-shared_ptr<file_writer> _writer;
+std::unique_ptr<file_writer> _writer;
stdx::optional<components_writer> _components_writer;
private:
void prepare_file_writer();
@@ -811,6 +813,10 @@ private:
public:
sstable_writer(sstable& sst, const schema& s, uint64_t estimated_partitions,
uint64_t max_sstable_size, bool backup, bool leave_unsealed, const io_priority_class& pc);
~sstable_writer();
sstable_writer(sstable_writer&& o) : _sst(o._sst), _schema(o._schema), _pc(o._pc), _backup(o._backup),
_leave_unsealed(o._leave_unsealed), _compression_enabled(o._compression_enabled), _writer(std::move(o._writer)),
_components_writer(std::move(o._components_writer)) {}
void consume_new_partition(const dht::decorated_key& dk) { return _components_writer->consume_new_partition(dk); }
void consume(tombstone t) { _components_writer->consume(t); }
stop_iteration consume(static_row&& sr) { return _components_writer->consume(std::move(sr)); }


@@ -175,10 +175,10 @@ private:
stream_session_state _state = stream_session_state::INITIALIZED;
bool _complete_sent = false;
-// If the session is idle for 10 minutes, close the session
-std::chrono::seconds _keep_alive_timeout{60 * 10};
-// Check every 1 minutes
-std::chrono::seconds _keep_alive_interval{60};
+// If the session is idle for 300 minutes, close the session
+std::chrono::seconds _keep_alive_timeout{60 * 300};
+// Check every 10 minutes
+std::chrono::seconds _keep_alive_interval{60 * 10};
timer<lowres_clock> _keep_alive;
stream_bytes _last_stream_bytes;
lowres_clock::time_point _last_stream_progress;
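The hunk above only tunes constants, but the policy they drive is simple: a periodic check compares the time since the last observed progress against the idle timeout. A sketch of that decision logic with hypothetical names (the real session uses a seastar timer and lowres_clock):

```cpp
#include <chrono>

// Idle-session keep-alive policy, sketched: should_close() is what the
// periodic keep-alive tick would evaluate.
using std::chrono::seconds;
using std::chrono::steady_clock;

struct keep_alive_policy {
    seconds timeout{60 * 300};  // close after 300 idle minutes
    steady_clock::time_point last_progress{};

    bool should_close(steady_clock::time_point now) const {
        return now - last_progress >= timeout;
    }
};
```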


@@ -78,6 +78,7 @@ boost_tests = [
'virtual_reader_test',
'view_schema_test',
'counter_test',
'cell_locker_test',
]
other_tests = [


@@ -55,13 +55,13 @@ SEASTAR_TEST_CASE(test_reading_with_different_schemas) {
canonical_mutation cm1(m1);
canonical_mutation cm2(m2);
-{
+if (can_upgrade_schema(m1.schema(), m2.schema())) {
auto m = cm1.to_mutation(m1.schema());
m.upgrade(m2.schema());
assert_that(cm1.to_mutation(m2.schema())).is_equal_to(m);
}
-{
+if (can_upgrade_schema(m2.schema(), m1.schema())) {
auto m = cm2.to_mutation(m2.schema());
m.upgrade(m1.schema());
assert_that(cm2.to_mutation(m1.schema())).is_equal_to(m);

tests/cell_locker_test.cc (new file, 218 lines)

@@ -0,0 +1,218 @@
/*
* Copyright (C) 2017 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include "tests/test-utils.hh"
#include "disk-error-handler.hh"
#include <seastar/core/thread.hh>
#include "cell_locking.hh"
#include "mutation.hh"
#include "schema_builder.hh"
thread_local disk_error_signal_type commit_error;
thread_local disk_error_signal_type general_disk_error;
static schema_ptr make_schema()
{
return schema_builder("ks", "cf")
.with_column("pk", bytes_type, column_kind::partition_key)
.with_column("ck", bytes_type, column_kind::clustering_key)
.with_column("s1", bytes_type, column_kind::static_column)
.with_column("s2", bytes_type, column_kind::static_column)
.with_column("s3", bytes_type, column_kind::static_column)
.with_column("r1", bytes_type)
.with_column("r2", bytes_type)
.with_column("r3", bytes_type)
.build();
}
static schema_ptr make_alternative_schema()
{
return schema_builder("ks", "cf")
.with_column("pk", bytes_type, column_kind::partition_key)
.with_column("ck", bytes_type, column_kind::clustering_key)
.with_column("s0", bytes_type, column_kind::static_column)
.with_column("s1", bytes_type, column_kind::static_column)
.with_column("s2.5", bytes_type, column_kind::static_column)
.with_column("s3", bytes_type, column_kind::static_column)
.with_column("r0", bytes_type)
.with_column("r1", bytes_type)
.with_column("r2.5", bytes_type)
.with_column("r3", bytes_type)
.build();
}
static schema_ptr make_schema_disjoint_with_others()
{
return schema_builder("ks", "cf")
.with_column("pk", bytes_type, column_kind::partition_key)
.with_column("ck", bytes_type, column_kind::clustering_key)
.with_column("s8", bytes_type, column_kind::static_column)
.with_column("s9", bytes_type, column_kind::static_column)
.with_column("r8", bytes_type)
.with_column("r9", bytes_type)
.build();
}
static data_value empty_value = data_value(to_bytes(""));
static auto make_row(const sstring& key, std::initializer_list<sstring> cells) {
return std::pair<sstring, std::initializer_list<sstring>>(key, cells);
}
static mutation make_mutation(schema_ptr s, const sstring& pk, std::initializer_list<sstring> static_cells,
std::initializer_list<std::pair<sstring, std::initializer_list<sstring>>> clustering_cells)
{
auto m = mutation(partition_key::from_single_value(*s, to_bytes(pk)), s);
for (auto&& c : static_cells) {
m.set_static_cell(to_bytes(c), empty_value, api::new_timestamp());
}
for (auto&& r : clustering_cells) {
auto ck = clustering_key::from_single_value(*s, to_bytes(r.first));
for (auto&& c : r.second) {
m.set_clustered_cell(ck, to_bytes(c), empty_value, api::new_timestamp());
}
}
return m;
}
SEASTAR_TEST_CASE(test_simple_locking_cells) {
return seastar::async([&] {
auto destroy = [] (auto) { };
auto s = make_schema();
cell_locker cl(s);
auto m = make_mutation(s, "0", { "s1", "s3" }, {
make_row("one", { "r1", "r2" }),
make_row("two", { "r2", "r3" }),
});
auto l1 = cl.lock_cells(m.decorated_key(), partition_cells_range(m.partition())).get0();
auto f2 = cl.lock_cells(m.decorated_key(), partition_cells_range(m.partition()));
BOOST_REQUIRE(!f2.available());
destroy(std::move(l1));
destroy(f2.get0());
});
}
SEASTAR_TEST_CASE(test_disjoint_mutations) {
return seastar::async([&] {
auto s = make_schema();
cell_locker cl(s);
auto m1 = make_mutation(s, "0", { "s1" }, {
make_row("one", { "r1", "r2" }),
make_row("two", { "r3" }),
});
auto m2 = make_mutation(s, "0", { "s2" }, {
make_row("two", { "r1", "r2" }),
make_row("one", { "r3" }),
});
auto m3 = mutation(partition_key::from_single_value(*s, to_bytes("1")), s);
m3.partition() = m1.partition();
auto l1 = cl.lock_cells(m1.decorated_key(), partition_cells_range(m1.partition())).get0();
auto l2 = cl.lock_cells(m2.decorated_key(), partition_cells_range(m2.partition())).get0();
auto l3 = cl.lock_cells(m3.decorated_key(), partition_cells_range(m3.partition())).get0();
});
}
SEASTAR_TEST_CASE(test_single_cell_overlap) {
return seastar::async([&] {
auto destroy = [] (auto) { };
auto s = make_schema();
cell_locker cl(s);
auto m1 = make_mutation(s, "0", { "s1" }, {
make_row("one", { "r1", "r2" }),
make_row("two", { "r3" }),
});
auto m2 = make_mutation(s, "0", { "s1" }, {
make_row("two", { "r1", "r2" }),
make_row("one", { "r3" }),
});
auto m3 = make_mutation(s, "0", { "s2" }, {
make_row("two", { "r1" }),
make_row("one", { "r2", "r3" }),
});
auto l1 = cl.lock_cells(m1.decorated_key(), partition_cells_range(m1.partition())).get0();
auto f2 = cl.lock_cells(m2.decorated_key(), partition_cells_range(m2.partition()));
BOOST_REQUIRE(!f2.available());
destroy(std::move(l1));
auto l2 = f2.get0();
auto f3 = cl.lock_cells(m3.decorated_key(), partition_cells_range(m3.partition()));
BOOST_REQUIRE(!f3.available());
destroy(std::move(l2));
auto l3 = f3.get0();
});
}
SEASTAR_TEST_CASE(test_schema_change) {
return seastar::async([&] {
auto destroy = [] (auto) { };
auto s1 = make_schema();
auto s2 = make_alternative_schema();
cell_locker cl(s1);
auto m1 = make_mutation(s1, "0", { "s1", "s2", "s3"}, {
make_row("one", { "r1", "r2", "r3" }),
});
// disjoint with m1
auto m2 = make_mutation(s2, "0", { "s0", "s2.5"}, {
make_row("one", { "r0", "r2.5" }),
make_row("two", { "r1", "r3" }),
});
// overlaps with m1
auto m3 = make_mutation(s2, "0", { "s1" }, {
make_row("one", { "r1", "r3" }),
});
auto l1 = cl.lock_cells(m1.decorated_key(), partition_cells_range(m1.partition())).get0();
destroy(std::move(m1));
destroy(std::move(s1));
cl.set_schema(s2);
auto l2 = cl.lock_cells(m2.decorated_key(), partition_cells_range(m2.partition())).get0();
auto f3 = cl.lock_cells(m3.decorated_key(), partition_cells_range(m3.partition()));
BOOST_REQUIRE(!f3.available());
destroy(std::move(l1));
auto l3 = f3.get0();
auto s3 = make_schema_disjoint_with_others();
cl.set_schema(s3);
auto m4 = make_mutation(s3, "0", { "s8", "s9"}, {
make_row("one", { "r8", "r9" }),
make_row("two", { "r8", "r9" }),
});
auto l4 = cl.lock_cells(m4.decorated_key(), partition_cells_range(m4.partition())).get0();
});
}


@@ -149,6 +149,10 @@ typedef std::vector<sstring> segment_names;
static segment_names segment_diff(commitlog& log, segment_names prev = {}) {
segment_names now = log.get_active_segment_names();
segment_names diff;
// safety fix. We should always get segment names in alphabetical order, but
// we're not explicitly guaranteed it. Let's sort the sets just to be sure.
std::sort(now.begin(), now.end());
std::sort(prev.begin(), prev.end());
std::set_difference(prev.begin(), prev.end(), now.begin(), now.end(), std::back_inserter(diff));
return diff;
}
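The "safety fix" above exists because `std::set_difference` requires both input ranges to be sorted; sorting first makes the diff well-defined regardless of the order the segment names arrive in. A minimal illustration of the same idiom:

```cpp
#include <algorithm>
#include <iterator>
#include <string>
#include <vector>

// Returns the elements of prev that no longer appear in now. Both inputs are
// sorted first, because std::set_difference's behavior is undefined on
// unsorted ranges -- the same reason segment_diff sorts before diffing.
std::vector<std::string> names_gone(std::vector<std::string> prev,
                                    std::vector<std::string> now) {
    std::sort(prev.begin(), prev.end());
    std::sort(now.begin(), now.end());
    std::vector<std::string> diff;
    std::set_difference(prev.begin(), prev.end(), now.begin(), now.end(),
                        std::back_inserter(diff));
    return diff;
}
```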
@@ -254,7 +258,7 @@ SEASTAR_TEST_CASE(test_commitlog_delete_when_over_disk_limit) {
auto set = make_lw_shared<std::set<segment_id_type>>();
auto uuid = utils::UUID_gen::get_time_UUID();
-return do_until([set]() {return set->size() > 2;},
+return do_until([set, sem]() {return set->size() > 2 && sem->try_wait();},
[&log, set, uuid]() {
sstring tmp = "hej bubba cow";
return log.add_mutation(uuid, tmp.size(), [tmp](db::commitlog::output& dst) {
@@ -263,14 +267,13 @@ SEASTAR_TEST_CASE(test_commitlog_delete_when_over_disk_limit) {
BOOST_CHECK_NE(rp, db::replay_position());
set->insert(rp.id);
});
-}).then([&log, sem, segments]() {
-auto names = log.get_active_segment_names();
+}).then([&log, segments]() {
auto diff = segment_diff(log, *segments);
auto nn = diff.size();
auto dn = log.get_num_segments_destroyed();
BOOST_REQUIRE(nn > 0);
-BOOST_REQUIRE(nn <= names.size());
+BOOST_REQUIRE(nn <= segments->size());
BOOST_REQUIRE(dn <= nn);
}).finally([r = std::move(r)] {
});


@@ -73,6 +73,7 @@ schema_ptr get_schema() {
return schema_builder("ks", "cf")
.with_column("pk", int32_type, column_kind::partition_key)
.with_column("ck", int32_type, column_kind::clustering_key)
.with_column("s1", counter_type, column_kind::static_column)
.with_column("c1", counter_type)
.build();
}
@@ -90,6 +91,18 @@ atomic_cell_view get_counter_cell(mutation& m) {
return *acv;
};
atomic_cell_view get_static_counter_cell(mutation& m) {
auto& mp = m.partition();
const auto& cells = mp.static_row();
BOOST_REQUIRE_EQUAL(cells.size(), 1);
stdx::optional<atomic_cell_view> acv;
cells.for_each_cell([&] (column_id, const atomic_cell_or_collection& ac_o_c) {
acv = ac_o_c.as_atomic_cell();
});
BOOST_REQUIRE(bool(acv));
return *acv;
};
SEASTAR_TEST_CASE(test_counter_mutations) {
return seastar::async([] {
storage_service_for_tests ssft;
@@ -101,6 +114,7 @@ SEASTAR_TEST_CASE(test_counter_mutations) {
auto pk = partition_key::from_single_value(*s, int32_type->decompose(0));
auto ck = clustering_key::from_single_value(*s, int32_type->decompose(0));
auto& col = *s->get_column_definition(utf8_type->decompose(sstring("c1")));
auto& scol = *s->get_column_definition(utf8_type->decompose(sstring("s1")));
mutation m1(pk, s);
counter_cell_builder b1;
@@ -109,15 +123,28 @@ SEASTAR_TEST_CASE(test_counter_mutations) {
b1.add_shard(counter_shard(id[2], 3, 1));
m1.set_clustered_cell(ck, col, b1.build(api::new_timestamp()));
counter_cell_builder b1s;
b1s.add_shard(counter_shard(id[1], 4, 3));
b1s.add_shard(counter_shard(id[2], 5, 1));
b1s.add_shard(counter_shard(id[3], 6, 2));
m1.set_static_cell(scol, b1s.build(api::new_timestamp()));
mutation m2(pk, s);
counter_cell_builder b2;
-b1.add_shard(counter_shard(id[0], 1, 1));
+b2.add_shard(counter_shard(id[0], 1, 1));
b2.add_shard(counter_shard(id[2], -5, 4));
b2.add_shard(counter_shard(id[3], -100, 1));
m2.set_clustered_cell(ck, col, b2.build(api::new_timestamp()));
counter_cell_builder b2s;
b2s.add_shard(counter_shard(id[0], 8, 8));
b2s.add_shard(counter_shard(id[1], 1, 4));
b2s.add_shard(counter_shard(id[3], 9, 1));
m2.set_static_cell(scol, b2s.build(api::new_timestamp()));
mutation m3(pk, s);
m3.set_clustered_cell(ck, col, atomic_cell::make_dead(1, gc_clock::now()));
m3.set_static_cell(scol, atomic_cell::make_dead(1, gc_clock::now()));
mutation m4(pk, s);
m4.partition().apply(tombstone(0, gc_clock::now()));
@@ -131,15 +158,23 @@ SEASTAR_TEST_CASE(test_counter_mutations) {
counter_cell_view ccv { ac };
BOOST_REQUIRE_EQUAL(ccv.total_value(), -102);
ac = get_static_counter_cell(m);
BOOST_REQUIRE(ac.is_live());
ccv = counter_cell_view(ac);
BOOST_REQUIRE_EQUAL(ccv.total_value(), 20);
m.apply(m3);
ac = get_counter_cell(m);
BOOST_REQUIRE(!ac.is_live());
ac = get_static_counter_cell(m);
BOOST_REQUIRE(!ac.is_live());
m = m1;
m.apply(m4);
m.partition().compact_for_query(*s, gc_clock::now(), { query::clustering_range::make_singular(ck) },
false, query::max_rows);
BOOST_REQUIRE_EQUAL(m.partition().clustered_rows().calculate_size(), 0);
BOOST_REQUIRE(m.partition().static_row().empty());
// Difference
@@ -147,7 +182,12 @@ SEASTAR_TEST_CASE(test_counter_mutations) {
ac = get_counter_cell(m);
BOOST_REQUIRE(ac.is_live());
ccv = counter_cell_view(ac);
-BOOST_REQUIRE_EQUAL(ccv.total_value(), 3);
+BOOST_REQUIRE_EQUAL(ccv.total_value(), 2);
ac = get_static_counter_cell(m);
BOOST_REQUIRE(ac.is_live());
ccv = counter_cell_view(ac);
BOOST_REQUIRE_EQUAL(ccv.total_value(), 11);
m = mutation(s, m1.decorated_key(), m2.partition().difference(s, m1.partition()));
ac = get_counter_cell(m);
@@ -155,13 +195,22 @@ SEASTAR_TEST_CASE(test_counter_mutations) {
ccv = counter_cell_view(ac);
BOOST_REQUIRE_EQUAL(ccv.total_value(), -105);
ac = get_static_counter_cell(m);
BOOST_REQUIRE(ac.is_live());
ccv = counter_cell_view(ac);
BOOST_REQUIRE_EQUAL(ccv.total_value(), 9);
m = mutation(s, m1.decorated_key(), m1.partition().difference(s, m3.partition()));
BOOST_REQUIRE_EQUAL(m.partition().clustered_rows().calculate_size(), 0);
BOOST_REQUIRE(m.partition().static_row().empty());
m = mutation(s, m1.decorated_key(), m3.partition().difference(s, m1.partition()));
ac = get_counter_cell(m);
BOOST_REQUIRE(!ac.is_live());
ac = get_static_counter_cell(m);
BOOST_REQUIRE(!ac.is_live());
// Freeze
auto fm1 = freeze(m1);
@@ -206,18 +255,24 @@ SEASTAR_TEST_CASE(test_counter_update_mutations) {
auto pk = partition_key::from_single_value(*s, int32_type->decompose(0));
auto ck = clustering_key::from_single_value(*s, int32_type->decompose(0));
auto& col = *s->get_column_definition(utf8_type->decompose(sstring("c1")));
auto& scol = *s->get_column_definition(utf8_type->decompose(sstring("s1")));
auto c1 = atomic_cell::make_live_counter_update(api::new_timestamp(), long_type->decompose(int64_t(5)));
auto s1 = atomic_cell::make_live_counter_update(api::new_timestamp(), long_type->decompose(int64_t(4)));
mutation m1(pk, s);
m1.set_clustered_cell(ck, col, c1);
m1.set_static_cell(scol, s1);
auto c2 = atomic_cell::make_live_counter_update(api::new_timestamp(), long_type->decompose(int64_t(9)));
auto s2 = atomic_cell::make_live_counter_update(api::new_timestamp(), long_type->decompose(int64_t(8)));
mutation m2(pk, s);
m2.set_clustered_cell(ck, col, c2);
m2.set_static_cell(scol, s2);
auto c3 = atomic_cell::make_dead(api::new_timestamp() / 2, gc_clock::now());
mutation m3(pk, s);
m3.set_clustered_cell(ck, col, c3);
m3.set_static_cell(scol, c3);
auto counter_update_value = [&] (atomic_cell_view acv) {
return value_cast<int64_t>(long_type->deserialize_value(acv.value()));
@@ -230,9 +285,86 @@ SEASTAR_TEST_CASE(test_counter_update_mutations) {
BOOST_REQUIRE(ac.is_counter_update());
BOOST_REQUIRE_EQUAL(counter_update_value(ac), 14);
ac = get_static_counter_cell(m12);
BOOST_REQUIRE(ac.is_live());
BOOST_REQUIRE(ac.is_counter_update());
BOOST_REQUIRE_EQUAL(counter_update_value(ac), 12);
auto m123 = m12;
m123.apply(m3);
ac = get_counter_cell(m123);
BOOST_REQUIRE(!ac.is_live());
ac = get_static_counter_cell(m123);
BOOST_REQUIRE(!ac.is_live());
});
}
SEASTAR_TEST_CASE(test_transfer_updates_to_shards) {
return seastar::async([] {
storage_service_for_tests ssft;
auto s = get_schema();
auto pk = partition_key::from_single_value(*s, int32_type->decompose(0));
auto ck = clustering_key::from_single_value(*s, int32_type->decompose(0));
auto& col = *s->get_column_definition(utf8_type->decompose(sstring("c1")));
auto& scol = *s->get_column_definition(utf8_type->decompose(sstring("s1")));
auto c1 = atomic_cell::make_live_counter_update(api::new_timestamp(), long_type->decompose(int64_t(5)));
auto s1 = atomic_cell::make_live_counter_update(api::new_timestamp(), long_type->decompose(int64_t(4)));
mutation m1(pk, s);
m1.set_clustered_cell(ck, col, c1);
m1.set_static_cell(scol, s1);
auto c2 = atomic_cell::make_live_counter_update(api::new_timestamp(), long_type->decompose(int64_t(9)));
auto s2 = atomic_cell::make_live_counter_update(api::new_timestamp(), long_type->decompose(int64_t(8)));
mutation m2(pk, s);
m2.set_clustered_cell(ck, col, c2);
m2.set_static_cell(scol, s2);
auto c3 = atomic_cell::make_dead(api::new_timestamp() / 2, gc_clock::now());
mutation m3(pk, s);
m3.set_clustered_cell(ck, col, c3);
m3.set_static_cell(scol, c3);
auto m0 = m1;
transform_counter_updates_to_shards(m0, nullptr, 0);
auto empty = mutation(pk, s);
auto m = m1;
transform_counter_updates_to_shards(m, &empty, 0);
BOOST_REQUIRE_EQUAL(m, m0);
auto ac = get_counter_cell(m);
BOOST_REQUIRE(ac.is_live());
auto ccv = counter_cell_view(ac);
BOOST_REQUIRE_EQUAL(ccv.total_value(), 5);
ac = get_static_counter_cell(m);
BOOST_REQUIRE(ac.is_live());
ccv = counter_cell_view(ac);
BOOST_REQUIRE_EQUAL(ccv.total_value(), 4);
m = m2;
transform_counter_updates_to_shards(m, &m0, 0);
ac = get_counter_cell(m);
BOOST_REQUIRE(ac.is_live());
ccv = counter_cell_view(ac);
BOOST_REQUIRE_EQUAL(ccv.total_value(), 14);
ac = get_static_counter_cell(m);
BOOST_REQUIRE(ac.is_live());
ccv = counter_cell_view(ac);
BOOST_REQUIRE_EQUAL(ccv.total_value(), 12);
m = m3;
transform_counter_updates_to_shards(m, &m0, 0);
ac = get_counter_cell(m);
BOOST_REQUIRE(!ac.is_live());
ac = get_static_counter_cell(m);
BOOST_REQUIRE(!ac.is_live());
});
}


@@ -279,7 +279,7 @@ public:
auto stop_ms = defer([&ms] { ms.stop().get(); });
auto& ss = service::get_storage_service();
-ss.start(std::ref(*db));
+ss.start(std::ref(*db)).get();
auto stop_storage_service = defer([&ss] { ss.stop().get(); });
db->start(std::move(*cfg)).get();


@@ -29,7 +29,9 @@
#include <seastar/core/timer.hh>
#include <seastar/core/sleep.hh>
#include <seastar/tests/test-utils.hh>
#include <seastar/util/defer.hh>
#include <deque>
#include "utils/phased_barrier.hh"
#include "utils/logalloc.hh"
#include "utils/managed_ref.hh"
@@ -102,7 +104,7 @@ SEASTAR_TEST_CASE(test_compaction_with_multiple_regions) {
std::vector<managed_ref<int>> allocated1;
std::vector<managed_ref<int>> allocated2;
-int count = 32 * 1024 * 4;
+int count = 32 * 1024 * 4 * 2;
with_allocator(reg1.allocator(), [&] {
for (int i = 0; i < count; i++) {
@@ -529,11 +531,7 @@ inline void quiesce(FutureType&& fut) {
// a request may be broken into many continuations. While we could just yield many times, the
// exact amount needed to guarantee execution would be dependent on the internals of the
// implementation, we want to avoid that.
-timer<> tmr;
-tmr.set_callback([] { BOOST_FAIL("The future we were waiting for took too long to get ready"); });
-tmr.arm(2s);
-fut.get();
-tmr.cancel();
+with_timeout(lowres_clock::now() + 2s, std::move(fut)).get();
}
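The comment above explains why waiting a fixed number of yields is fragile; the fix waits for the future with a deadline instead. The same idea, transplanted to `std::future` as a hedged sketch (the original uses seastar's `with_timeout`, which is a different API):

```cpp
#include <chrono>
#include <future>
#include <stdexcept>

// Wait for a future's value, but fail loudly if it does not become ready
// within the deadline, instead of hanging or depending on scheduler internals.
template <typename T>
T get_with_timeout(std::future<T>& fut, std::chrono::milliseconds deadline) {
    if (fut.wait_for(deadline) != std::future_status::ready) {
        throw std::runtime_error("future took too long to become ready");
    }
    return fut.get();
}
```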
// Simple RAII structure that wraps around a region_group
@@ -859,15 +857,22 @@ class test_reclaimer: public region_group_reclaimer {
region_group _rg;
std::vector<size_t> _reclaim_sizes;
bool _shutdown = false;
shared_promise<> _unleash_reclaimer;
seastar::gate _reclaimers_done;
public:
-virtual void start_reclaiming() override {
-while (this->under_pressure()) {
-size_t reclaimed = test_async_reclaim_region::from_region(_rg.get_largest_region()).evict();
-_result_accumulator->_reclaim_sizes.push_back(reclaimed);
-}
+virtual void start_reclaiming() noexcept override {
+with_gate(_reclaimers_done, [this] {
+return _unleash_reclaimer.get_shared_future().then([this] {
+while (this->under_pressure()) {
+size_t reclaimed = test_async_reclaim_region::from_region(_rg.get_largest_region()).evict();
+_result_accumulator->_reclaim_sizes.push_back(reclaimed);
+}
+});
+});
}
~test_reclaimer() {
_reclaimers_done.close().get();
_rg.shutdown().get();
}
@@ -881,6 +886,10 @@ public:
test_reclaimer(size_t threshold) : region_group_reclaimer(threshold), _result_accumulator(this), _rg(*this) {}
test_reclaimer(test_reclaimer& parent, size_t threshold) : region_group_reclaimer(threshold), _result_accumulator(&parent), _rg(&parent._rg, *this) {}
void unleash() {
_unleash_reclaimer.set_value();
}
};
SEASTAR_TEST_CASE(test_region_groups_basic_throttling_simple_active_reclaim) {
@@ -888,6 +897,7 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_simple_active_reclaim) {
// allocate a single region to exhaustion, and make sure active reclaim is activated.
test_reclaimer simple(logalloc::segment_size);
test_async_reclaim_region simple_region(simple.rg(), logalloc::segment_size);
simple.unleash();
// Can't run this function until we have reclaimed something
auto fut = simple.rg().run_when_memory_available([] {});
@@ -912,6 +922,7 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_worst_offen
test_async_reclaim_region small_region(simple.rg(), logalloc::segment_size);
test_async_reclaim_region medium_region(simple.rg(), 2 * logalloc::segment_size);
test_async_reclaim_region big_region(simple.rg(), 3 * logalloc::segment_size);
simple.unleash();
// Can't run this function until we have reclaimed
auto fut = simple.rg().run_when_memory_available([&simple] {
@@ -941,6 +952,9 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_leaf_offend
test_async_reclaim_region small_region(small_leaf.rg(), logalloc::segment_size);
test_async_reclaim_region medium_region(root.rg(), 2 * logalloc::segment_size);
test_async_reclaim_region big_region(large_leaf.rg(), 3 * logalloc::segment_size);
root.unleash();
large_leaf.unleash();
small_leaf.unleash();
// Can't run this function until we have reclaimed. Try at the root, and we'll make sure
// that the leaves are forced correctly.
@@ -967,6 +981,8 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_ancestor_bl
test_reclaimer leaf(root, logalloc::segment_size);
test_async_reclaim_region root_region(root.rg(), logalloc::segment_size);
root.unleash();
leaf.unleash();
// Can't run this function until we have reclaimed. Try at the leaf, and we'll make sure
// that the root reclaims
@@ -992,6 +1008,8 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_big_region_
test_async_reclaim_region root_region(root.rg(), 4 * logalloc::segment_size);
test_async_reclaim_region big_leaf_region(leaf.rg(), 3 * logalloc::segment_size);
test_async_reclaim_region small_leaf_region(leaf.rg(), 2 * logalloc::segment_size);
root.unleash();
leaf.unleash();
auto fut = root.rg().run_when_memory_available([&root] {
BOOST_REQUIRE_EQUAL(root.reclaim_sizes().size(), 3);
@@ -1018,6 +1036,8 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_no_double_r
test_reclaimer leaf(root, logalloc::segment_size);
test_async_reclaim_region leaf_region(leaf.rg(), logalloc::segment_size);
root.unleash();
leaf.unleash();
auto fut_root = root.rg().run_when_memory_available([&root] {
BOOST_REQUIRE_EQUAL(root.reclaim_sizes().size(), 1);
@@ -1037,3 +1057,117 @@ SEASTAR_TEST_CASE(test_region_groups_basic_throttling_active_reclaim_no_double_r
BOOST_REQUIRE_EQUAL(root.reclaim_sizes()[0], logalloc::segment_size);
});
}
// Reproduces issue #2021
SEASTAR_TEST_CASE(test_no_crash_when_a_lot_of_requests_released_which_change_region_group_size) {
return seastar::async([] {
#ifndef DEFAULT_ALLOCATOR // Because we need memory::stats().free_memory();
logging::logger_registry().set_logger_level("lsa", seastar::log_level::debug);
auto free_space = memory::stats().free_memory();
size_t threshold = size_t(0.75 * free_space);
region_group_reclaimer recl(threshold, threshold);
region_group gr(recl);
auto close_gr = defer([&gr] { gr.shutdown().get(); });
region r(gr);
with_allocator(r.allocator(), [&] {
std::vector<managed_bytes> objs;
r.make_evictable([&] {
if (objs.empty()) {
return memory::reclaiming_result::reclaimed_nothing;
}
with_allocator(r.allocator(), [&] {
objs.pop_back();
});
return memory::reclaiming_result::reclaimed_something;
});
auto fill_to_pressure = [&] {
while (!recl.under_pressure()) {
objs.emplace_back(managed_bytes(managed_bytes::initialized_later(), 1024));
}
};
utils::phased_barrier request_barrier;
auto wait_for_requests = defer([&] { request_barrier.advance_and_await().get(); });
for (int i = 0; i < 1000000; ++i) {
fill_to_pressure();
future<> f = gr.run_when_memory_available([&, op = request_barrier.start()] {
// Trigger group size change (Refs issue #2021)
gr.update(-10);
gr.update(+10);
});
BOOST_REQUIRE(!f.available());
}
// Release
while (recl.under_pressure()) {
objs.pop_back();
}
});
#endif
});
}
SEASTAR_TEST_CASE(test_reclaiming_runs_as_long_as_there_is_soft_pressure) {
return seastar::async([] {
size_t hard_threshold = logalloc::segment_size * 8;
size_t soft_threshold = hard_threshold / 2;
class reclaimer : public region_group_reclaimer {
bool _reclaim = false;
protected:
void start_reclaiming() noexcept override {
_reclaim = true;
}
void stop_reclaiming() noexcept override {
_reclaim = false;
}
public:
reclaimer(size_t hard_threshold, size_t soft_threshold)
: region_group_reclaimer(hard_threshold, soft_threshold)
{ }
bool reclaiming() const { return _reclaim; };
};
reclaimer recl(hard_threshold, soft_threshold);
region_group gr(recl);
auto close_gr = defer([&gr] { gr.shutdown().get(); });
region r(gr);
with_allocator(r.allocator(), [&] {
std::vector<managed_bytes> objs;
BOOST_REQUIRE(!recl.reclaiming());
while (!recl.over_soft_limit()) {
objs.emplace_back(managed_bytes(managed_bytes::initialized_later(), logalloc::segment_size));
}
BOOST_REQUIRE(recl.reclaiming());
while (!recl.under_pressure()) {
objs.emplace_back(managed_bytes(managed_bytes::initialized_later(), logalloc::segment_size));
}
BOOST_REQUIRE(recl.reclaiming());
while (recl.under_pressure()) {
objs.pop_back();
}
BOOST_REQUIRE(recl.over_soft_limit());
BOOST_REQUIRE(recl.reclaiming());
while (recl.over_soft_limit()) {
objs.pop_back();
}
BOOST_REQUIRE(!recl.reclaiming());
});
});
}


@@ -76,13 +76,16 @@ int main(int argc, char** argv) {
});
uint64_t counter = 0;
logalloc::allocating_section alloc_sect;
alloc_sect.set_lsa_reserve(0);
alloc_sect.set_std_reserve(0);
while (counter < obj_count) {
auto obj = managed_bytes(managed_bytes::initialized_later(), obj_size);
{
alloc_sect(r, [&] {
auto obj = managed_bytes(managed_bytes::initialized_later(), obj_size);
logalloc::reclaim_lock l(r);
refs.push_back(std::move(obj));
}
});
++counter;


@@ -191,7 +191,6 @@ static mutation_sets generate_mutation_sets() {
.with_column("ck_col_2", bytes_type, column_kind::clustering_key)
.with_column("regular_col_1", bytes_type)
.with_column("regular_col_2", bytes_type)
.with_column("regular_counter_col_1", counter_type)
.with_column("static_col_1", bytes_type, column_kind::static_column)
.with_column("static_col_2", bytes_type, column_kind::static_column);
@@ -300,9 +299,20 @@ static mutation_sets generate_mutation_sets() {
}
}
static constexpr auto rmg_iterations = 10;
{
-random_mutation_generator gen;
-for (int i = 0; i < 10; ++i) {
+random_mutation_generator gen(random_mutation_generator::generate_counters::no);
+for (int i = 0; i < rmg_iterations; ++i) {
auto m = gen();
result.unequal.emplace_back(mutations{m, gen()}); // collision unlikely
result.equal.emplace_back(mutations{m, m});
}
}
{
random_mutation_generator gen(random_mutation_generator::generate_counters::yes);
for (int i = 0; i < rmg_iterations; ++i) {
auto m = gen();
result.unequal.emplace_back(mutations{m, gen()}); // collision unlikely
result.equal.emplace_back(mutations{m, m});
@@ -364,6 +374,7 @@ bytes make_blob(size_t blob_size) {
class random_mutation_generator::impl {
friend class random_mutation_generator;
generate_counters _generate_counters;
const size_t _external_blob_size = 128; // Should be enough to force use of external bytes storage
const column_id column_count = row::max_vector_size * 2;
std::mt19937 _gen;
@@ -375,30 +386,33 @@ class random_mutation_generator::impl {
return gc_clock::time_point() + std::chrono::seconds(dist(gen));
}
public:
-schema_ptr make_schema() {
+schema_ptr do_make_schema(data_type type) {
auto builder = schema_builder("ks", "cf")
.with_column("pk", bytes_type, column_kind::partition_key)
.with_column("ck1", bytes_type, column_kind::clustering_key)
-.with_column("ck2", bytes_type, column_kind::clustering_key)
-.with_column("c1", counter_type);
+.with_column("ck2", bytes_type, column_kind::clustering_key);
// Create enough columns so that row can overflow its vector storage
for (column_id i = 0; i < column_count; ++i) {
{
auto column_name = sprint("v%d", i);
-builder.with_column(to_bytes(column_name), bytes_type, column_kind::regular_column);
+builder.with_column(to_bytes(column_name), type, column_kind::regular_column);
}
{
auto column_name = sprint("s%d", i);
-builder.with_column(to_bytes(column_name), bytes_type, column_kind::static_column);
+builder.with_column(to_bytes(column_name), type, column_kind::static_column);
}
}
return builder.build();
}
-impl() {
+schema_ptr make_schema() {
+return _generate_counters ? do_make_schema(counter_type)
+: do_make_schema(bytes_type);
+}
+public:
+explicit impl(generate_counters counters) : _generate_counters(counters) {
_schema = make_schema();
for (int i = 0; i < 1024; ++i) {
@@ -424,8 +438,6 @@ public:
auto pkey = partition_key::from_single_value(*_schema, _blobs[0]);
mutation m(pkey, _schema);
auto& counter_column = *_schema->get_column_definition(utf8_type->decompose(sstring("c1")));
std::map<counter_id, std::set<int64_t>> counter_used_clock_values;
std::vector<counter_id> counter_ids;
std::generate_n(std::back_inserter(counter_ids), 8, counter_id::generate_random);
@@ -459,16 +471,16 @@ public:
auto columns_to_set = column_count_dist(_gen);
for (column_id i = 0; i < columns_to_set; ++i) {
auto cid = column_id_dist(_gen);
-if (kind == column_kind::regular_column && cid == counter_column.id) {
-auto cell = bool_dist(_gen)
-? random_counter_cell()
-: atomic_cell::make_dead(timestamp_dist(_gen), expiry_dist(_gen));
-r.apply(_schema->column_at(kind, cid), std::move(cell));
-continue;
-}
+auto get_live_cell = [&] {
+if (_generate_counters) {
+return random_counter_cell();
+} else {
+return atomic_cell::make_live(timestamp_dist(_gen), _blobs[value_blob_index_dist(_gen)]);
+}
+};
// FIXME: generate expiring cells
auto cell = bool_dist(_gen)
-? atomic_cell::make_live(timestamp_dist(_gen), _blobs[value_blob_index_dist(_gen)])
+? get_live_cell()
: atomic_cell::make_dead(timestamp_dist(_gen), expiry_dist(_gen));
r.apply(_schema->column_at(kind, cid), std::move(cell));
}
@@ -529,8 +541,8 @@ public:
random_mutation_generator::~random_mutation_generator() {}
-random_mutation_generator::random_mutation_generator()
-: _impl(std::make_unique<random_mutation_generator::impl>())
+random_mutation_generator::random_mutation_generator(generate_counters counters)
+: _impl(std::make_unique<random_mutation_generator::impl>(counters))
{ }
mutation random_mutation_generator::operator()() {


@@ -37,11 +37,19 @@ void for_each_mutation_pair(std::function<void(const mutation&, const mutation&,
// Calls the provided function on mutations. Is supposed to exercise as many differences as possible.
void for_each_mutation(std::function<void(const mutation&)>);
// Returns true if mutations in schema s1 can be upgraded to s2.
inline bool can_upgrade_schema(schema_ptr from, schema_ptr to) {
return from->is_counter() == to->is_counter();
}
class random_mutation_generator {
class impl;
std::unique_ptr<impl> _impl;
public:
-random_mutation_generator();
+struct generate_counters_tag { };
+using generate_counters = bool_class<generate_counters_tag>;
+explicit random_mutation_generator(generate_counters);
~random_mutation_generator();
mutation operator()();
schema_ptr schema() const;
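The `generate_counters_tag` / `bool_class` pair above is the strong-bool idiom: a tag-parameterized wrapper so that `generate_counters::yes` cannot be confused with some other boolean flag at a call site. A simplified self-contained version, modeled loosely on seastar's `bool_class` (this is a sketch, not Seastar's actual implementation):

```cpp
// Each Tag type produces a distinct, non-interconvertible boolean-like type.
template <typename Tag>
class bool_class {
    bool _value;
public:
    static const bool_class yes;
    static const bool_class no;
    explicit constexpr bool_class(bool v) : _value(v) {}
    explicit operator bool() const { return _value; }
    friend bool operator==(bool_class a, bool_class b) { return a._value == b._value; }
};
template <typename Tag> const bool_class<Tag> bool_class<Tag>::yes{true};
template <typename Tag> const bool_class<Tag> bool_class<Tag>::no{false};

// Usage mirroring the diff: a named flag type for the generator's mode.
struct generate_counters_tag {};
using generate_counters = bool_class<generate_counters_tag>;
```

Because the constructor and `operator bool` are explicit, `random_mutation_generator(true)` would not compile; callers must spell out `generate_counters::yes`, which documents intent at the call site.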


@@ -795,8 +795,7 @@ public:
};
SEASTAR_TEST_CASE(test_apply_is_atomic_in_case_of_allocation_failures) {
-random_mutation_generator gen;
+auto do_test = [] (auto&& gen) {
failure_injecting_allocation_strategy alloc(standard_allocator());
with_allocator(alloc, [&] {
auto target = gen();
@@ -857,7 +856,10 @@ SEASTAR_TEST_CASE(test_apply_is_atomic_in_case_of_allocation_failures) {
}
}
});
};
do_test(random_mutation_generator(random_mutation_generator::generate_counters::no));
do_test(random_mutation_generator(random_mutation_generator::generate_counters::yes));
return make_ready_future<>();
}

Some files were not shown because too many files have changed in this diff Show More