Compare commits

...

104 Commits

Author SHA1 Message Date
Shlomi Livne
8370e1bc2c release: prepare for 2.0.rc2
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-08-15 15:52:35 +03:00
Pekka Enberg
3a244a4734 docker: Switch to Scylla 2.0 RPM repository 2017-08-15 13:27:41 +03:00
Avi Kivity
3261c927d2 Update seastar submodule
* seastar 2383d60...d67c344 (1):
  > Merge "Fix crash in rpc due to access to already destroyed server socket" from Gleb

Fixes #2690
2017-08-14 16:24:05 +03:00
Avi Kivity
4577a89982 Update seastar submodule
* seastar cfe280c...2383d60 (1):
  > tls: Only recurse once in shutdown code

Fixes #2691
2017-08-14 15:10:27 +03:00
Avi Kivity
6ea306f898 Update seastar submodule
* seastar b9f4568...cfe280c (1):
  > scripts: perftune.py: change the network module mode auto selection heuristic
2017-08-14 10:30:42 +03:00
Avi Kivity
2afcc684b4 Update seastar submodule
* seastar 867b7c7...b9f4568 (4):
  > http: removed unneeded lamda captures
  > Merge "Prometheus to use output stream" from Amnon
  > http_test: Fix an http output stream test
  > Merge "Add output stream to http message reply" from Amnon

Fixes #2475
2017-08-10 12:05:14 +03:00
Avi Kivity
bdb8c861c7 Fork seastar submodule for 2.0 2017-08-10 12:00:31 +03:00
Takuya ASADA
ea933b4306 dist/debian: append postfix '~DISTRIBUTION' to scylla package version
We are moving to aptly to release .deb packages, which requires changes to
the Debian repository structure.
After the change, we will share the 'pool' directory between distributions.
However, our .deb package names for a specific release are exactly the same
across distributions, so the file names conflict.
To avoid the problem, we need to append the distribution name to the package version.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1502312935-22348-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 8e115d69a9)
2017-08-10 10:54:17 +03:00
Raphael S. Carvalho
19391cff14 sstables: close index file when sstable writer fails
The index file's output stream uses write-behind, but it is not closed
when an sstable write fails, which may lead to a crash.
This happened before with the data file (for which it is obviously easier
to reproduce) and was fixed by 0977f4fdf8.

Fixes #2673.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170807171146.10243-1-raphaelsc@scylladb.com>
(cherry picked from commit dddbd34b52)
2017-08-08 09:53:36 +03:00
Glauber Costa
87d9a4f1f1 add active streaming reads metric
In commit f38e4ff3f, we separated streaming reads from normal reads
for the purpose of determining the maximum number of reads going on.
However, that left us totally unaware of how many reads are
happening on behalf of streaming, which can be important information
when debugging issues.

This patch adds the metric so we don't fly blind.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <1501909973-32519-1-git-send-email-glauber@scylladb.com>
(cherry picked from commit 4a911879a3)
2017-08-05 11:07:18 +03:00
Pekka Enberg
c0f894ccef docker: Disable stall detector
Fixes #2162

Message-Id: <1501759957-4380-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit 90872ffa1f)
2017-08-03 14:53:03 +03:00
Takuya ASADA
2c892488ef dist/debian: check scylla user/group existence before adding them
To prevent installation failing on environments that already have the scylla
user/group, an existence check is needed.

Fixes #2389

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1495023805-14905-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 91ade1a660)
2017-08-03 13:01:30 +03:00
Avi Kivity
911608e9c4 database: prevent streaming reads from blocking normal reads
Streaming reads and normal reads share a semaphore, so if a bunch of
streaming reads use all available slots, no normal reads can proceed.

Fix by assigning streaming reads their own semaphore; they will compete
with normal reads once issued, and the I/O scheduler will determine the
winner.

Fixes #2663.
Message-Id: <20170802153107.939-1-avi@scylladb.com>

(cherry picked from commit f38e4ff3f9)
2017-08-03 12:27:48 +03:00
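The separation this commit describes can be sketched with two independent permit pools; the names and permit counts below are hypothetical illustrations, not Scylla's actual internals:

```python
import threading

# Hypothetical permit counts; the real limits live in Scylla's configuration.
NORMAL_PERMITS = 2
STREAMING_PERMITS = 2

# Before the fix, both kinds of read shared one semaphore, so streaming
# reads could consume every slot. Giving each kind its own semaphore means
# streaming reads can only starve other streaming reads; once issued, the
# I/O scheduler arbitrates between the two classes.
normal_read_permits = threading.Semaphore(NORMAL_PERMITS)
streaming_read_permits = threading.Semaphore(STREAMING_PERMITS)

def try_start_read(streaming: bool) -> bool:
    """Try to take a permit from the appropriate pool without blocking."""
    sem = streaming_read_permits if streaming else normal_read_permits
    return sem.acquire(blocking=False)
```

Even with every streaming permit taken, a normal read still acquires its permit immediately, which is exactly the starvation the shared semaphore allowed.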
Avi Kivity
d8ab07de37 database: remove streaming read queue length limit
If a streaming read fails due to queue overload, the entire repair fails.
Remove the limit for streaming and trust the caller (repair) to have bounded
concurrency.

Fixes #2659.
Message-Id: <20170802143448.28311-1-avi@scylladb.com>

(cherry picked from commit 911536960a)
2017-08-03 12:27:46 +03:00
Duarte Nunes
15eefbc434 tests/sstable_mutation_test: Don't use moved-from object
Fix a bug introduced in dbbb9e93d and exposed by gcc6 by not using a
moved-from object. Twice.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170802161033.4213-1-duarte@scylladb.com>
(cherry picked from commit 4c9206ba2f)
2017-08-03 09:46:18 +03:00
Vlad Zolotarov
4bb6ba6d58 utils::loading_cache: cancel the timer after closing the gate
The timer is armed inside the section guarded by _timer_reads_gate,
so it has to be canceled after the gate is closed.

Otherwise we may end up with an armed timer after the stop() method has
returned a ready future.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1501603059-32515-1-git-send-email-vladz@scylladb.com>
(cherry picked from commit 4b28ea216d)
2017-08-01 17:23:53 +01:00
Avi Kivity
6dd4a9a5a2 Merge "Ensure correct EOC for PI block cell names" from Duarte
"This series ensures we always write correct cell names to promoted
index cell blocks, taking into account the eoc of range tombstones.

Fixes #2333"

* 'pi-cell-name/v1' of github.com:duarten/scylla:
  tests/sstable_mutation_test: Test promoted index blocks are monotonic
  sstables: Consider eoc when flushing pi block
  sstables: Extract out converting bound_kind to eoc

(cherry picked from commit db7329b1cb)
2017-08-01 18:09:54 +03:00
Gleb Natapov
222e85d502 cql transport: run accept loop in the foreground
It was meant to run in the foreground since it is waited upon during
stop(), but as it is now, from stop()'s perspective it completes
after the first connection is accepted.

Fixes #2652

Message-Id: <20170801125558.GS20001@scylladb.com>
(cherry picked from commit 1da4d5c5ee)
2017-08-01 17:06:08 +03:00
Takuya ASADA
8d4a30e852 dist/ami: follow scylla-tools package name change on RedHat variants
Since scylla-tools generates two .rpm packages, we need to copy them to our AMI.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20170722090002.9850-1-syuu@scylladb.com>
(cherry picked from commit a998b7b3eb)
2017-07-31 18:57:29 +03:00
Avi Kivity
4710ee229d Merge "Reduce the effect of the latency metrics" from Amnon
"This series reduces the effect in two ways:
1. Remove the latency counters from the system keyspaces
2. Reduce the histogram size by limiting the maximum number of buckets and
   stop the last bucket."

Fixes #2650.

* 'amnon/remove_cf_latency_v2' of github.com:cloudius-systems/seastar-dev:
  database: remove latency from the system table
  estimated histogram: return a smaller histogram

(cherry picked from commit 3fe6731436)
2017-07-31 16:01:05 +03:00
Vlad Zolotarov
93cb78f21d utils::loading_cache: add stop() method
loading_cache runs a timer that may issue asynchronous operations
(queries) that end up writing into its internal fields.

We have to ensure that these operations have completed before we can
destroy the loading_cache object.

Fixes #2624

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1501208345-3687-1-git-send-email-vladz@scylladb.com>
2017-07-31 15:55:46 +03:00
Avi Kivity
af8151c4b7 Update scylla-ami submodule
* dist/ami/files/scylla-ami 2bd1481...b41e5eb (1):
  > Fix incorrect scylla-server sysconfig file edit for i3 memflush controller
2017-07-31 09:41:56 +03:00
Takuya ASADA
846d9da9c2 dist/debian: refuse upgrade if current scylla < 1.7.3 && commitlog remains
Commitlog replay fails when upgrading from <1.7.3 to 2.0, so we need to
refuse updating the package if the current scylla is <1.7.3 and a
commitlog remains.

Note: the problem is in the scylla-server package, but to prevent the
scylla-conf package from upgrading, %pretrans should be defined on scylla-conf.

Fixes #2551

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1501187555-4629-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 714540cd4c)
2017-07-31 09:18:58 +03:00
Paweł Dziepak
aaa59d3437 streamed_mutation: do not call fill_buffer() ahead of time
consume_mutation_fragments_until() allows consuming mutation fragments
until a specified condition is met. This patch reorganises its
implementation to avoid situations where fill_buffer() is called
with the stop condition already true.
Message-Id: <20170727122218.7703-1-pdziepak@scylladb.com>

(cherry picked from commit f02bef7917)
2017-07-27 17:48:26 +02:00
Tomasz Grabiec
66dd817582 mutation_partition: Always mark static row as continuous when no static columns
To avoid unnecessary cache misses after static columns are added.

Message-Id: <1500650057-26036-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 136d205855)
2017-07-27 14:59:33 +02:00
Tomasz Grabiec
5857a4756d Merge "Some fixes for performance regressions in perf_fast_forward" from Paweł
These patches contain some minor fixes for performance regressions reported
by perf_fast_forward after the partial cache was merged. The solution is still
far from perfect (one case still has a 30% degradation), but
there is some improvement, so there is no reason to hold these changes back.

Refs #2582.

Some numbers:
before - before cache changes were merged
(555621b537)

cache - at the commit that introduced the partial cache
(9b21a9bfb6)

after - recent master + this series
(based on e988121dbb)

Differences are shown relative to "before".

Testing effectiveness of caching of large partition, single-key slicing reads:
Large partitions, range [0, 500000], populating cache
  before      cache      after
 1636840    1013688    1234606
              -38%        -25%

Large partitions, range [0, 500000], reading from cache
  before      cache      after
 2012615    3076812    3035423
               +53%       +51%

Testing scanning small partitions with skips.
reading small partitions (skip 0)
 before      cache      after
 227060     165261     200639
              -27%       -11%

skipping small partitions (skip 1)
 before      cache      after
  29813      27312      38210
               -8%       +28%

Testing slicing small partitions:
slicing small partitions (offset 0, read 4096)
 before      cache      after
 195282     149695     180497
              -23%        -8%

* https://github.com/pdziepak/scylla.git perf_fast_forward-regression/v3:
  sstables: make sure that fill_buffer() actually fills buffer
  mutation_merger: improve handling of non-deferring fill_buffer()s
  partition_snapshot_row_cursor: avoid apply() in single-version cases
  sstables: introduce decorated_key_view
  ring_position_comparator: accept sstables::decorated_key_view
  sstable: keep a pre-computed token in summary_entry
  sstables: cache token in index entries
  index_reader: advance_and_check_if_present() use index_comparator
  ring_position_comparator: drop unused overloads
  cache_streamed_mutation: avoid moving clustering_row
  streamed_mutation: introduce consume_mutation_fragments_until()
  cache_streamed_mutation: use consumer based read_context reader
  rows_entry: make position() inlineable
  mutation_fragment: make destructor always_inline
  keys: introduce compound_wrapper::from_exploded_view()
  sstables: avoid copying key components
  compound_compat: explode: reserve some elements in a vector
  cache: short-circut static row logic if there are no static columns
  cache: use equality comparators instead of tri_compare
  sstables: avoid indirect calls to abstract_type::is_multi_cell()

(cherry picked from commit e9fc0b0491)
2017-07-27 13:58:23 +02:00
Takuya ASADA
f199047601 dist/redhat: limit metapackage dependencies to specific version of scylla packages
When we install the scylla metapackage with a version (e.g. scylla-1.7.1),
it always installs the newest scylla-server/-jmx/-tools from the repo
instead of the specified version of the packages.

To make the metapackage install packages of the same version, limit the
dependencies to the current package version.

Fixes #2642

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20170726193321.7399-1-syuu@scylladb.com>
(cherry picked from commit 91a75f141b)
2017-07-27 14:21:55 +03:00
Tomasz Grabiec
a8dcbb6bd0 row_cache: Fix potential timeout or deadlock due to sstable read concurrency limit
database::make_sstable_reader() creates a reader which needs to
obtain a semaphore permit when invoked. Therefore, each read may
create at most one such reader in order to be guaranteed to make
progress. If a reader tries to create another reader, that may
deadlock (or, for non-system tables, time out) if enough such
readers try to do the same thing at the same time.

Avoid the problem by dropping the previous reader before creating a new
one.

Refs #2644.

Message-Id: <1501152454-4866-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 22948238b6)
2017-07-27 13:58:40 +03:00
Duarte Nunes
bfd99d4e74 db/schema_tables: Drop dropped columns when dropping tables
Fixes #2633

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170726150228.2593-2-duarte@scylladb.com>
(cherry picked from commit 50ad0003c6)
2017-07-26 18:48:59 +02:00
Duarte Nunes
d40df89271 db/schema_tables: Store column_name in text form
As does Cassandra.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170726150228.2593-1-duarte@scylladb.com>
(cherry picked from commit 3425403126)
2017-07-26 18:48:58 +02:00
Duarte Nunes
3da54ffff0 schema_builder: Replace type when re-dropping column
Fixes #2634

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170725183933.5311-1-duarte@scylladb.com>
(cherry picked from commit e988121dbb)
2017-07-26 16:26:59 +02:00
Duarte Nunes
804793e291 tests/schema_change_test: Add test case for add+drop notification
Reproduces #2616

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170725170622.4380-2-duarte@scylladb.com>
(cherry picked from commit 472f32fb06)
2017-07-26 16:26:59 +02:00
Duarte Nunes
83ea9b6fc0 db/schema_tables: Consider differing dropped columns
If a node is notified of a schema change in which the schema's dropped
columns have changed, that node will miss the changes to the dropped
columns. A scenario where this can happen: a column c is dropped, then
added back with a different type, and then dropped again, with a node n
having seen the first drop and being notified of the subsequent add and
drop.

Fixes #2616

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170725170622.4380-1-duarte@scylladb.com>
(cherry picked from commit 33e18a1779)
2017-07-26 16:26:59 +02:00
Asias He
b45855fc1c gossip: Fix nr_live_nodes calculation
We need to consider the size of _live_endpoints: nr_live_nodes must not
be larger than it, otherwise the loop that collects live nodes can run
forever.

This is a regression introduced in commit 437899909d
(gossip: Talk to more live nodes in each gossip round).

Fixes #2637

Message-Id: <863ec3890647038ae1dfcffc73dde0163e29db20.1501026478.git.asias@scylladb.com>
(cherry picked from commit 515a744303)
2017-07-26 16:48:51 +03:00
Duarte Nunes
3900babff2 schema: Remove unnecessary print
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170725174000.71061-1-duarte@scylladb.com>
(cherry picked from commit 9c831b4e97)
2017-07-26 16:07:41 +03:00
Tomasz Grabiec
7c805187a9 Merge fixes related to row cache from Raphael
* git@github.com:raphaelsc/scylla.git row_cache_fixes:
  db: atomically synchronize cache with changes to the snapshot
  db: refresh row cache's underlying data source after compaction

(cherry picked from commit 18be42f71a)
2017-07-25 15:37:40 +02:00
Paweł Dziepak
345a91d55d tests/row_cache: test queries with no clustering ranges
Reproducer for #2604.
Message-Id: <20170725131220.17467-3-pdziepak@scylladb.com>

(cherry picked from commit 79a1ad7a37)
2017-07-25 15:37:32 +02:00
Paweł Dziepak
fda8b35cda tests: do not overload the meaning of empty clustering range
Empty clustering key range is perfectly valid and signifies that the
reader is not interested in anything but the static row. Let's not
make it mean anything else.
Message-Id: <20170725131220.17467-2-pdziepak@scylladb.com>

(cherry picked from commit 1ea507d6ae)
2017-07-25 15:37:29 +02:00
Paweł Dziepak
08ac0f1100 cache: fix aborts if no clustering range is specified
cache_streamed_mutation assumed that at least one clustering range was
specified. That was wrong since the readers are allowed to query just
for a static row (e.g. counter update that modifies only static
columns).

Fixes #2604.
Message-Id: <20170725131220.17467-1-pdziepak@scylladb.com>

(cherry picked from commit 6572f38450)
2017-07-25 15:37:28 +02:00
Calle Wilund
db455305a2 system_keyspace: Make sure "system" is written to keyspaces (visible)
Fixes #2514

Bug in the schema version 3 update: we failed to write "system" to the
schema tables. Only visible on an empty instance, of course.

Message-Id: <1500469809-23546-2-git-send-email-calle@scylladb.com>
(cherry picked from commit 7a583585a2)
2017-07-24 11:33:25 +02:00
Avi Kivity
e1a3052e76 tests: fix sstable_datafile_test build with boost 1.55
Boost 1.55 accidentally removed support for range-based for on
recursive_directory_iterator (earlier and later versions do
support it). Use old-style iteration instead.

Message-Id: <20170724080128.8824-1-avi@scylladb.com>
(cherry picked from commit c21bb5ae05)
2017-07-24 11:20:53 +03:00
Tomasz Grabiec
50fa3f3b89 schema_registry: Keep unused entries around for 1 second
This is in order to avoid frequent misses which have a relatively high
cost. A miss means we need to fetch schema definition from another
node and in case of writes do a schema merge.

If the schema is kept alive only by the incoming request, then it
will be forgotten immediately when the request is done, and the next
request using the same schema version will miss again.

Refs #2608.
Message-Id: <1500632447-10104-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 29a82f5554)
2017-07-24 10:12:09 +02:00
Tomasz Grabiec
8474b7a725 legacy_schema_migrator: Don't snapshot empty legacy tables
Otherwise we will create a new (empty) snapshot each time we boot.
Message-Id: <1500573920-31478-2-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit ecc85988dd)
2017-07-24 09:56:22 +02:00
Tomasz Grabiec
0fc874e129 database: Allow disabling auto snapshots during drop/truncate
Message-Id: <1500573920-31478-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 408cea66cd)
2017-07-24 09:56:19 +02:00
Duarte Nunes
5cf1a19f3f Merge 'Fix possible inconsistency of table schema version' from Tomasz
"Fixes issues uncovered in longevity test (#2608).

The main problem is that, due to time drift, the scylla_tables.version
column may not get deleted on all nodes doing the schema merge, which will
make some nodes come up with a different table schema version than others.

The inconsistency will not heal because scylla_tables doesn't
take part in the schema sync. This is fixed by the last patch.

This will cause nodes to constantly try to sync the schema, which under
some conditions triggers #2617."

* tag 'tgrabiec/fix-table-schema-version-inconsistency-v1' of github.com:scylladb/seastar-dev:
  schema_tables: Add scylla_tables to ALL
  schema: Make schema_mutations equality consistent with digest
  schema_tables: Extract compact_for_schema_digest()
  schema_tables: Always drop scylla_tables::version

(cherry picked from commit 937fe80a1a)
2017-07-24 09:54:45 +02:00
Tomasz Grabiec
f48466824f schema_registry: Ensure schema_ptr is always synced on the other core
global_schema_ptr ensures that the schema object is replicated to other
cores on access. It replicated the "synced" state as well, but
only when the shard didn't know about the schema. It could happen that
the other shard has the entry but it is not yet synced, in which case
we would fail to replicate the "synced" state. This results in an
exception from mutate(), which rejects attempts to mutate using an
unsynced schema.

The fix is to always replicate the "synced" state. If the entry is
syncing, we will preemptively mark it as synced earlier. The syncing
code is already prepared for this.

Refs #2617.
Message-Id: <1500555224-15825-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 65c64614aa)
2017-07-24 09:52:31 +02:00
Avi Kivity
914f6f019f Update ami submodule
* dist/ami/files/scylla-ami 5dfe42f...2bd1481 (1):
  > Enable support for experimental CPU controller in i3 instances
2017-07-24 10:27:35 +03:00
Shlomi Livne
f5bb363f96 release: prepare for 2.0.rc1
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-07-23 09:47:11 +03:00
Duarte Nunes
61ba56f628 schema: Support compaction enabled attribute
Fixes #2547

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170721132206.3037-1-duarte@scylladb.com>
(cherry picked from commit 7eecda3a61)
2017-07-21 15:39:48 +02:00
Tomasz Grabiec
f4d3e5cdcf Merge "Drop mutations that raced with truncate" from Duarte
Instead of retrying, just drop mutations that raced with a truncate.

* git@github.com:duarten/scylla.git truncate-reorder/v1:
  database: Rename replay_position_reordered_exception
  database: Drop mutations that raced with truncate

(cherry picked from commit 63caa58b70)
2017-07-21 15:39:20 +02:00
Avi Kivity
0291a4491e Merge "restrict background writers with scheduling groups" from Glauber
"This patchset restricts background writers - such as compactions,
streaming flushes and memtable flushes to a maximum amount of CPU usage
through a seastar::thread_scheduling_group.

The recommended maximum is 50%; it is disabled by default, but can be
adjusted through a configuration option until we are able to auto-tune it.

The second patch in this series previews what such auto-tuning would
look like: a simple controller automatically adjusts the quota for the
memtable writer processes so that the rate at which bytes come in equals
the rate at which bytes are flushed.

Tail latencies are greatly reduced by this series, and heavy spikes that
previously appeared on CPU-bound workloads are no more."

* 'memtable-controller-v5' of https://github.com/glommer/scylla:
  simple controller for memtable/streaming writer shares.
  restrict background writers to 50 % of CPU.

(cherry picked from commit c5ee62a6a4)
2017-07-20 15:13:39 +03:00
Duarte Nunes
83cc640c6a Merge 'Revert back to 1.7 schema layout in memory' from Tomasz
"Fixes schema layout incompatibility in a mixed 1.7 and 2.0 cluster (#2555)
by reverting back to using the old layout in memory and thus also
in across-node requests. We still use the new v3 layout in schema
tables (needed by drivers and external tools). Translations happen
when converting to/from schema mutations."

* tag 'tgrabiec/use-v2-schema-layout-in-memory-v2' of github.com:scylladb/seastar-dev:
  schema: Revert back to the 1.7 layout of static compact tables in memory
  schema: Use v3 column layout when converting to/from schema mutations
  schema: Encapsulate column layout translations in the v3_columns class

(cherry picked from commit 1daf1bc4bb)
2017-07-19 19:49:43 +03:00
Duarte Nunes
2f06c54033 thrift/handler: Remove leftover debug artifacts
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170705161156.2307-1-duarte@scylladb.com>
(cherry picked from commit d583ef6860)
2017-07-19 19:49:35 +03:00
Calle Wilund
9abe7651f7 system_schema: Fix remaining places not handling two system keyspaces
Some places remained where code looked directly at
system_keyspace::NAME to determine whether a ks is
considered special/system/protected, including
schema digest calculation.

Export "is_system_keyspace" and use accordingly.

Message-Id: <1500469809-23546-1-git-send-email-calle@scylladb.com>
(cherry picked from commit 247c36e048)
2017-07-19 19:48:30 +03:00
Amos Kong
784aea12e7 scylla_raid_setup: fix syntax error
/usr/lib/scylla/scylla_raid_setup: line 132: syntax error
near unexpected token `fi'

Fixes #2610

Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <af3a5bc77c5ba2b49a8f48a5aaa19afffb787886.1500430021.git.amos@scylladb.com>
(cherry picked from commit 2bdcad5bc3)
2017-07-19 11:10:43 +03:00
Avi Kivity
3a98959eba dist: tolerate sysctl failures
sysctl may fail in a container environment if /proc is not virtualized
properly.

Fixes #1990
Message-Id: <20170625145930.31619-1-avi@scylladb.com>

(cherry picked from commit 08488a75e0)
2017-07-18 15:45:41 +03:00
Duarte Nunes
2c7d597307 wrapping_range: Fix lvalue transform()
Instead of copying and moving the bound, pass it by reference so the
transformer can decide whether to copy. The only caller so far doesn't
want a copy and takes the value by reference, which would end up
capturing a temporary value. Caught by the view_schema_test with gcc7.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170705210255.29669-1-duarte@scylladb.com>
(cherry picked from commit 3dd0397700)
2017-07-18 14:35:58 +03:00
Duarte Nunes
8d46c4e049 thrift: Fail when mixed CFs are detected
Fixes #2588

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170717222612.7429-1-duarte@scylladb.com>
(cherry picked from commit d9fa3bf322)
2017-07-18 10:21:45 +03:00
Asias He
b1c080984f gossip: Implement the missing fd_max_interval_ms and fd_initial_value_ms option
This is useful for larger clusters with higher gossip message latency. By
default fd_max_interval_ms is 2 seconds, which means the
failure_detector will ignore any gossip message update interval larger
than 2 seconds. However, in a larger cluster, the gossip message update
interval can exceed 2 seconds.

Fixes #2603.

Message-Id: <49b387955fbf439e49f22e109723d3a19d11a1b9.1500278434.git.asias@scylladb.com>
(cherry picked from commit adc5f0bd21)
2017-07-17 13:29:30 +03:00
Duarte Nunes
e1706c36b7 Merge 'Fixes around migration to v3 schema tables' from Tomasz
branch 'tgrabiec/schema-migration-fixes' of github.com:scylladb/seastar-dev:
  schema: Use proper name comparator
  legacy_schema_migrator: Properly migrate non-UTF8 named columns
  schema_tables: Store column_name in text form
  legacy_schema_migrator: Migrate columns like Cassandra
  schema_builder: Add factory method for default_names
  legacy_schema_migrator: Simplify logic
  thrift: Don't set regular_column_name_type
  schema: Use proper column name type for static columns
  schema: Fix column_name_type() for static compact tables
  schema: Introduce clustering_column_at()
  thrift: Reuse cell_comparator::to_sstring() for obtaining comparator type
  partition_slice_builder: Use proper column's type instead of regular_column_name_type()

(cherry picked from commit 13caccf1cf)
2017-07-17 12:42:19 +03:00
Avi Kivity
63c8306733 Update seastar submodule
* seastar b812cee...867b7c7 (1):
  > rpc: start server's send loop only after protocol negotiation

Fixes #2600.

Still tracking upstream.
2017-07-17 10:41:59 +03:00
Avi Kivity
a7dfdc0155 tests: move tmpdir to /tmp
Reduces view_schema_test runtime from 53 seconds to 5 seconds on an NVMe
disk with write-back cache, and from forever on a spinning disk.
Message-Id: <20170716081653.10018-1-avi@scylladb.com>

(cherry picked from commit d9c64ef737)
2017-07-17 08:47:17 +03:00
Avi Kivity
70be29173a tests: copy the sstable with an unknown component to the data directory
We will be creating links to those sstables' files, and links don't work
if the data directory and the test sstable are on different devices.

Copying the files to the same directory fixes the problem.
Message-Id: <20170716090405.14307-1-avi@scylladb.com>

(cherry picked from commit 9116dd91cb)
2017-07-17 08:47:08 +03:00
Avi Kivity
e09d4a9b75 Update seastar submodule
* seastar 844bcfb...b812cee (1):
  > Update dpdk submodule

Fixes #2595 (again).

Still tracking master.
2017-07-16 17:01:48 +03:00
Avi Kivity
67f25e56a6 Update seastar submodule
* seastar ff34c42...844bcfb (1):
  > Update dpdk submodule

Still tracking master.

Fixes #2595.
2017-07-15 19:18:10 +03:00
Tomasz Grabiec
74c4651b95 Merge "Fixes for memtable flushing and replay positions" from Duarte
We don't ensure that mutations are applied in memory in the order of their
replay positions. A memtable can thus be flushed with replay position rp
while the new one is at replay position rp', where rp' < rp. This breaks
an intrinsic assumption in the code, which this series addresses.

Fixes #2074

branch memtable-flush/v3 of git@github.com:duarten/scylla.git:
  commitlog: Always flush latest memtable
  column_family: More precise count of switched memtables
  column_family: Fix typo in pending_tasks metric name
  column_family: More precise count of pending flushes
  dirty_memory_manager: Remove unnecessary check from flush_one()
  column_family: Don't rely on flush_queue to guarantee flushes finished
  column_family: Don't bother closing the flush_queue on stop()
  column_family: Stop using flush_queue
  column_family: Remove outdated comment about the flush_queue
  memtable: Stop tracking the highest flushed rp

(cherry picked from commit caa62f7f05)
2017-07-14 19:07:33 +02:00
Duarte Nunes
58bfb86d73 storage_proxy: Preserve replica order across mutations
In storage_proxy we arrange the mutations sent by the replicas in a
vector of vectors, such that each row corresponds to a partition key
and each column contains the mutation (possibly empty) sent by a
particular replica.

There is reconciliation-related code that assumes that all the
mutations sent by a particular replica can be found in a single
column, but that isn't guaranteed by the way we initially arrange the
mutations.

This patch fixes that by enforcing the expected order.

Fixes #2531
Fixes #2593

Signed-off-by: Gleb Natapov <gleb@scylladb.com>
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170713162014.15343-1-duarte@scylladb.com>
(cherry picked from commit b8235f2e88)
2017-07-14 12:11:50 +03:00
Tomasz Grabiec
cb94c66823 legacy_schema_migrator: Fix calculation of is_dense
The current algorithm marked tables with regular columns not named
"value" as not dense, which doesn't have to be the case; it can be
either way.

It should be enough to look at the clustering components: if there is a
clustering key, the table is dense if and only if all comparator
components belong to the clustering key.

If there is no clustering key and there are any regular columns,
we're sure it's not dense.

Fixes #2587.

Message-Id: <1499877777-7083-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 30ec4af949)
2017-07-13 17:28:25 +03:00
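The rule the commit message states can be sketched as a small predicate; the function name and argument shapes are illustrative, not the migrator's actual API:

```python
def is_dense(comparator_components, clustering_key_components, regular_columns):
    """Sketch of the is_dense rule from the commit message.

    - With a clustering key: dense iff every comparator component belongs
      to the clustering key.
    - Without one: any regular column means the table is not dense.
      (The commit message leaves the no-clustering-key, no-regular-columns
      case open; returning True for it here is an assumption.)
    """
    if clustering_key_components:
        return all(c in clustering_key_components for c in comparator_components)
    return not regular_columns
```

So a table whose comparator adds components beyond the clustering key is classified as sparse even if its only regular column happens not to be named "value", which is the case the old algorithm got wrong.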
Tomasz Grabiec
5aa3e23fcd gdb: Fix "scylla columnfamilies" command
Broken in 0e4d5bc2f3.

Message-Id: <1499951956-26206-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 54953c8d27)
2017-07-13 16:33:50 +03:00
Takuya ASADA
aac1d5d54d dist/common/systemd: move scylla-server.service to be after network-online.target instead of network.target
To make sure Scylla starts after the network is up, we need to move from
network.target to network-online.target.

Fixes #2337

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1493661832-9545-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 0c81974bc4)
2017-07-12 13:36:52 +03:00
Glauber Costa
a371b8a5bf change task quota's default
The default of 2ms is somewhat arbitrary. Now that we have a lot more
mileage deploying Scylla in production, it seems not only arbitrary
but high.

In particular, it is really hard to achieve 1ms latencies with it in the
face of CPU-heavy workloads.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <1499354495-27173-1-git-send-email-glauber@scylladb.com>
(cherry picked from commit 780a6e4d2e)
2017-07-12 10:21:35 +03:00
Avi Kivity
a69fb8a8ed Update seastar submodule
* seastar 89cc97c...ff34c42 (6):
  > tls: Wrap all IO in semaphore (Fixes #2575)
  > tests/lowres_clock_test.cc: Declare helper static
  > tests/lowres_clock_test.cc: fix compilation error for older GCC
  > configure.py: verifies boost version
  > pkg-config: Eliminate spaces in include path arguments
  > allow applications to override task-quota-ms

Still tracking seastar master.
2017-07-12 10:20:49 +03:00
Avi Kivity
00b9640b2c Merge "Preserve table schema digest on schema tables migration" from Tomasz
"Currently new nodes calculate digests based on v3 schema mutations,
which are very different from v2 mutations. As a result they will
use schemas with different table_schema_version than the old nodes.
The old nodes will not recognize the version and will try to request
its definition. That will fail, because old nodes don't understand
v3 schema mutations.

To fix this problem, let's preserve the digests during migration,
so that they're the same on new and old nodes. This will allow
requests to proceed as usual.

This does not solve the problem of schema being changed during
the rolling upgrade. This is not allowed, as it would bring the
same problem back.

Fixes #2549."

* tag 'tgrabiec/use-consistent-schema-table-digests-v2' of github.com:cloudius-systems/seastar-dev:
  tests: Add test for concurrent column addition
  legacy_schema_migrator: Set digest to one compatible with the old nodes
  schema_tables: Persist table_schema_version
  schema_tables: Introduce system_schema.scylla_tables
  schema_tables: Simplify read_table_mutations()
  schema_tables: Resurrect v2 read_table_mutations()
  system_keyspace: Forward-declare legacy schemas
  legacy_schema_migrator: Take storage_proxy as dependency

(cherry picked from commit a397889c81)
2017-07-11 17:23:21 +03:00
Gleb Natapov
59d608f77f consistency_level: report less live endpoints in Unavailable exception if there are pending nodes
DowngradingConsistencyRetryPolicy uses live replicas count from
Unavailable exception to adjust CL for retry, but when there are pending
nodes CL is increased internally by a coordinator and that may prevent
retried query from succeeding. Adjust live replica count in case of
pending node presence so that retried query will be able to proceed.

Fixes #2535

Message-Id: <20170710085238.GY2324@scylladb.com>
(cherry picked from commit 739dd878e3)
2017-07-11 17:16:46 +03:00
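The adjustment described above can be sketched as simple arithmetic. The function below is our illustration of the commit text, not the actual Scylla code:

```cpp
#include <algorithm>
#include <cassert>

// Since the coordinator internally raises the required CL by the number of
// pending nodes, report correspondingly fewer live replicas so that a
// downgraded retry can actually succeed. (Illustrative only.)
int reported_live_endpoints(int live, int pending) {
    return std::max(0, live - pending);
}
```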
Botond Dénes
1717922219 Fix crash in the out-of-order restrictions error msg composition
Use the name of the existing preceding column with a restriction
(last_column) instead of assuming that the column right after the
current column already has restrictions.
This will yield an error message that is different from that of
Cassandra, albeit still a correct one.

Fixes #2421

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <40335768a2c8bd6c911b881c27e9ea55745c442e.1499781685.git.bdenes@scylladb.com>
(cherry picked from commit 33bc62a9cf)
2017-07-11 17:15:45 +03:00
Paweł Dziepak
7cd4bb0c4a transport: send correct type id for counter columns
CQL reply may contain metadata that describes columns present in the
response including the information about their type.

However, Scylla incorrectly reports counter types as bigint. The
serialised format of counters and bigint is exactly the same, which
could explain why the problem hasn't been noticed earlier but it is a
bug nevertheless.

Fixes #2569.
Message-Id: <20170711130520.27603-1-pdziepak@scylladb.com>

(cherry picked from commit 5aa523aaf9)
2017-07-11 16:37:24 +03:00
Tomasz Grabiec
588ae935e7 legacy_schema_migrator: Use separate joinpoint instance for each table
Otherwise we may deadlock, as explained in commit 5e8f0efc8:

Table drop starts with creating a snapshot on all shards. All shards
must use the same snapshot timestamp which, among other things, is
part of the snapshot name. The timestamp is generated using supplied
timestamp generating function (joinpoint object). The joinpoint object
will wait for all shards to arrive and then generate and return the
timestamp.

However, we drop tables in parallel, using the same joinpoint
instance. So joinpoint may be contacted by snapshotting shards of
tables A and B concurrently, generating timestamp t1 for some shards
of table A and some shards of table B. Later the remaining shards of
table A will get a different timestamp. As a result, different shards
may use different snapshot names for the same table. The snapshot
creation will never complete because the sealing fiber waits for all
shards to signal it, on the same name.
Message-Id: <1499762663-21967-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 310d2a54d2)
2017-07-11 12:31:21 +03:00
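The invariant the fix restores — one joinpoint instance per table, so every shard of that table observes the same generated timestamp — can be shown in miniature with `std::call_once`. This is toy code with our naming; the real joinpoint is a Scylla/Seastar construct with different machinery:

```cpp
#include <cassert>
#include <mutex>

// Toy joinpoint: the first caller generates the value, every later caller
// (e.g. each shard of one table) sees that same value. A separate instance
// per table therefore yields exactly one snapshot timestamp per table.
class joinpoint {
    std::once_flag _once;
    long _value = 0;
    long (*_gen)();
public:
    explicit joinpoint(long (*gen)()) : _gen(gen) {}
    long get() {
        std::call_once(_once, [this] { _value = _gen(); });
        return _value;
    }
};

long next_ts() {            // monotonically increasing fake timestamp source
    static long c = 0;
    return ++c;
}

bool one_timestamp_per_joinpoint() {
    joinpoint a(next_ts), b(next_ts);
    long a1 = a.get(), a2 = a.get(), b1 = b.get();
    return a1 == a2 && a1 != b1;  // stable within an instance, distinct across
}
```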
Avi Kivity
c292e86b3c sstables: fix use-after-free in read_simple()
`r` is moved-from, and later captured in a different lambda. The compiler may
perform the move first and the other capture later, resulting in a use-after-free.

Fix by copying `r` instead of moving it.

Discovered by sstable_test in debug mode.
Message-Id: <20170702082546.20570-1-avi@scylladb.com>

(cherry picked from commit 07b8adce0e)
2017-07-10 15:32:57 +03:00
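The hazard class, and the copy-based fix, can be shown in miniature. This is our illustration of the pattern, not the actual read_simple() code:

```cpp
#include <cassert>
#include <string>
#include <utility>

// When two lambdas built inside one call expression both need `r`, and one
// of them moves it, the order in which the captures are evaluated is
// unspecified: the move may happen first, leaving the other lambda with a
// moved-from value.
std::string use_both_safely(std::string r) {
    auto use_both = [](auto a, auto b) { return a() + b(); };
    // Buggy shape (do NOT do this): unspecified whether the copy or the
    // move of `r` is evaluated first:
    //   use_both([cr = r] { return cr; }, [mr = std::move(r)] { return mr; });
    // Fixed shape, as in the patch: copy `r` into each capture.
    return use_both([r] { return r; }, [r] { return r; });
}
```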
Asias He
3dc0d734b0 repair: Do not store the failed ranges
The number of failed ranges can be large, so it can consume a lot of memory.
We already log the failed ranges in the log. No need to store them
in memory.

Message-Id: <7a70c4732667c5c3a69211785e8efff0c222fc28.1498809367.git.asias@scylladb.com>
(cherry picked from commit b2a2fbcf73)
2017-07-10 14:37:47 +03:00
Takuya ASADA
2d612022ba dist/common/scripts/scylla_cpuscaling_setup: skip configuration when the cpufreq driver isn't loaded
Configuring the cpufreq service on VMs/IaaS causes an error because they don't support cpufreq.
To prevent this error, skip the whole configuration when the driver is not loaded.

Fixes #2051

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1498809504-27029-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 1c35549932)
2017-07-10 14:08:54 +03:00
Nadav Har'El
5f6100c0aa repair: further limit parallelism of checksum calculation
Repair today has a semaphore limiting the number of ongoing checksum
comparisons running in parallel (on one shard) to 100. We needed this
number to be fairly high, because a "checksum comparison" can involve
high latency operations - namely, sending an RPC request to another node
in a remote DC and waiting for it to calculate a checksum there, and while
waiting for a response we need to proceed calculating checksums in parallel.

But as a consequence, in the current code, we can end up with as many as
100 fibers all at the same stage of reading partitions to checksum from
sstables. This requires tons of memory, to hold at least 128K of buffer
(even more with read-ahead) for each of these fibers, plus partition data
for each. But doing 100 reads in parallel is pointless - one (or very few)
should be enough.

So this patch adds another semaphore to limit the number of checksum
*calculations* (including the read and checksum calculation) on each shard
to just 2. There may still be 100 ongoing checksum *comparisons*, in
other stages of the comparisons (sending the checksum requests to other
and waiting for them to return), but only 2 will ever be in the stage of
reading from disk and checksumming them.

The limit of 2 checksum calculations (per shard) applies on the repair
slave, not just to the master: The slave may receive many checksum
requests in parallel, but will only actually work on 2 at a time.

Because the parallelism=100 now rate-limits operations which use very little
memory, in the future we can safely increase it even more, to support
situations where the disk is very fast but the link between nodes has
very high latency.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20170703151329.25716-1-nyh@scylladb.com>
(cherry picked from commit d177ec05cb)
2017-07-10 14:08:28 +03:00
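The two-tier limit can be sketched with plain C++ threads. This is a toy model with our names, not Seastar fibers: many comparisons may be pending, but an inner semaphore caps how many are simultaneously in the read-and-checksum stage.

```cpp
#include <atomic>
#include <cassert>
#include <chrono>
#include <condition_variable>
#include <mutex>
#include <thread>
#include <vector>

// Minimal counting semaphore so the sketch needs only C++11/14, not C++20.
class semaphore {
    std::mutex _m;
    std::condition_variable _cv;
    int _count;
public:
    explicit semaphore(int n) : _count(n) {}
    void acquire() {
        std::unique_lock<std::mutex> l(_m);
        _cv.wait(l, [&] { return _count > 0; });
        --_count;
    }
    void release() {
        { std::lock_guard<std::mutex> l(_m); ++_count; }
        _cv.notify_one();
    }
};

// Run `comparisons` workers, but let at most `calc_limit` of them be in the
// expensive "read + checksum" stage at once; return the observed peak.
int max_concurrent_calculations(int comparisons, int calc_limit) {
    semaphore calc_sem(calc_limit);
    std::atomic<int> current{0}, peak{0};
    std::vector<std::thread> workers;
    for (int i = 0; i < comparisons; ++i) {
        workers.emplace_back([&] {
            calc_sem.acquire();        // enter the checksum-calculation stage
            int now = ++current;
            int seen = peak.load();
            while (now > seen && !peak.compare_exchange_weak(seen, now)) {}
            std::this_thread::sleep_for(std::chrono::milliseconds(2));
            --current;
            calc_sem.release();
        });
    }
    for (auto& t : workers) {
        t.join();
    }
    return peak.load();
}
```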
Avi Kivity
d475a44b01 Merge "Silence schema pull errors during upgrade from 1.7 to 2.0" from Tomasz
"Old and new nodes will advertise different schema version because
of different format of schema tables. This will result in attempts
to sync the schema by each of the node.

Currently this will result in scary error messages in logs about
sync failing due to not being able to find schema of given version.
It's benign, but may scare users. In the future, incompatibilities
could result in more subtle errors. Better to inhibit it completely."

* 'tgrabiec/fix-schema-pull-errors-during-upgrade' of github.com:cloudius-systems/seastar-dev:
  migration_manager: Give empty response to schema pulls from incompatible nodes
  migration_manager: Don't pull schema from incompatible nodes
  service: Advertise schema tables format version through gossip

(cherry picked from commit 91221e020b)
2017-07-10 14:04:41 +03:00
Pekka Enberg
e02d4935ee idl: Fix frozen_schema version numbers
The IDL changes will appear in 2.0 so fix up the version numbers.

Message-Id: <1499680669-6757-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit 8112d7c5c0)
2017-07-10 14:02:37 +03:00
Botond Dénes
25f8d365b5 Add text(sstring) version of count, max and min functions
Fixes #2459

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <b6abb97f21c0caea8e36c7590b92a12d148195db.1499666251.git.bdenes@scylladb.com>
(cherry picked from commit 66cbc45321)
2017-07-10 12:48:29 +03:00
Tomasz Grabiec
de7cb7bfa4 tests: commitlog: Check there are no segments left on disk after clean shutdown
Reproduces #2550.

Message-Id: <1499358825-17855-2-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 72e01b7fe8)
2017-07-09 19:25:44 +03:00
Tomasz Grabiec
b8eb4ed9cd commitlog: Discard active but unused segments on shutdown
So that they are not left on disk even though we did a clean shutdown.

First part of the fix is to ensure that closed segments are recognized
as not allocating (_closed flag). Not doing this prevents them from
being collected by discard_unused_segments(). Second part is to
actually call discard_unused_segments() on shutdown after all segments
were shut down, so that those whose positions are cleared can be
removed.

Fixes #2550.

Message-Id: <1499358825-17855-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 6555a2f50b)
2017-07-09 19:25:42 +03:00
Tomasz Grabiec
fcc05e8ae9 legacy_schema_migrator: Drop tables instead of truncate()+remove()
It achieves similar effect, but is safer than non-standard remove()
path. The latter was missing unregistration from compaction manager.

Fixes #2554.

Message-Id: <1499447165-30253-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit d33d29ad95)
2017-07-09 18:36:56 +03:00
Botond Dénes
05e2ac80af cql3: Add K_FROZEN and K_TUPLE to basic_unreserved_keyword
To allow the non-reserved keywords "frozen" and "tuple" to be used as
column names without double-quotes.

Fixes #2507

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <9ae17390662aca90c14ae695c9b4a39531c6cde6.1499329781.git.bdenes@scylladb.com>
(cherry picked from commit c4277d6774)
2017-07-06 18:20:22 +03:00
Avi Kivity
8fa1add26d Update seastar submodule
* seastar 0ab7ae5...89cc97c (4):
  > future-utils: fix do_for_each exception reporting
  > core/thread: Fix unwind information for seastar threads
  > build: export full cflags in pkgconfig file
  > configure: Avoid putting tmp file on /tmp

Still tracking seastar master.
2017-07-06 17:31:06 +03:00
Takuya ASADA
c0a2ca96dd dist/common/scripts/scylla_raid_setup: prevent renaming MDRAID device after reboot
On Debian variants, mdadm.conf should be placed in /etc/mdadm instead of /etc.
Also it seems we need to run update-initramfs to fix the renaming issue.

Fixes #2502

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1499179912-14125-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 71624d7919)
2017-07-04 18:07:33 +03:00
Avi Kivity
c9ed522fa8 Merge "Adjust row cache metrics for row granularity" from Tomasz
* tag 'tgrabiec/row-cache-metrics-v2' of github.com:cloudius-systems/seastar-dev:
  row_cache: Switch _stats.hits/misses to row granularity
  row_cache: Rename num_entries() to partitions() for clarity
  row_cache: Track mispopulations also at row level
  row_cache: Track row insertions
  row_cache: Track row hits and misses
  row_cache: Make mispopulation counter also apply for continuity information
  row_cache: Add partition_ prefix to current counters
  misc_services: Switch to using reads_with[_no]_misses counters
  row_cache: Add metrics for operations on underlying reader
  row_cache: Add reader-related metrics
  row_cache: Remove dead code

(cherry picked from commit b1a0e37fcb)
2017-07-04 15:21:00 +03:00
Tomasz Grabiec
9078433a7f row_cache: Restore update of concurrent_misses_same_key
It was lost in action in 6f6575f456.

Message-Id: <1499168837-5072-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit e720b317c9)
2017-07-04 14:51:19 +03:00
Avi Kivity
7893a3aad2 Merge "Use selective_token_range_sharder in repair" from Asias
"This series introduces selective_token_range_sharder and uses it in repair to
generate dht::token_range belongs to a specific shard."

* tag 'asias/repair-selective_token_range_sharder-v3' of github.com:cloudius-systems/seastar-dev:
  repair: Use selective_token_range_sharder
  tests: Add test_selective_token_range_sharder
  dht: Add selective_token_range_sharder

(cherry picked from commit 66e56511d6)
2017-07-04 14:18:08 +03:00
Nadav Har'El
e467eef58d Fix test to use non-wrapping range
The test put a wrapping range into a non-wrapping range variable.
This was harmless at the time this test was written, but newer code
may not be as forgiving, so it is better to use a non-wrapping range as intended.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20170704103128.29689-1-nyh@scylladb.com>
(cherry picked from commit d95f908586)
2017-07-04 14:18:01 +03:00
Tomasz Grabiec
19a07143eb row_cache: Drop not very useful prefixes from metric names
This drops "total_opertaions_" and "objects_" prefixes. There is no
convention of adding them in other parts of the system, and they don't
add much value.

Fixes scylladb/scylla-grafana-monitoring#169.

Message-Id: <1499160342-25865-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 1d6fec0755)
2017-07-04 13:37:24 +03:00
Raphael S. Carvalho
a619b978c4 database: fix potential use-after-free in sstable cleanup
When do_for_each is in its last iteration and with_semaphore defers
because there's an ongoing cleanup, the sstable object will be used after
being freed, because it was taken by reference and the container it lives
in was destroyed prematurely.

Let's fix it with a do_with, also making code nicer.

Fixes #2537.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170630035324.19881-1-raphaelsc@scylladb.com>
(cherry picked from commit b9d0645199)
2017-07-03 12:49:13 +03:00
Gleb Natapov
2c66b40a69 main: wait for wait_for_gossip_to_settle() to complete during boot
Boot should not continue until a future returned by
wait_for_gossip_to_settle() is resolved.  Commit 991ec4a16 mistakenly
broke that, so restore it. Also fix calls to supervisor::notify()
to be in the right places.

Message-Id: <20170702082355.GQ14563@scylladb.com>
(cherry picked from commit d23111312f)
2017-07-02 11:33:04 +03:00
Tomasz Grabiec
079844a51d row_cache: Fix compilation errors with gcc 5
Message-Id: <1498741526-27055-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 97005825bf)
2017-06-29 16:35:02 +03:00
Avi Kivity
ea59e1fbd6 Update ami submodule
* dist/ami/files/scylla-ami f10db69...5dfe42f (1):
  > don't fetch perf from amazon repo

(cherry picked from commit 1317c4a03e)
2017-06-29 09:39:29 +03:00
Tomasz Grabiec
089b58ddfe row_cache: Use continuity information to decide whether to populate
If cache is missing given key, but the range is marked as continuous,
it means sstables don't have that entry and we can insert it without
asking the presence checker (bloom filter based). The latter is more
expensive and gives false positives. So this improves update
performance and hit ratio.

Another positive effect is that we don't have to clear continuity now.

Fixes #1999.

Message-Id: <1498643043-21117-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 786e75dbf7)
2017-06-28 13:33:34 +03:00
Tomasz Grabiec
d76e9e4026 lsa: Fix performance regression in eviction and compact_on_idle
Region comparator, used by the two, calls region_impl::min_occupancy(),
which calls log_histogram::largest(). The latter is O(N) in terms of
the number of segments, and is supposed to be used only in tests.
We should call one_of_largest() instead, which is O(1).

This caused compact_on_idle() to take more CPU as the number of
segments grew (even when there was nothing to compact). Eviction
would see the same kind of slow down as well.

Introduced in 11b5076b3c.

Message-Id: <1498641973-20054-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 3489c68a68)
2017-06-28 12:33:11 +03:00
Glauber Costa
7709b885c4 disable defragment-memory-on-idle-by-default
It's been linked with various performance issues, either by causing
them or making them worse. One example is #1634; recently I also
investigated a continuous performance degradation that was
linked to defrag-on-idle activity.

Until we can figure out how to reduce its impact, we should disable it.

Signed-off-by: Glauber Costa <glauber@glauber.scylladb>
Message-Id: <20170627201109.10775-1-glauber@scylladb.com>
(cherry picked from commit f3742d1e38)
2017-06-28 00:21:35 +03:00
Avi Kivity
3de701dbe1 Merge "Fix compilation issues in older environments" from Tomasz
* 'tgrabiec/fix-compilation-issues' of github.com:cloudius-systems/seastar-dev:
  tests: streamed_mutation_test: Avoid using boost::size() on row ranges
  tests: row_cache: Remove unused method

(cherry picked from commit ff7be8241f)
2017-06-27 16:31:42 +03:00
Shlomi Livne
9912b7d1eb release: prepare for 2.0-rc0 2017-06-27 12:37:59 +03:00
104 changed files with 2028 additions and 876 deletions

2
.gitmodules vendored
View File

@@ -1,6 +1,6 @@
[submodule "seastar"]
path = seastar
url = ../seastar
url = ../scylla-seastar
ignore = dirty
[submodule "swagger-ui"]
path = swagger-ui

View File

@@ -1,6 +1,6 @@
#!/bin/sh
VERSION=666.development
VERSION=2.0.rc2
if test -f version
then

View File

@@ -252,13 +252,13 @@ void set_cache_service(http_context& ctx, routes& r) {
// In origin row size is the weighted size.
// We currently do not support weights, so we use num entries instead
return map_reduce_cf(ctx, 0, [](const column_family& cf) {
return cf.get_row_cache().num_entries();
return cf.get_row_cache().partitions();
}, std::plus<uint64_t>());
});
cs::get_row_entries.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, 0, [](const column_family& cf) {
return cf.get_row_cache().num_entries();
return cf.get_row_cache().partitions();
}, std::plus<uint64_t>());
});

View File

@@ -130,7 +130,7 @@ public:
}) {}
future<> stop() {
return make_ready_future<>();
return _cache.stop();
}
future<permission_set> get(::shared_ptr<authenticated_user> user, data_resource resource) {

View File

@@ -110,7 +110,7 @@ class cache_streamed_mutation final : public streamed_mutation::impl {
// Emits all delayed range tombstones.
void drain_tombstones();
void add_to_buffer(const partition_snapshot_row_cursor&);
void add_to_buffer(clustering_row&&);
void add_clustering_row_to_buffer(mutation_fragment&&);
void add_to_buffer(range_tombstone&&);
void add_to_buffer(mutation_fragment&&);
future<> read_from_underlying();
@@ -154,12 +154,14 @@ public:
inline
future<> cache_streamed_mutation::process_static_row() {
if (_snp->version()->partition().static_row_continuous()) {
_read_context->cache().on_row_hit();
row sr = _snp->static_row();
if (!sr.empty()) {
push_mutation_fragment(mutation_fragment(static_row(std::move(sr))));
}
return make_ready_future<>();
} else {
_read_context->cache().on_row_miss();
return _read_context->get_next_fragment().then([this] (mutation_fragment_opt&& sr) {
if (sr) {
assert(sr->is_static_row());
@@ -175,13 +177,22 @@ inline
future<> cache_streamed_mutation::fill_buffer() {
if (!_static_row_done) {
_static_row_done = true;
return process_static_row().then([this] {
auto after_static_row = [this] {
if (_ck_ranges_curr == _ck_ranges_end) {
_end_of_stream = true;
return make_ready_future<>();
}
return _lsa_manager.run_in_read_section([this] {
return move_to_current_range();
}).then([this] {
return fill_buffer();
});
});
};
if (_schema->has_static_columns()) {
return process_static_row().then(std::move(after_static_row));
} else {
return after_static_row();
}
}
return do_until([this] { return _end_of_stream || is_buffer_full(); }, [this] {
return do_fill_buffer();
@@ -210,33 +221,33 @@ future<> cache_streamed_mutation::do_fill_buffer() {
inline
future<> cache_streamed_mutation::read_from_underlying() {
return do_until([this] { return !_reading_underlying || is_buffer_full(); }, [this] {
return _read_context->get_next_fragment().then([this] (auto&& mfopt) {
if (!mfopt) {
_reading_underlying = false;
return _lsa_manager.run_in_update_section([this] {
auto same_pos = _next_row.maybe_refresh();
assert(same_pos); // FIXME: handle eviction
if (_next_row_in_range) {
return consume_mutation_fragments_until(_read_context->get_streamed_mutation(),
[this] { return !_reading_underlying || is_buffer_full(); },
[this] (mutation_fragment mf) {
_read_context->cache().on_row_miss();
maybe_add_to_cache(mf);
add_to_buffer(std::move(mf));
},
[this] {
_reading_underlying = false;
return _lsa_manager.run_in_update_section([this] {
auto same_pos = _next_row.maybe_refresh();
assert(same_pos); // FIXME: handle eviction
if (_next_row_in_range) {
maybe_update_continuity();
add_to_buffer(_next_row);
return move_to_next_entry();
} else {
if (no_clustering_row_between(*_schema, _upper_bound, _next_row.position())) {
this->maybe_update_continuity();
this->add_to_buffer(_next_row);
return this->move_to_next_entry();
} else {
if (no_clustering_row_between(*_schema, _upper_bound, _next_row.position())) {
this->maybe_update_continuity();
} else {
// FIXME: Insert dummy entry at _upper_bound.
}
return this->move_to_next_range();
// FIXME: Insert dummy entry at _upper_bound.
_read_context->cache().on_mispopulate();
}
});
} else {
this->maybe_add_to_cache(*mfopt);
this->add_to_buffer(std::move(*mfopt));
return make_ready_future<>();
}
return move_to_next_range();
}
});
});
});
}
inline
@@ -249,6 +260,8 @@ void cache_streamed_mutation::maybe_update_continuity() {
} else if (!_ck_ranges_curr->start()) {
_next_row.set_continuous(true);
}
} else {
_read_context->cache().on_mispopulate();
}
}
@@ -266,6 +279,7 @@ void cache_streamed_mutation::maybe_add_to_cache(const mutation_fragment& mf) {
inline
void cache_streamed_mutation::maybe_add_to_cache(const clustering_row& cr) {
if (!can_populate()) {
_read_context->cache().on_mispopulate();
return;
}
_lsa_manager.run_in_update_section_with_allocator([this, &cr] {
@@ -285,6 +299,7 @@ void cache_streamed_mutation::maybe_add_to_cache(const clustering_row& cr) {
? _next_row.get_iterator_in_latest_version() : mp.clustered_rows().lower_bound(cr.key(), less);
auto insert_result = mp.clustered_rows().insert_check(it, *new_entry, less);
if (insert_result.second) {
_read_context->cache().on_row_insert();
new_entry.release();
}
it = insert_result.first;
@@ -294,11 +309,12 @@ void cache_streamed_mutation::maybe_add_to_cache(const clustering_row& cr) {
if (it == mp.clustered_rows().begin()) {
// FIXME: check whether entry for _last_row_key is in older versions and if so set
// continuity to true.
_read_context->cache().on_mispopulate();
} else {
auto prev_it = it;
--prev_it;
clustering_key_prefix::tri_compare tri_comp(*_schema);
if (tri_comp(*_last_row_key, prev_it->key()) == 0) {
clustering_key_prefix::equality eq(*_schema);
if (eq(*_last_row_key, prev_it->key())) {
e.set_continuous(true);
}
}
@@ -306,6 +322,7 @@ void cache_streamed_mutation::maybe_add_to_cache(const clustering_row& cr) {
e.set_continuous(true);
} else {
// FIXME: Insert dummy entry at _ck_ranges_curr->start()
_read_context->cache().on_mispopulate();
}
});
}
@@ -405,7 +422,7 @@ void cache_streamed_mutation::drain_tombstones() {
inline
void cache_streamed_mutation::add_to_buffer(mutation_fragment&& mf) {
if (mf.is_clustering_row()) {
add_to_buffer(std::move(std::move(mf).as_clustering_row()));
add_clustering_row_to_buffer(std::move(mf));
} else {
assert(mf.is_range_tombstone());
add_to_buffer(std::move(mf).as_range_tombstone());
@@ -415,16 +432,18 @@ void cache_streamed_mutation::add_to_buffer(mutation_fragment&& mf) {
inline
void cache_streamed_mutation::add_to_buffer(const partition_snapshot_row_cursor& row) {
if (!row.dummy()) {
add_to_buffer(row.row());
_read_context->cache().on_row_hit();
add_clustering_row_to_buffer(row.row());
}
}
inline
void cache_streamed_mutation::add_to_buffer(clustering_row&& row) {
void cache_streamed_mutation::add_clustering_row_to_buffer(mutation_fragment&& mf) {
auto& row = mf.as_clustering_row();
drain_tombstones(row.position());
_last_row_key = row.key();
_lower_bound = position_in_partition::after_key(row.key());
push_mutation_fragment(std::move(row));
push_mutation_fragment(std::move(mf));
}
inline
@@ -444,15 +463,20 @@ void cache_streamed_mutation::maybe_add_to_cache(const range_tombstone& rt) {
_lsa_manager.run_in_update_section_with_allocator([&] {
_snp->version()->partition().apply_row_tombstone(*_schema, rt);
});
} else {
_read_context->cache().on_mispopulate();
}
}
inline
void cache_streamed_mutation::maybe_add_to_cache(const static_row& sr) {
if (can_populate()) {
_read_context->cache().on_row_insert();
_lsa_manager.run_in_update_section_with_allocator([&] {
_snp->version()->partition().static_row().apply(*_schema, column_kind::static_column, sr.cells());
});
} else {
_read_context->cache().on_mispopulate();
}
}
@@ -460,6 +484,8 @@ inline
void cache_streamed_mutation::maybe_set_static_row_continuous() {
if (can_populate()) {
_snp->version()->partition().set_static_row_continuous(true);
} else {
_read_context->cache().on_mispopulate();
}
}

View File

@@ -499,14 +499,15 @@ public:
, _is_compound(true)
{ }
std::vector<bytes> explode() const {
std::vector<bytes_view> explode() const {
if (!_is_compound) {
return { to_bytes(_bytes) };
return { _bytes };
}
std::vector<bytes> ret;
std::vector<bytes_view> ret;
ret.reserve(8);
for (auto it = begin(), e = end(); it != e; ) {
ret.push_back(to_bytes(it->first));
ret.push_back(it->first);
auto marker = it->second;
++it;
if (it != e && marker != composite::eoc::none) {

89
cpu_controller.hh Normal file
View File

@@ -0,0 +1,89 @@
/*
* Copyright (C) 2017 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <seastar/core/thread.hh>
#include <seastar/core/timer.hh>
#include <chrono>
// Simple proportional controller to adjust shares of memtable/streaming flushes.
//
// Goal is to flush as fast as we can, but not so fast that we steal all the CPU from incoming
// requests, and at the same time minimize user-visible fluctuations in the flush quota.
//
// What that translates to is we'll try to keep virtual dirty's first derivative at 0 (IOW, we keep
// virtual dirty constant), which means that the rate of incoming writes is equal to the rate of
// flushed bytes.
//
// The exact point at which the controller stops determines the desired flush CPU usage. As we
// approach the hard dirty limit, we need to be more aggressive. We will therefore define two
// thresholds, and increase the constant as we cross them.
//
// 1) the soft limit line
// 2) halfway between soft limit and dirty limit
//
// The constants q1 and q2 are used to determine the proportional factor at each stage.
//
// Below the soft limit, we are in no particular hurry to flush, since it means we're set to
// complete flushing before a new memtable is ready. The quota is dirty * q1, and q1 is set to a
// low number.
//
// The first half of the virtual dirty region is where we expect to be usually, so we have a low
// slope corresponding to a sluggish response between q1 * soft_limit and q2.
//
// In the second half, we're getting close to the hard dirty limit so we increase the slope and
// become more responsive, up to a maximum quota of qmax.
//
// For now we'll just set them in the structure not to complicate the constructor. But q1, q2 and
// qmax can easily become parameters if we find another user.
class flush_cpu_controller {
static constexpr float hard_dirty_limit = 0.50;
static constexpr float q1 = 0.01;
static constexpr float q2 = 0.2;
static constexpr float qmax = 1;
float _current_quota = 0.0f;
float _goal;
std::function<float()> _current_dirty;
std::chrono::milliseconds _interval;
timer<> _update_timer;
seastar::thread_scheduling_group _scheduling_group;
seastar::thread_scheduling_group *_current_scheduling_group = nullptr;
void adjust();
public:
seastar::thread_scheduling_group* scheduling_group() {
return _current_scheduling_group;
}
float current_quota() const {
return _current_quota;
}
struct disabled {
seastar::thread_scheduling_group *backup;
};
flush_cpu_controller(disabled d) : _scheduling_group(std::chrono::nanoseconds(0), 0), _current_scheduling_group(d.backup) {}
flush_cpu_controller(std::chrono::milliseconds interval, float soft_limit, std::function<float()> current_dirty);
flush_cpu_controller(flush_cpu_controller&&) = default;
};
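One plausible piecewise-linear reading of the header comment above can be written down as a pure function. Only the constants come from cpu_controller.hh; the interpolation itself is our guess, not the shipped formula:

```cpp
#include <algorithm>
#include <cassert>

// Gentle slope below the soft limit, a steeper segment between the soft
// limit and the halfway point, then a ramp to qmax near the hard dirty
// limit. Constants are from cpu_controller.hh; the formula is illustrative.
float flush_quota(float dirty, float soft_limit) {
    constexpr float hard_dirty_limit = 0.50f;
    constexpr float q1 = 0.01f;
    constexpr float q2 = 0.2f;
    constexpr float qmax = 1.0f;
    float halfway = (soft_limit + hard_dirty_limit) / 2;
    if (dirty <= soft_limit) {
        return dirty * q1;                       // sluggish: no hurry to flush
    }
    if (dirty <= halfway) {
        float t = (dirty - soft_limit) / (halfway - soft_limit);
        return q1 * soft_limit + t * (q2 - q1 * soft_limit);
    }
    float t = (dirty - halfway) / (hard_dirty_limit - halfway);
    return std::min(qmax, q2 + t * (qmax - q2)); // aggressive near the limit
}
```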

View File

@@ -1550,6 +1550,8 @@ basic_unreserved_keyword returns [sstring str]
| K_DISTINCT
| K_CONTAINS
| K_STATIC
| K_FROZEN
| K_TUPLE
| K_FUNCTION
| K_AGGREGATE
| K_SFUNC

View File

@@ -75,6 +75,10 @@ functions::init() {
declare(aggregate_fcts::make_max_function<double>());
declare(aggregate_fcts::make_min_function<double>());
declare(aggregate_fcts::make_count_function<sstring>());
declare(aggregate_fcts::make_max_function<sstring>());
declare(aggregate_fcts::make_min_function<sstring>());
//FIXME:
//declare(aggregate_fcts::make_count_function<bytes>());
//declare(aggregate_fcts::make_max_function<bytes>());

View File

@@ -120,7 +120,7 @@ public:
if (restriction->is_slice()) {
throw exceptions::invalid_request_exception(sprint(
"PRIMARY KEY column \"%s\" cannot be restricted (preceding column \"%s\" is restricted by a non-EQ relation)",
_restrictions->next_column(new_column)->name_as_text(), new_column.name_as_text()));
last_column.name_as_text(), new_column.name_as_text()));
}
}

View File

@@ -63,7 +63,7 @@ void cql3::statements::alter_keyspace_statement::validate(distributed<service::s
service::get_local_storage_proxy().get_db().local().find_keyspace(_name); // throws on failure
auto tmp = _name;
std::transform(tmp.begin(), tmp.end(), tmp.begin(), ::tolower);
if (tmp == db::system_keyspace::NAME) {
if (is_system_keyspace(tmp)) {
throw exceptions::invalid_request_exception("Cannot alter system keyspace");
}

View File

@@ -41,6 +41,8 @@
#include "cql3/statements/cf_prop_defs.hh"
#include <boost/algorithm/string/predicate.hpp>
namespace cql3 {
namespace statements {
@@ -65,6 +67,8 @@ const sstring cf_prop_defs::KW_CRC_CHECK_CHANCE = "crc_check_chance";
const sstring cf_prop_defs::COMPACTION_STRATEGY_CLASS_KEY = "class";
const sstring cf_prop_defs::COMPACTION_ENABLED_KEY = "enabled";
void cf_prop_defs::validate() {
// Skip validation if the compaction strategy class is already set as it means we've already
// prepared (and redoing it would set strategyClass back to null, which we don't want)
@@ -188,6 +192,13 @@ void cf_prop_defs::apply_to_builder(schema_builder& builder) {
builder.set_min_compaction_threshold(min_compaction_threshold);
builder.set_max_compaction_threshold(max_compaction_threshold);
if (has_property(KW_COMPACTION)) {
if (get_compaction_options().count(COMPACTION_ENABLED_KEY)) {
auto enabled = boost::algorithm::iequals(get_compaction_options().at(COMPACTION_ENABLED_KEY), "true");
builder.set_compaction_enabled(enabled);
}
}
builder.set_default_time_to_live(gc_clock::duration(get_int(KW_DEFAULT_TIME_TO_LIVE, DEFAULT_DEFAULT_TIME_TO_LIVE)));
if (has_property(KW_SPECULATIVE_RETRY)) {

View File

@@ -73,6 +73,7 @@ public:
static const sstring KW_CRC_CHECK_CHANCE;
static const sstring COMPACTION_STRATEGY_CLASS_KEY;
static const sstring COMPACTION_ENABLED_KEY;
// FIXME: In origin the following consts are in CFMetaData.
static constexpr int32_t DEFAULT_DEFAULT_TIME_TO_LIVE = 0;

View File

@@ -72,7 +72,7 @@ void create_keyspace_statement::validate(distributed<service::storage_proxy>&, c
std::string name;
name.resize(_name.length());
std::transform(_name.begin(), _name.end(), name.begin(), ::tolower);
if (name == db::system_keyspace::NAME) {
if (is_system_keyspace(name)) {
throw exceptions::invalid_request_exception("system keyspace is not user-modifiable");
}
// keyspace name

View File

@@ -65,13 +65,13 @@
#include <core/fstream.hh>
#include <seastar/core/enum.hh>
#include "utils/latency.hh"
#include "utils/flush_queue.hh"
#include "schema_registry.hh"
#include "service/priority_manager.hh"
#include "cell_locking.hh"
#include <seastar/core/execution_stage.hh>
#include "view_info.hh"
#include "memtable-sstable.hh"
#include "db/schema_tables.hh"
#include "checked-file-impl.hh"
#include "disk-error-handler.hh"
@@ -84,28 +84,10 @@ static const std::unordered_set<sstring> system_keyspaces = {
db::system_keyspace::NAME, db::schema_tables::NAME
};
static bool is_system_keyspace(const sstring& name) {
bool is_system_keyspace(const sstring& name) {
return system_keyspaces.find(name) != system_keyspaces.end();
}
// Slight extension to the flush_queue type.
class column_family::memtable_flush_queue : public utils::flush_queue<db::replay_position> {
public:
template<typename Func, typename Post>
auto run_cf_flush(db::replay_position rp, Func&& func, Post&& post) {
// special case: empty rp, yet still data.
// We generate a few memtables with no valid "high_rp", yet
// still containing data -> actual flush.
// And to make matters worse, we can initiate a flush of N such
// tables at the same time.
// Just queue them at the end of the queue and treat them as such.
if (rp == db::replay_position() && !empty()) {
rp = highest_key();
}
return run_with_ordered_post_op(rp, std::forward<Func>(func), std::forward<Post>(post));
}
};
// Used for tests where the CF exists without a database object. We need to pass a valid
// dirty_memory manager in that case.
thread_local dirty_memory_manager default_dirty_memory_manager;
@@ -147,7 +129,6 @@ column_family::column_family(schema_ptr schema, config config, db::commitlog* cl
, _cache(_schema, sstables_as_snapshot_source(), global_cache_tracker())
, _commitlog(cl)
, _compaction_manager(compaction_manager)
, _flush_queue(std::make_unique<memtable_flush_queue>())
, _counter_cell_locks(std::make_unique<cell_locker>(_schema, cl_stats))
{
if (!_config.enable_disk_writes) {
@@ -190,7 +171,6 @@ column_family::sstables_as_mutation_source() {
snapshot_source
column_family::sstables_as_snapshot_source() {
return snapshot_source([this] () {
// FIXME: Will keep sstables on disk until next memtable flush. Make compaction force cache refresh.
auto sst_set = _sstables;
return mutation_source([this, sst_set = std::move(sst_set)] (schema_ptr s,
const dht::partition_range& r,
@@ -890,17 +870,19 @@ column_family::seal_active_streaming_memtable_immediate() {
//
// Lastly, we don't have any commitlog RP to update, and we don't need to manipulate the
// memtable list, since this memtable was not available for reading up until this point.
return write_memtable_to_sstable(*old, newtab, incremental_backups_enabled(), priority).then([this, newtab, old] {
return write_memtable_to_sstable(*old, newtab, incremental_backups_enabled(), priority, false, _config.background_writer_scheduling_group).then([this, newtab, old] {
return newtab->open_data();
}).then([this, old, newtab] () {
add_sstable(newtab, {engine().cpu_id()});
trigger_compaction();
// Cache synchronization must be started atomically with add_sstable()
if (_config.enable_cache) {
return _cache.update_invalidating(*old);
} else {
return old->clear_gently();
}
return with_semaphore(_cache_update_sem, 1, [this, newtab, old] {
add_sstable(newtab, {engine().cpu_id()});
trigger_compaction();
// Cache synchronization must be started atomically with add_sstable()
if (_config.enable_cache) {
return _cache.update_invalidating(*old);
} else {
return old->clear_gently();
}
});
}).handle_exception([old] (auto ep) {
dblog.error("failed to write streamed sstable: {}", ep);
return make_exception_future<>(ep);
@@ -937,7 +919,7 @@ future<> column_family::seal_active_streaming_memtable_big(streaming_memtable_bi
newtab->set_unshared();
auto&& priority = service::get_local_streaming_write_priority();
return write_memtable_to_sstable(*old, newtab, incremental_backups_enabled(), priority, true).then([this, newtab, old, &smb] {
return write_memtable_to_sstable(*old, newtab, incremental_backups_enabled(), priority, true, _config.background_writer_scheduling_group).then([this, newtab, old, &smb] {
smb.sstables.emplace_back(newtab);
}).handle_exception([] (auto ep) {
dblog.error("failed to write streamed sstable: {}", ep);
@@ -955,34 +937,32 @@ column_family::seal_active_memtable(memtable_list::flush_behavior ignored) {
if (old->empty()) {
dblog.debug("Memtable is empty");
return make_ready_future<>();
return _flush_barrier.advance_and_await();
}
_memtables->add_memtable();
_stats.memtable_switch_count++;
auto previous_flush = _flush_barrier.advance_and_await();
auto op = _flush_barrier.start();
assert(_highest_flushed_rp < old->replay_position()
|| (_highest_flushed_rp == db::replay_position() && old->replay_position() == db::replay_position())
);
_highest_flushed_rp = old->replay_position();
auto memtable_size = old->occupancy().total_space();
return _flush_queue->run_cf_flush(old->replay_position(), [old, this] {
auto memtable_size = old->occupancy().total_space();
_stats.pending_flushes++;
_config.cf_stats->pending_memtables_flushes_count++;
_config.cf_stats->pending_memtables_flushes_bytes += memtable_size;
_config.cf_stats->pending_memtables_flushes_count++;
_config.cf_stats->pending_memtables_flushes_bytes += memtable_size;
return repeat([this, old] {
return repeat([this, old] {
return with_lock(_sstables_lock.for_read(), [this, old] {
_flush_queue->check_open_gate();
return try_flush_memtable_to_sstable(old);
});
}).then([this, memtable_size] {
}).then([this, memtable_size, old, op = std::move(op), previous_flush = std::move(previous_flush)] () mutable {
_stats.pending_flushes--;
_config.cf_stats->pending_memtables_flushes_count--;
_config.cf_stats->pending_memtables_flushes_bytes -= memtable_size;
});
}, [old, this] {
if (_commitlog) {
_commitlog->discard_completed_segments(_schema->id(), old->rp_set());
}
return previous_flush.finally([op = std::move(op)] { });
});
// FIXME: release commit log
// FIXME: provide back-pressure to upper layers
@@ -1011,7 +991,7 @@ column_family::try_flush_memtable_to_sstable(lw_shared_ptr<memtable> old) {
// The code as is guarantees that we'll never partially backup a
// single sstable, so that is enough of a guarantee.
auto&& priority = service::get_local_memtable_flush_priority();
return write_memtable_to_sstable(*old, newtab, incremental_backups_enabled(), priority).then([this, newtab, old] {
return write_memtable_to_sstable(*old, newtab, incremental_backups_enabled(), priority, false, _config.memtable_scheduling_group).then([this, newtab, old] {
return newtab->open_data();
}).then_wrapped([this, old, newtab] (future<> ret) {
dblog.debug("Flushing to {} done", newtab->get_filename());
@@ -1067,9 +1047,7 @@ column_family::stop() {
return when_all(_memtables->request_flush(), _streaming_memtables->request_flush()).discard_result().finally([this] {
return _compaction_manager.remove(this).then([this] {
// Nest, instead of using when_all, so we don't lose any exceptions.
return _flush_queue->close().then([this] {
return _streaming_flush_gate.close();
});
return _streaming_flush_gate.close();
}).then([this] {
return _sstable_deletion_gate.close();
});
@@ -1209,17 +1187,17 @@ void column_family::set_metrics() {
auto ks = keyspace_label(_schema->ks_name());
namespace ms = seastar::metrics;
_metrics.add_group("column_family", {
ms::make_histogram("read_latency", ms::description("Read latency histogram"), [this] {return _stats.estimated_read.get_histogram();})(cf)(ks),
ms::make_histogram("write_latency", ms::description("Write latency histogram"), [this] {return _stats.estimated_write.get_histogram();})(cf)(ks),
ms::make_derive("memtable_switch", ms::description("Number of times flush has resulted in the memtable being switched out"), _stats.memtable_switch_count)(cf)(ks),
ms::make_gauge("pending_taks", ms::description("Estimated number of tasks pending for this column family"), _stats.pending_flushes)(cf)(ks),
ms::make_gauge("pending_tasks", ms::description("Estimated number of tasks pending for this column family"), _stats.pending_flushes)(cf)(ks),
ms::make_gauge("live_disk_space", ms::description("Live disk space used"), _stats.live_disk_space_used)(cf)(ks),
ms::make_gauge("total_disk_space", ms::description("Total disk space used"), _stats.total_disk_space_used)(cf)(ks),
ms::make_gauge("live_sstable", ms::description("Live sstable count"), _stats.live_sstable_count)(cf)(ks),
ms::make_gauge("pending_compaction", ms::description("Estimated number of compactions pending for this column family"), _stats.pending_compactions)(cf)(ks)
});
if (_schema->ks_name() != db::system_keyspace::NAME) {
if (_schema->ks_name() != db::system_keyspace::NAME && _schema->ks_name() != db::schema_tables::v3::NAME && _schema->ks_name() != "system_traces") {
_metrics.add_group("column_family", {
ms::make_histogram("read_latency", ms::description("Read latency histogram"), [this] {return _stats.estimated_read.get_histogram(std::chrono::microseconds(100));})(cf)(ks),
ms::make_histogram("write_latency", ms::description("Write latency histogram"), [this] {return _stats.estimated_write.get_histogram(std::chrono::microseconds(100));})(cf)(ks),
ms::make_gauge("cache_hit_rate", ms::description("Cache hit rate"), [this] {return float(_global_cache_hit_rate);})(cf)(ks)
});
}
@@ -1311,6 +1289,10 @@ column_family::rebuild_sstable_list(const std::vector<sstables::shared_sstable>&
} catch (sstables::atomic_deletion_cancelled& adc) {
dblog.debug("Failed to delete sstables after compaction: {}", adc);
}
}).then([this] {
// refresh underlying data source in row cache to prevent it from holding reference
// to sstables files which were previously deleted.
_cache.refresh_snapshot();
});
});
}
@@ -1366,7 +1348,7 @@ column_family::compact_sstables(sstables::compaction_descriptor descriptor, bool
return sst;
};
return sstables::compact_sstables(*sstables_to_compact, *this, create_sstable, descriptor.max_sstable_bytes, descriptor.level,
cleanup).then([this, sstables_to_compact] (auto new_sstables) {
cleanup, _config.background_writer_scheduling_group).then([this, sstables_to_compact] (auto new_sstables) {
_compaction_strategy.notify_completion(*sstables_to_compact, new_sstables);
return this->rebuild_sstable_list(new_sstables, *sstables_to_compact);
});
@@ -1374,7 +1356,7 @@ column_family::compact_sstables(sstables::compaction_descriptor descriptor, bool
}
static bool needs_cleanup(const lw_shared_ptr<sstables::sstable>& sst,
const lw_shared_ptr<dht::token_range_vector>& owned_ranges,
const dht::token_range_vector& owned_ranges,
schema_ptr s) {
auto first = sst->get_first_partition_key();
auto last = sst->get_last_partition_key();
@@ -1383,7 +1365,7 @@ static bool needs_cleanup(const lw_shared_ptr<sstables::sstable>& sst,
dht::token_range sst_token_range = dht::token_range::make(first_token, last_token);
// return true iff sst partition range isn't fully contained in any of the owned ranges.
for (auto& r : *owned_ranges) {
for (auto& r : owned_ranges) {
if (r.contains(sst_token_range, dht::token_comparator())) {
return false;
}
@@ -1393,11 +1375,10 @@ static bool needs_cleanup(const lw_shared_ptr<sstables::sstable>& sst,
future<> column_family::cleanup_sstables(sstables::compaction_descriptor descriptor) {
dht::token_range_vector r = service::get_local_storage_service().get_local_ranges(_schema->ks_name());
auto owned_ranges = make_lw_shared<dht::token_range_vector>(std::move(r));
auto sstables_to_cleanup = make_lw_shared<std::vector<sstables::shared_sstable>>(std::move(descriptor.sstables));
return do_for_each(*sstables_to_cleanup, [this, owned_ranges = std::move(owned_ranges), sstables_to_cleanup] (auto& sst) {
if (!owned_ranges->empty() && !needs_cleanup(sst, owned_ranges, _schema)) {
return do_with(std::move(descriptor.sstables), std::move(r), [this] (auto& sstables, auto& owned_ranges) {
return do_for_each(sstables, [this, &owned_ranges] (auto& sst) {
if (!owned_ranges.empty() && !needs_cleanup(sst, owned_ranges, _schema)) {
return make_ready_future<>();
}
@@ -1411,6 +1392,7 @@ future<> column_family::cleanup_sstables(sstables::compaction_descriptor descrip
return this->compact_sstables(sstables::compaction_descriptor({ sst }, sst->get_sstable_level()), true);
});
});
});
}
// FIXME: this is just an example, should be changed to something more general
@@ -1733,7 +1715,7 @@ void distributed_loader::reshard(distributed<database>& db, sstring ks_name, sst
gc_clock::now(), default_io_error_handler_gen());
return sst;
};
auto f = sstables::reshard_sstables(sstables, *cf, creator, max_sstable_bytes, level);
auto f = sstables::reshard_sstables(sstables, *cf, creator, max_sstable_bytes, level, cf->background_writer_scheduling_group());
return f.then([&cf, sstables = std::move(sstables)] (std::vector<sstables::shared_sstable> new_sstables) mutable {
// an input sstable may belong to shard 1 and 2 and only have data which
@@ -1805,15 +1787,17 @@ future<> distributed_loader::load_new_sstables(distributed<database>& db, sstrin
}).then([&db, ks, cf] {
return db.invoke_on_all([ks = std::move(ks), cfname = std::move(cf)] (database& db) {
auto& cf = db.find_column_family(ks, cfname);
// atomically load all opened sstables into column family.
for (auto& sst : cf._sstables_opened_but_not_loaded) {
cf.load_sstable(sst, true);
}
cf._sstables_opened_but_not_loaded.clear();
cf.trigger_compaction();
// Drop entire cache for this column family because it may be populated
// with stale data.
return cf.get_row_cache().invalidate();
return with_semaphore(cf._cache_update_sem, 1, [&cf] {
// atomically load all opened sstables into column family.
for (auto& sst : cf._sstables_opened_but_not_loaded) {
cf.load_sstable(sst, true);
}
cf._sstables_opened_but_not_loaded.clear();
cf.trigger_compaction();
// Drop entire cache for this column family because it may be populated
// with stale data.
return cf.get_row_cache().invalidate();
});
});
}).then([&db, ks, cf] () mutable {
return smp::submit_to(0, [&db, ks = std::move(ks), cf = std::move(cf)] () mutable {
@@ -1989,6 +1973,15 @@ future<> distributed_loader::populate_column_family(distributed<database>& db, s
}
inline
flush_cpu_controller
make_flush_cpu_controller(db::config& cfg, seastar::thread_scheduling_group* backup, std::function<double()> fn) {
if (cfg.auto_adjust_flush_quota()) {
return flush_cpu_controller(250ms, cfg.virtual_dirty_soft_limit(), std::move(fn));
}
return flush_cpu_controller(flush_cpu_controller::disabled{backup});
}
utils::UUID database::empty_version = utils::UUID_gen::get_name_UUID(bytes{});
database::database() : database(db::config())
@@ -2002,6 +1995,10 @@ database::database(const db::config& cfg)
, _system_dirty_memory_manager(*this, 10 << 20, cfg.virtual_dirty_soft_limit())
, _dirty_memory_manager(*this, memory::stats().total_memory() * 0.45, cfg.virtual_dirty_soft_limit())
, _streaming_dirty_memory_manager(*this, memory::stats().total_memory() * 0.10, cfg.virtual_dirty_soft_limit())
, _background_writer_scheduling_group(1ms, _cfg->background_writer_scheduling_quota())
, _memtable_cpu_controller(make_flush_cpu_controller(*_cfg, &_background_writer_scheduling_group, [this, limit = 2.0f * _dirty_memory_manager.throttle_threshold()] {
return (_dirty_memory_manager.virtual_dirty_memory()) / limit;
}))
, _version(empty_version)
, _enable_incremental_backups(cfg.incremental_backups())
{
@@ -2011,6 +2008,32 @@ database::database(const db::config& cfg)
dblog.info("Row: max_vector_size: {}, internal_count: {}", size_t(row::max_vector_size), size_t(row::internal_count));
}
void flush_cpu_controller::adjust() {
auto mid = _goal + (hard_dirty_limit - _goal) / 2;
auto dirty = _current_dirty();
if (dirty < _goal) {
_current_quota = dirty * q1 / _goal;
} else if ((dirty >= _goal) && (dirty < mid)) {
_current_quota = q1 + (dirty - _goal) * (q2 - q1)/(mid - _goal);
} else {
_current_quota = q2 + (dirty - mid) * (qmax - q2) / (hard_dirty_limit - mid);
}
dblog.trace("dirty {}, goal {}, mid {} quota {}", dirty, _goal, mid, _current_quota);
_scheduling_group.update_usage(_current_quota);
}
flush_cpu_controller::flush_cpu_controller(std::chrono::milliseconds interval, float soft_limit, std::function<float()> current_dirty)
: _goal(soft_limit / 2)
, _current_dirty(std::move(current_dirty))
, _interval(interval)
, _update_timer([this] { adjust(); })
, _scheduling_group(1ms, 0.0f)
, _current_scheduling_group(&_scheduling_group)
{
_update_timer.arm_periodic(_interval);
}
void
dirty_memory_manager::setup_collectd(sstring namestr) {
@@ -2108,6 +2131,14 @@ database::setup_metrics() {
sm::make_gauge("queued_reads", [this] { return _read_concurrency_sem.waiters(); },
sm::description("Holds the number of currently queued read operations.")),
sm::make_gauge("active_reads_streaming", [this] { return max_streaming_concurrent_reads() - _streaming_concurrency_sem.current(); },
sm::description(seastar::format("Holds the number of currently active read operations issued on behalf of streaming "
"If this value gets close to {} we are likely to start dropping new read requests. "
"In that case sstable_read_queue_overloads is going to get a non-zero value.", max_streaming_concurrent_reads()))),
sm::make_gauge("queued_reads_streaming", [this] { return _streaming_concurrency_sem.waiters(); },
sm::description("Holds the number of currently queued read operations on behalf of streaming.")),
sm::make_gauge("active_reads_system_keyspace", [this] { return max_system_concurrent_reads() - _system_read_concurrency_sem.current(); },
sm::description(seastar::format("Holds the number of currently active read operations from \"system\" keyspace tables. "
"If this value gets close to {} we are likely to start dropping new read requests. "
@@ -2119,6 +2150,9 @@ database::setup_metrics() {
sm::make_gauge("total_result_bytes", [this] { return get_result_memory_limiter().total_used_memory(); },
sm::description("Holds the current amount of memory used for results.")),
sm::make_gauge("cpu_flush_quota", [this] { return _memtable_cpu_controller.current_quota(); },
sm::description("The current quota for memtable CPU scheduling group")),
sm::make_derive("short_data_queries", _stats->short_data_queries,
sm::description("The rate of data queries (data or digest reads) that returned less rows than requested due to result size limiting.")),
@@ -2330,7 +2364,7 @@ database::init_commitlog() {
_commitlog->discard_completed_segments(id);
return;
}
_column_families[id]->flush(pos);
_column_families[id]->flush();
}).release(); // we have longer life time than CL. Ignore reg anchor
});
}
@@ -2444,12 +2478,12 @@ void database::remove(const column_family& cf) {
}
}
future<> database::drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func tsf) {
future<> database::drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func tsf, bool snapshot) {
auto uuid = find_uuid(ks_name, cf_name);
auto cf = _column_families.at(uuid);
remove(*cf);
auto& ks = find_keyspace(ks_name);
return truncate(ks, *cf, std::move(tsf)).then([this, cf] {
return truncate(ks, *cf, std::move(tsf), snapshot).then([this, cf] {
return cf->stop();
}).then([this, cf] {
return make_ready_future<>();
@@ -2589,6 +2623,8 @@ keyspace::make_column_family_config(const schema& s, const db::config& db_config
cfg.streaming_read_concurrency_config = _config.streaming_read_concurrency_config;
cfg.cf_stats = _config.cf_stats;
cfg.enable_incremental_backups = _config.enable_incremental_backups;
cfg.background_writer_scheduling_group = _config.background_writer_scheduling_group;
cfg.memtable_scheduling_group = _config.memtable_scheduling_group;
return cfg;
}
@@ -3035,7 +3071,7 @@ void column_family::apply_streaming_big_mutation(schema_ptr m_schema, utils::UUI
void
column_family::check_valid_rp(const db::replay_position& rp) const {
if (rp != db::replay_position() && rp < _lowest_allowed_rp) {
throw replay_position_reordered_exception();
throw mutation_reordered_with_truncate_exception();
}
}
@@ -3079,10 +3115,6 @@ lw_shared_ptr<memtable> memtable_list::new_memtable() {
}
future<> dirty_memory_manager::flush_one(memtable_list& mtlist, semaphore_units<> permit) {
if (mtlist.back()->empty()) {
return make_ready_future<>();
}
auto* region = &(mtlist.back()->region());
auto schema = mtlist.back()->schema();
@@ -3185,25 +3217,24 @@ future<mutation> database::apply_counter_update(schema_ptr s, const frozen_mutat
}
}
static future<> maybe_handle_reorder(std::exception_ptr exp) {
try {
std::rethrow_exception(exp);
return make_exception_future(exp);
} catch (mutation_reordered_with_truncate_exception&) {
// This mutation raced with a truncate, so we can just drop it.
dblog.debug("replay_position reordering detected");
return make_ready_future<>();
}
}
future<> database::apply_with_commitlog(column_family& cf, const mutation& m, timeout_clock::time_point timeout) {
if (cf.commitlog() != nullptr) {
return do_with(freeze(m), [this, &m, &cf, timeout] (frozen_mutation& fm) {
commitlog_entry_writer cew(m.schema(), fm);
return cf.commitlog()->add_entry(m.schema()->id(), cew, timeout);
}).then([this, &m, &cf, timeout] (db::rp_handle h) {
return apply_in_memory(m, cf, std::move(h), timeout).handle_exception([this, &cf, &m, timeout] (auto ep) {
try {
std::rethrow_exception(ep);
} catch (replay_position_reordered_exception&) {
// expensive, but we're assuming this is super rare.
// if we failed to apply the mutation due to future re-ordering
// (which should be the ever only reason for rp mismatch in CF)
// let's just try again, add the mutation to the CL once more,
// and assume success in inevitable eventually.
dblog.debug("replay_position reordering detected");
return this->apply_with_commitlog(cf, m, timeout);
}
});
return apply_in_memory(m, cf, std::move(h), timeout).handle_exception(maybe_handle_reorder);
});
}
return apply_in_memory(m, cf, {}, timeout);
@@ -3214,19 +3245,7 @@ future<> database::apply_with_commitlog(schema_ptr s, column_family& cf, utils::
if (cl != nullptr) {
commitlog_entry_writer cew(s, m);
return cf.commitlog()->add_entry(uuid, cew, timeout).then([&m, this, s, timeout, cl](db::rp_handle h) {
return this->apply_in_memory(m, s, std::move(h), timeout).handle_exception([this, s, &m, timeout] (auto ep) {
try {
std::rethrow_exception(ep);
} catch (replay_position_reordered_exception&) {
// expensive, but we're assuming this is super rare.
// if we failed to apply the mutation due to future re-ordering
// (which should be the ever only reason for rp mismatch in CF)
// let's just try again, add the mutation to the CL once more,
// and assume success is inevitable eventually.
dblog.debug("replay_position reordering detected");
return this->apply(s, m, timeout);
}
});
return this->apply_in_memory(m, s, std::move(h), timeout).handle_exception(maybe_handle_reorder);
});
}
return apply_in_memory(m, std::move(s), {}, timeout);
@@ -3317,10 +3336,17 @@ database::make_keyspace_config(const keyspace_metadata& ksm) {
++_stats->sstable_read_queue_overloaded;
throw std::runtime_error("sstable inactive read queue overloaded");
};
cfg.streaming_read_concurrency_config = cfg.read_concurrency_config;
cfg.streaming_read_concurrency_config.timeout = {};
// No timeouts or queue length limits - a failure here can kill an entire repair.
// Trust the caller to limit concurrency.
cfg.streaming_read_concurrency_config.sem = &_streaming_concurrency_sem;
cfg.cf_stats = &_cf_stats;
cfg.enable_incremental_backups = _enable_incremental_backups;
if (_cfg->background_writer_scheduling_quota() < 1.0f) {
cfg.background_writer_scheduling_group = &_background_writer_scheduling_group;
cfg.memtable_scheduling_group = _memtable_cpu_controller.scheduling_group();
}
return cfg;
}
@@ -3444,10 +3470,10 @@ future<> database::truncate(sstring ksname, sstring cfname, timestamp_func tsf)
return truncate(ks, cf, std::move(tsf));
}
future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_func tsf)
future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_func tsf, bool with_snapshot)
{
const auto durable = ks.metadata()->durable_writes();
const auto auto_snapshot = get_config().auto_snapshot();
const auto auto_snapshot = with_snapshot && get_config().auto_snapshot();
// Force mutations coming in to re-acquire higher rps
// This creates a "soft" ordering, in that we will guarantee that
@@ -3774,35 +3800,6 @@ future<std::unordered_map<sstring, column_family::snapshot_details>> column_fami
}
future<> column_family::flush() {
_stats.pending_flushes++;
// highest_flushed_rp is only updated when we flush. If the memtable is currently alive, then
// the most up-to-date replay position is the one that's in there now. Otherwise, if the memtable
// hasn't received any writes yet, that's the one from the last flush we made.
auto desired_rp = _memtables->back()->empty() ? _highest_flushed_rp : _memtables->back()->replay_position();
return _memtables->request_flush().finally([this, desired_rp] {
_stats.pending_flushes--;
// In origin memtable_switch_count is incremented inside
// ColumnFamilyMetrics Flush.run
_stats.memtable_switch_count++;
// wait for all up until us.
return _flush_queue->wait_for_pending(desired_rp);
});
}
future<> column_family::flush(const db::replay_position& pos) {
// Technically possible if we've already issued the
// sstable write, but it is not done yet.
if (pos < _highest_flushed_rp) {
return make_ready_future<>();
}
// TODO: Origin looks at "secondary" memtables
// It also considers "minReplayPosition", which is simply where
// the CL "started" (the first ever RP in this run).
// We ignore this for now and just say that if we're asked for
// a CF and it exists, we pretty much have to have data that needs
// flushing. Let's do it.
return _memtables->request_flush();
}
@@ -3824,12 +3821,14 @@ future<> column_family::flush_streaming_mutations(utils::UUID plan_id, dht::part
return _streaming_memtables->seal_active_memtable(memtable_list::flush_behavior::delayed).then([this] {
return _streaming_flush_phaser.advance_and_await();
}).then([this, sstables = std::move(sstables), ranges = std::move(ranges)] () mutable {
for (auto&& sst : sstables) {
// seal_active_streaming_memtable_big() ensures sst is unshared.
this->add_sstable(sst, {engine().cpu_id()});
}
this->trigger_compaction();
return _cache.invalidate(std::move(ranges));
return with_semaphore(_cache_update_sem, 1, [this, sstables = std::move(sstables), ranges = std::move(ranges)] () mutable {
for (auto&& sst : sstables) {
// seal_active_streaming_memtable_big() ensures sst is unshared.
this->add_sstable(sst, {engine().cpu_id()});
}
this->trigger_compaction();
return _cache.invalidate(std::move(ranges));
});
});
});
});
@@ -4119,11 +4118,12 @@ void column_family::drop_hit_rate(gms::inet_address addr) {
}
future<>
write_memtable_to_sstable(memtable& mt, sstables::shared_sstable sst, bool backup, const io_priority_class& pc, bool leave_unsealed) {
write_memtable_to_sstable(memtable& mt, sstables::shared_sstable sst, bool backup, const io_priority_class& pc, bool leave_unsealed, seastar::thread_scheduling_group *tsg) {
sstables::sstable_writer_config cfg;
cfg.replay_position = mt.replay_position();
cfg.backup = backup;
cfg.leave_unsealed = leave_unsealed;
cfg.thread_scheduling_group = tsg;
return sst->write_components(mt.make_flush_reader(mt.schema(), pc), mt.partition_count(), mt.schema(), cfg, pc);
}

View File

@@ -77,6 +77,8 @@
#include <boost/intrusive/parent_from_member.hpp>
#include "db/view/view.hh"
#include "lister.hh"
#include "utils/phased_barrier.hh"
#include "cpu_controller.hh"
class cell_locker;
class cell_locker_stats;
@@ -114,7 +116,7 @@ void make(database& db, bool durable, bool volatile_testing_only);
}
}
class replay_position_reordered_exception : public std::exception {};
class mutation_reordered_with_truncate_exception : public std::exception {};
using shared_memtable = lw_shared_ptr<memtable>;
class memtable_list;
@@ -429,6 +431,8 @@ public:
restricted_mutation_reader_config read_concurrency_config;
restricted_mutation_reader_config streaming_read_concurrency_config;
::cf_stats* cf_stats = nullptr;
seastar::thread_scheduling_group* background_writer_scheduling_group = nullptr;
seastar::thread_scheduling_group* memtable_scheduling_group = nullptr;
};
struct no_commitlog {};
struct stats {
@@ -538,7 +542,6 @@ private:
mutable row_cache _cache; // Cache covers only sstables.
std::experimental::optional<int64_t> _sstable_generation = {};
db::replay_position _highest_flushed_rp;
db::replay_position _highest_rp;
db::replay_position _lowest_allowed_rp;
@@ -546,15 +549,7 @@ private:
db::commitlog* _commitlog;
compaction_manager& _compaction_manager;
int _compaction_disabled = 0;
class memtable_flush_queue;
std::unique_ptr<memtable_flush_queue> _flush_queue;
// Because streaming mutations bypass the commitlog, there is
// no need for the complications of the flush queue. Besides, it
// is easier to just use a common gate than it is to modify the flush_queue
// to work both with and without a replay position.
//
// Last but not least, we seldom need to guarantee any ordering here: as long
// as all data is waited for, we're good.
utils::phased_barrier _flush_barrier;
seastar::gate _streaming_flush_gate;
std::vector<view_ptr> _views;
semaphore _cache_update_sem{1};
@@ -753,7 +748,6 @@ public:
void start();
future<> stop();
future<> flush();
future<> flush(const db::replay_position&);
future<> flush_streaming_mutations(utils::UUID plan_id, dht::partition_range_vector ranges = dht::partition_range_vector{});
future<> fail_streaming_mutations(utils::UUID plan_id);
future<> clear(); // discards memtable(s) without flushing them to disk.
@@ -864,6 +858,10 @@ public:
return _config.cf_stats;
}
seastar::thread_scheduling_group* background_writer_scheduling_group() {
return _config.background_writer_scheduling_group;
}
compaction_manager& get_compaction_manager() const {
return _compaction_manager;
}
@@ -1072,6 +1070,8 @@ public:
restricted_mutation_reader_config read_concurrency_config;
restricted_mutation_reader_config streaming_read_concurrency_config;
::cf_stats* cf_stats = nullptr;
seastar::thread_scheduling_group* background_writer_scheduling_group = nullptr;
seastar::thread_scheduling_group* memtable_scheduling_group = nullptr;
};
private:
std::unique_ptr<locator::abstract_replication_strategy> _replication_strategy;
@@ -1154,6 +1154,7 @@ public:
private:
::cf_stats _cf_stats;
static constexpr size_t max_concurrent_reads() { return 100; }
static constexpr size_t max_streaming_concurrent_reads() { return 10; } // They're rather heavyweight, so limit more
static constexpr size_t max_system_concurrent_reads() { return 10; }
static constexpr size_t max_concurrent_sstable_loads() { return 3; }
struct db_stats {
@@ -1177,7 +1178,11 @@ private:
dirty_memory_manager _dirty_memory_manager;
dirty_memory_manager _streaming_dirty_memory_manager;
seastar::thread_scheduling_group _background_writer_scheduling_group;
flush_cpu_controller _memtable_cpu_controller;
semaphore _read_concurrency_sem{max_concurrent_reads()};
semaphore _streaming_concurrency_sem{max_streaming_concurrent_reads()};
restricted_mutation_reader_config _read_concurrency_config;
semaphore _system_read_concurrency_sem{max_system_concurrent_reads()};
restricted_mutation_reader_config _system_read_concurrency_config;
@@ -1332,10 +1337,10 @@ public:
/** Truncates the given column family */
future<> truncate(sstring ksname, sstring cfname, timestamp_func);
future<> truncate(const keyspace& ks, column_family& cf, timestamp_func);
future<> truncate(const keyspace& ks, column_family& cf, timestamp_func, bool with_snapshot = true);
bool update_column_family(schema_ptr s);
future<> drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func);
future<> drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func, bool with_snapshot = true);
void remove(const column_family&);
const logalloc::region_group& dirty_memory_region_group() const {


@@ -511,6 +511,7 @@ public:
if (shutdown) {
auto me = shared_from_this();
return _gate.close().then([me] {
me->_closed = true;
return me->sync().finally([me] {
// When we get here, nothing should add ops,
// and we should have waited out all pending.
@@ -1319,6 +1320,7 @@ future<> db::commitlog::segment_manager::shutdown() {
return _gate.close().then(std::bind(&segment_manager::sync_all_segments, this, true));
});
}).finally([this] {
discard_unused_segments();
// Now that the gate is closed and requests completed we are sure nobody else will pop()
return clear_reserve_segments().finally([this] {
return std::move(_reserve_replenisher).then_wrapped([this] (auto f) {


@@ -166,6 +166,12 @@ public:
*/
#define _make_config_values(val) \
val(background_writer_scheduling_quota, double, 1.0, Used, \
"max cpu usage ratio (between 0 and 1) for compaction process. Not intended for setting in normal operations. Setting it to 1 or higher will disable it, recommended operational setting is 0.5." \
) \
val(auto_adjust_flush_quota, bool, false, Used, \
"true: auto-adjust quota for flush processes. false: put everyone together in the static background writer group - if background writer group is enabled. Not intended for setting in normal operations" \
) \
/* Initialization properties */ \
/* The minimal properties needed for configuring a cluster. */ \
val(cluster_name, sstring, "Test Cluster", Used, \
@@ -330,7 +336,7 @@ public:
val(sstable_preemptive_open_interval_in_mb, uint32_t, 50, Unused, \
"When compacting, the replacement opens SSTables before they are completely written and uses in place of the prior SSTables for any range previously written. This setting helps to smoothly transfer reads between the SSTables by reducing page cache churn and keeps hot rows hot." \
) \
val(defragment_memory_on_idle, bool, true, Used, "Set to true to defragment memory when the cpu is idle. This reduces the amount of work Scylla performs when processing client requests.") \
val(defragment_memory_on_idle, bool, false, Used, "When set to true, will defragment memory when the cpu is idle. This reduces the amount of work Scylla performs when processing client requests.") \
/* Memtable settings */ \
val(memtable_allocation_type, sstring, "heap_buffers", Invalid, \
"Specify the way Cassandra allocates and manages memtable memory. See Off-heap memtables in Cassandra 2.1. Options are:\n" \
@@ -754,6 +760,8 @@ public:
val(replace_address_first_boot, sstring, "", Used, "Like replace_address option, but if the node has been bootstrapped successfully it will be ignored. Same as -Dcassandra.replace_address_first_boot.") \
val(override_decommission, bool, false, Used, "Set true to force a decommissioned node to join the cluster") \
val(ring_delay_ms, uint32_t, 30 * 1000, Used, "Time a node waits to hear from other nodes before joining the ring in milliseconds. Same as -Dcassandra.ring_delay_ms in cassandra.") \
val(fd_max_interval_ms, uint32_t, 2 * 1000, Used, "The maximum failure_detector interval time in milliseconds. Intervals larger than the maximum will be ignored. Larger clusters may need to increase the default.") \
val(fd_initial_value_ms, uint32_t, 2 * 1000, Used, "The initial failure_detector interval time in milliseconds.") \
val(shutdown_announce_in_ms, uint32_t, 2 * 1000, Used, "Time a node waits after sending gossip shutdown message in milliseconds. Same as -Dcassandra.shutdown_announce_in_ms in cassandra.") \
val(developer_mode, bool, false, Used, "Relax environment checks. Setting to true can reduce performance and reliability significantly.") \
val(skip_wait_for_gossip_to_settle, int32_t, -1, Used, "An integer to configure the wait for gossip to settle. -1: wait normally, 0: do not wait at all, n: wait for at most n polls. Same as -Dcassandra.skip_wait_for_gossip_to_settle in cassandra.") \


@@ -162,6 +162,14 @@ inline void assure_sufficient_live_nodes(
const PendingRange& pending_endpoints = std::array<gms::inet_address, 0>()) {
size_t need = block_for(ks, cl);
auto adjust_live_for_error = [] (size_t live, size_t pending) {
// DowngradingConsistencyRetryPolicy uses the alive-replica count from the Unavailable
// exception to adjust the CL for retry. When a pending node is present the CL is
// increased by 1 internally, so the reported number of live nodes has to be
// adjusted to take this into account
return pending <= live ? live - pending : 0;
};
switch (cl) {
case consistency_level::ANY:
// local hint is acceptable, and local node is always live
@@ -176,7 +184,7 @@ inline void assure_sufficient_live_nodes(
size_t pending = count_local_endpoints(pending_endpoints);
if (local_live < need + pending) {
cl_logger.debug("Local replicas {} are insufficient to satisfy LOCAL_QUORUM requirement of needed {} and pending {}", live_endpoints, local_live, pending);
throw exceptions::unavailable_exception(cl, need, local_live);
throw exceptions::unavailable_exception(cl, need, adjust_live_for_error(local_live, pending));
}
break;
}
@@ -190,7 +198,7 @@ inline void assure_sufficient_live_nodes(
size_t pending = pending_endpoints.size();
if (live < need + pending) {
cl_logger.debug("Live nodes {} do not satisfy ConsistencyLevel ({} required, {} pending)", live, need, pending);
throw exceptions::unavailable_exception(cl, need, live);
throw exceptions::unavailable_exception(cl, need, adjust_live_for_error(live, pending));
}
break;
}


@@ -66,8 +66,8 @@ class migrator {
public:
static const std::unordered_set<sstring> legacy_schema_tables;
migrator(cql3::query_processor& qp)
: _qp(qp) {
migrator(sharded<service::storage_proxy>& sp, cql3::query_processor& qp)
: _sp(sp), _qp(qp) {
}
migrator(migrator&&) = default;
@@ -147,15 +147,18 @@ public:
auto cq = fmt_query(fmt, db::system_keyspace::legacy::COLUMNS);
auto zq = fmt_query(fmt, db::system_keyspace::legacy::TRIGGERS);
typedef std::tuple<future<result_set_type>, future<result_set_type>, future<result_set_type>> result_tuple;
typedef std::tuple<future<result_set_type>, future<result_set_type>, future<result_set_type>, future<db::schema_tables::legacy::schema_mutations>> result_tuple;
return when_all(_qp.execute_internal(tq, { dst.name, cf_name }),
_qp.execute_internal(cq, { dst.name, cf_name }),
_qp.execute_internal(zq, { dst.name, cf_name })).then([this, &dst, cf_name, timestamp](result_tuple&& t) {
_qp.execute_internal(zq, { dst.name, cf_name }),
db::schema_tables::legacy::read_table_mutations(_sp, dst.name, cf_name, db::system_keyspace::legacy::column_families()))
.then([this, &dst, cf_name, timestamp](result_tuple&& t) {
result_set_type tables = std::get<0>(t).get0();
result_set_type columns = std::get<1>(t).get0();
result_set_type triggers = std::get<2>(t).get0();
db::schema_tables::legacy::schema_mutations sm = std::get<3>(t).get0();
row_type& td = tables->one();
@@ -165,6 +168,8 @@ public:
schema_builder builder(dst.name, cf_name, id);
builder.with_version(sm.digest());
cf_type cf = sstring_to_cf_type(td.get_or("type", sstring("standard")));
if (cf == cf_type::super) {
fail(unimplemented::cause::SUPER);
@@ -183,6 +188,7 @@ public:
if (default_validator->is_counter()) {
builder.set_is_counter(true);
}
builder.set_default_validation_class(default_validator);
}
/*
@@ -191,10 +197,8 @@ public:
* but we can trust is_dense value of false.
*/
auto is_dense = td.get_opt<bool>("is_dense");
if (is_dense && !*is_dense) {
builder.set_is_dense(false);
} else {
auto calulated_is_dense = [&] {
if (!is_dense || *is_dense) {
is_dense = [&] {
/*
* As said above, this method is only here because we need to deal with thrift upgrades.
* Once a CF has been "upgraded", i.e. we've rebuilt and saved its CQL3 metadata at least once,
@@ -252,40 +256,48 @@ public:
return comparator.compare(off, end - off, utf8_type->name()) == 0;
};
if (regular) {
auto name = regular->get_or("column_name", bytes());
// This is a lame attempt at determining if this was in fact a compact_value column
if (!max_cl_idx || (!name.empty() && name != to_bytes("value"))
|| db::schema_tables::parse_type(regular->get_as<sstring>("type")) != default_validator) {
return false;
}
// Ok, we will assume this was in fact a (scylla-created) compact value.
}
if (max_cl_idx) {
auto n = std::count(comparator.begin(), comparator.end(), ','); // num comp - 1
return *max_cl_idx == n;
}
if (regular) {
return false;
}
return !is_cql3_only_pk_comparator(comparator);
}();
builder.set_is_dense(calulated_is_dense);
// now, if switched to sparse, remove redundant compact_value column and the last clustering column,
// directly copying CASSANDRA-11502 logic. See CASSANDRA-11315.
filter_sparse = !calulated_is_dense && is_dense.value_or(true);
filter_sparse = !*is_dense;
}
builder.set_is_dense(*is_dense);
auto is_cql = !*is_dense && is_compound;
auto is_static_compact = !*is_dense && !is_compound;
// org.apache.cassandra.schema.LegacySchemaMigrator#isEmptyCompactValueColumn
auto is_empty_compact_value = [](const cql3::untyped_result_set::row& column_row) {
auto kind_str = column_row.get_as<sstring>("type");
// Cassandra only checks for "compact_value", but Scylla generates "regular" instead (#2586)
return (kind_str == "compact_value" || kind_str == "regular")
&& column_row.get_as<sstring>("column_name").empty();
};
for (auto& row : *columns) {
auto kind_str = row.get_as<sstring>("type");
auto kind = db::schema_tables::deserialize_kind(kind_str);
auto component_index = kind > column_kind::clustering_key ? 0 : column_id(row.get_or("component_index", 0));
auto name = row.get_or("column_name", bytes());
auto name = row.get_or<sstring>("column_name", sstring());
auto validator = db::schema_tables::parse_type(row.get_as<sstring>("validator"));
if (is_empty_compact_value(row)) {
continue;
}
if (filter_sparse) {
if (kind_str == "compact_value") {
continue;
@@ -329,7 +341,7 @@ public:
type = "VALUES";
}
}
auto column = cql3::util::maybe_quote(utf8_type->to_string(name));
auto column = cql3::util::maybe_quote(name);
options["target"] = validator->is_collection()
? type + "(" + column + ")"
: column;
@@ -339,7 +351,26 @@ public:
builder.with_index(index_metadata(index_name, options, *index_kind));
}
builder.with_column(std::move(name), std::move(validator), kind, component_index);
data_type column_name_type = [&] {
if (is_static_compact && kind == column_kind::regular_column) {
return db::schema_tables::parse_type(comparator);
}
return utf8_type;
}();
auto column_name = [&] {
try {
return column_name_type->from_string(name);
} catch (marshal_exception) {
// #2597: Scylla < 2.0 writes names in serialized form, try to recover
column_name_type->validate(to_bytes_view(name));
return to_bytes(name);
}
}();
builder.with_column(std::move(column_name), std::move(validator), kind, component_index);
}
if (is_static_compact) {
builder.set_regular_column_name_type(db::schema_tables::parse_type(comparator));
}
if (td.has("read_repair_chance")) {
@@ -414,8 +445,6 @@ public:
throw unsupported_feature("triggers");
}
// TODO: table upgrades as in origin converter.
dst.tables.emplace_back(table{timestamp, builder.build() });
});
}
@@ -517,21 +546,13 @@ public:
});
}
future<> unload_legacy_tables() {
return _qp.db().invoke_on_all([](database& db) {
for (auto& cfname : legacy_schema_tables) {
auto& cf = db.find_column_family(db::system_keyspace::NAME, cfname);
db.remove(cf);
}
});
}
future<> truncate_legacy_tables() {
mlogger.info("Truncating legacy schema tables");
return do_with(utils::make_joinpoint([] { return db_clock::now();}),[this](auto& tsf) {
return _qp.db().invoke_on_all([&tsf](database& db) {
return parallel_for_each(legacy_schema_tables, [&db, &tsf](const sstring& cfname) {
return db.truncate(db::system_keyspace::NAME, cfname, [&tsf] { return tsf.value(); });
future<> drop_legacy_tables() {
mlogger.info("Dropping legacy schema tables");
return parallel_for_each(legacy_schema_tables, [this](const sstring& cfname) {
return do_with(utils::make_joinpoint([] { return db_clock::now();}),[this, cfname](auto& tsf) {
auto with_snapshot = !_keyspaces.empty();
return _qp.db().invoke_on_all([&tsf, cfname, with_snapshot](database& db) {
return db.drop_column_family(db::system_keyspace::NAME, cfname, [&tsf] { return tsf.value(); }, with_snapshot);
});
});
});
@@ -590,18 +611,15 @@ public:
future<> migrate() {
return read_all_keyspaces().then([this]() {
if (_keyspaces.empty()) {
return unload_legacy_tables();
}
// write metadata to the new schema tables
return store_keyspaces_in_new_schema_tables().then(std::bind(&migrator::migrate_indexes, this))
.then(std::bind(&migrator::flush_schemas, this))
.then(std::bind(&migrator::truncate_legacy_tables, this))
.then(std::bind(&migrator::unload_legacy_tables, this))
.then(std::bind(&migrator::drop_legacy_tables, this))
.then([] { mlogger.info("Completed migration of legacy schema tables"); });
});
}
sharded<service::storage_proxy>& _sp;
cql3::query_processor& _qp;
std::vector<keyspace> _keyspaces;
};
@@ -620,7 +638,7 @@ const std::unordered_set<sstring> migrator::legacy_schema_tables = {
}
future<>
db::legacy_schema_migrator::migrate(cql3::query_processor& qp) {
return do_with(migrator(qp), std::bind(&migrator::migrate, std::placeholders::_1));
db::legacy_schema_migrator::migrate(sharded<service::storage_proxy>& sp, cql3::query_processor& qp) {
return do_with(migrator(sp, qp), std::bind(&migrator::migrate, std::placeholders::_1));
}


@@ -48,10 +48,14 @@ namespace cql3 {
class query_processor;
}
namespace service {
class storage_proxy;
}
namespace db {
namespace legacy_schema_migrator {
future<> migrate(cql3::query_processor&);
future<> migrate(sharded<service::storage_proxy>&, cql3::query_processor&);
}
}


@@ -64,6 +64,7 @@
#include "db/config.hh"
#include "md5_hasher.hh"
#include <boost/algorithm/string/predicate.hpp>
#include <boost/range/algorithm/copy.hpp>
#include <boost/range/adaptor/map.hpp>
#include <boost/range/join.hpp>
@@ -82,6 +83,8 @@ namespace schema_tables {
logging::logger slogger("schema_tables");
const sstring version = "3";
struct push_back_and_return {
std::vector<mutation> muts;
@@ -149,8 +152,8 @@ static void add_index_to_schema_mutation(schema_ptr table,
const index_metadata& index, api::timestamp_type timestamp,
mutation& mutation);
static void drop_column_from_schema_mutation(schema_ptr,
const column_definition&, long timestamp,
static void drop_column_from_schema_mutation(schema_ptr schema_table, schema_ptr table,
const sstring& column_name, long timestamp,
std::vector<mutation>&);
static void drop_index_from_schema_mutation(schema_ptr table,
@@ -165,13 +168,12 @@ static void prepare_builder_from_table_row(schema_builder&, const query::result_
using namespace v3;
std::vector<const char*> ALL { KEYSPACES, TABLES, COLUMNS, DROPPED_COLUMNS, TRIGGERS, VIEWS, TYPES, FUNCTIONS, AGGREGATES, INDEXES };
std::vector<const char*> ALL { KEYSPACES, TABLES, SCYLLA_TABLES, COLUMNS, DROPPED_COLUMNS, TRIGGERS, VIEWS, TYPES, FUNCTIONS, AGGREGATES, INDEXES };
using days = std::chrono::duration<int, std::ratio<24 * 3600>>;
/** add entries to system.schema_* for the hardcoded system definitions */
future<> save_system_keyspace_schema() {
auto& ks = db::qctx->db().find_keyspace(NAME);
future<> save_system_schema(const sstring & ksname) {
auto& ks = db::qctx->db().find_keyspace(ksname);
auto ksm = ks.metadata();
// delete old, possibly obsolete entries in schema tables
@@ -185,6 +187,11 @@ future<> save_system_keyspace_schema() {
});
}
/** add entries to system_schema.* for the hardcoded system definitions */
future<> save_system_keyspace_schema() {
return save_system_schema(NAME);
}
namespace v3 {
static constexpr auto schema_gc_grace = std::chrono::duration_cast<std::chrono::seconds>(days(7)).count();
@@ -256,6 +263,21 @@ schema_ptr tables() {
return schema;
}
// Holds Scylla-specific table metadata.
schema_ptr scylla_tables() {
static thread_local auto schema = [] {
auto id = generate_legacy_id(NAME, SCYLLA_TABLES);
return schema_builder(NAME, SCYLLA_TABLES, stdx::make_optional(id))
.with_column("keyspace_name", utf8_type, column_kind::partition_key)
.with_column("table_name", utf8_type, column_kind::clustering_key)
.with_column("version", uuid_type)
.set_gc_grace_seconds(schema_gc_grace)
.with_version(generate_schema_version(id))
.build();
}();
return schema;
}
schema_ptr columns() {
static thread_local auto schema = [] {
schema_builder builder(make_lw_shared(::schema(generate_legacy_id(NAME, COLUMNS), NAME, COLUMNS,
@@ -519,7 +541,7 @@ future<utils::UUID> calculate_schema_digest(distributed<service::storage_proxy>&
for (auto&& p : rs->partitions()) {
auto mut = p.mut().unfreeze(s);
auto partition_key = value_cast<sstring>(utf8_type->deserialize(mut.key().get_component(*s, 0)));
if (partition_key == NAME) {
if (is_system_keyspace(partition_key)) {
continue;
}
mutations.emplace_back(std::move(mut));
@@ -552,7 +574,7 @@ future<std::vector<frozen_mutation>> convert_schema_to_mutations(distributed<ser
for (auto&& p : rs->partitions()) {
auto mut = p.mut().unfreeze(s);
auto partition_key = value_cast<sstring>(utf8_type->deserialize(mut.key().get_component(*s, 0)));
if (partition_key == NAME) {
if (is_system_keyspace(partition_key)) {
continue;
}
results.emplace_back(std::move(p.mut()));
@@ -727,6 +749,33 @@ read_tables_for_keyspaces(distributed<service::storage_proxy>& proxy, const std:
return result;
}
mutation compact_for_schema_digest(const mutation& m) {
// Cassandra skips tombstones in digest calculation
// to avoid disagreements due to tombstone GC.
// See https://issues.apache.org/jira/browse/CASSANDRA-6862.
// We achieve a similar effect with compact_for_compaction().
mutation m_compacted(m);
m_compacted.partition().compact_for_compaction(*m.schema(), always_gc, gc_clock::time_point::max());
return m_compacted;
}
// Applies deletion of the "version" column to a system_schema.scylla_tables mutation.
static void delete_schema_version(mutation& m) {
if (m.column_family_id() != scylla_tables()->id()) {
return;
}
const column_definition& version_col = *scylla_tables()->get_column_definition(to_bytes("version"));
for (auto&& row : m.partition().clustered_rows()) {
auto&& cells = row.row().cells();
auto&& cell = cells.find_cell(version_col.id);
api::timestamp_type t = api::new_timestamp();
if (cell) {
t = std::max(t, cell->as_atomic_cell().timestamp());
}
cells.apply(version_col, atomic_cell::make_dead(t, gc_clock::now()));
}
}
static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations, bool do_flush)
{
return seastar::async([&proxy, mutations = std::move(mutations), do_flush] () mutable {
@@ -737,6 +786,9 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std:
for (auto&& mutation : mutations) {
keyspaces.emplace(value_cast<sstring>(utf8_type->deserialize(mutation.key().get_component(*s, 0))));
column_families.emplace(mutation.column_family_id());
// We must force recalculation of schema version after the merge, since the resulting
// schema may be a mix of the old and new schemas.
delete_schema_version(mutation);
}
// current state of the schema
@@ -1387,7 +1439,7 @@ static void add_table_params_to_mutations(mutation& m, const clustering_key& cke
{
auto map = table->compaction_strategy_options();
map["class"] = sstables::compaction_strategy::name(table->compaction_strategy());
map["class"] = sstables::compaction_strategy::name(table->configured_compaction_strategy());
store_map(m, ckey, "compaction", timestamp, map);
}
@@ -1461,6 +1513,15 @@ static void add_dropped_column_to_schema_mutation(schema_ptr table, const sstrin
m.set_clustered_cell(ckey, "type", expand_user_type(column.type)->as_cql3_type()->to_string(), timestamp);
}
static mutation make_scylla_tables_mutation(schema_ptr table, api::timestamp_type timestamp) {
schema_ptr s = tables();
auto pkey = partition_key::from_singular(*s, table->ks_name());
auto ckey = clustering_key::from_singular(*s, table->cf_name());
mutation m(pkey, scylla_tables());
m.set_clustered_cell(ckey, "version", utils::UUID(table->version()), timestamp);
return m;
}
static schema_mutations make_table_mutations(schema_ptr table, api::timestamp_type timestamp, bool with_columns_and_triggers)
{
// When adding new schema properties, don't set cells for default values so that
@@ -1474,6 +1535,8 @@ static schema_mutations make_table_mutations(schema_ptr table, api::timestamp_ty
auto ckey = clustering_key::from_singular(*s, table->cf_name());
m.set_clustered_cell(ckey, "id", table->id(), timestamp);
auto scylla_tables_mutation = make_scylla_tables_mutation(table, timestamp);
{
list_type_impl::native_type flags;
if (table->is_super()) {
@@ -1499,7 +1562,7 @@ static schema_mutations make_table_mutations(schema_ptr table, api::timestamp_ty
mutation indices_mutation(pkey, indexes());
if (with_columns_and_triggers) {
for (auto&& column : table->all_columns()) {
for (auto&& column : table->v3().all_columns()) {
add_column_to_schema_mutation(table, column, timestamp, columns_mutation);
}
for (auto&& index : table->indices()) {
@@ -1512,7 +1575,8 @@ static schema_mutations make_table_mutations(schema_ptr table, api::timestamp_ty
}
}
return schema_mutations{std::move(m), std::move(columns_mutation), std::move(indices_mutation), std::move(dropped_columns_mutation)};
return schema_mutations{std::move(m), std::move(columns_mutation), std::move(indices_mutation), std::move(dropped_columns_mutation),
std::move(scylla_tables_mutation)};
}
void add_table_or_view_to_schema_mutation(schema_ptr s, api::timestamp_type timestamp, bool with_columns, std::vector<mutation>& mutations)
@@ -1561,23 +1625,23 @@ static void make_update_columns_mutations(schema_ptr old_table,
std::vector<mutation>& mutations) {
mutation columns_mutation(partition_key::from_singular(*columns(), old_table->ks_name()), columns());
auto diff = difference(old_table->columns_by_name(), new_table->columns_by_name());
auto diff = difference(old_table->v3().columns_by_name(), new_table->v3().columns_by_name());
// columns that are no longer needed
for (auto&& name : diff.entries_only_on_left) {
// Thrift only knows about the REGULAR ColumnDefinition type, so don't consider other type
// are being deleted just because they are not here.
const column_definition& column = *old_table->columns_by_name().at(name);
const column_definition& column = *old_table->v3().columns_by_name().at(name);
if (from_thrift && !column.is_regular()) {
continue;
}
drop_column_from_schema_mutation(old_table, column, timestamp, mutations);
drop_column_from_schema_mutation(columns(), old_table, column.name_as_text(), timestamp, mutations);
}
// newly added columns and old columns with updated attributes
for (auto&& name : boost::range::join(diff.entries_differing, diff.entries_only_on_right)) {
const column_definition& column = *new_table->columns_by_name().at(name);
const column_definition& column = *new_table->v3().columns_by_name().at(name);
add_column_to_schema_mutation(new_table, column, timestamp, columns_mutation);
}
@@ -1588,7 +1652,7 @@ static void make_update_columns_mutations(schema_ptr old_table,
// newly dropped columns
// columns added then dropped again
for (auto& name : dc_diff.entries_only_on_right) {
for (auto& name : boost::range::join(dc_diff.entries_differing, dc_diff.entries_only_on_right)) {
add_drop_column_to_mutations(new_table, name, new_table->dropped_columns().at(name), timestamp, mutations);
}
}
@@ -1626,12 +1690,20 @@ static void make_drop_table_or_view_mutations(schema_ptr schema_table,
api::timestamp_type timestamp,
std::vector<mutation>& mutations) {
auto pkey = partition_key::from_singular(*schema_table, table_or_view->ks_name());
mutation m{std::move(pkey), schema_table};
mutation m{pkey, schema_table};
auto ckey = clustering_key::from_singular(*schema_table, table_or_view->cf_name());
m.partition().apply_delete(*schema_table, std::move(ckey), tombstone(timestamp, gc_clock::now()));
m.partition().apply_delete(*schema_table, ckey, tombstone(timestamp, gc_clock::now()));
mutations.emplace_back(m);
for (auto &column : table_or_view->all_columns()) {
drop_column_from_schema_mutation(table_or_view, column, timestamp, mutations);
for (auto& column : table_or_view->v3().all_columns()) {
drop_column_from_schema_mutation(columns(), table_or_view, column.name_as_text(), timestamp, mutations);
}
for (auto& column : table_or_view->dropped_columns() | boost::adaptors::map_keys) {
drop_column_from_schema_mutation(dropped_columns(), table_or_view, column, timestamp, mutations);
}
{
mutation m{pkey, scylla_tables()};
m.partition().apply_delete(*scylla_tables(), ckey, tombstone(timestamp, gc_clock::now()));
mutations.emplace_back(m);
}
}
@@ -1655,17 +1727,14 @@ future<std::vector<mutation>> make_drop_table_mutations(lw_shared_ptr<keyspace_m
static future<schema_mutations> read_table_mutations(distributed<service::storage_proxy>& proxy, const qualified_name& table, schema_ptr s)
{
return read_schema_partition_for_table(proxy, s, table.keyspace_name, table.table_name)
.then([&proxy, table] (mutation cf_m) {
return read_schema_partition_for_table(proxy, columns(), table.keyspace_name, table.table_name)
.then([&proxy, table, cf_m = std::move(cf_m)] (mutation col_m) {
return read_schema_partition_for_table(proxy, dropped_columns(), table.keyspace_name, table.table_name)
.then([&proxy, table, cf_m = std::move(cf_m), col_m = std::move(col_m)] (mutation dropped_m) {
return read_schema_partition_for_table(proxy, indexes(), table.keyspace_name, table.table_name)
.then([cf_m = std::move(cf_m), col_m = std::move(col_m), dropped_m = std::move(dropped_m)] (mutation idx_m) {
return schema_mutations{std::move(cf_m), std::move(col_m), std::move(idx_m), std::move(dropped_m)};
});
});
return when_all_succeed(
read_schema_partition_for_table(proxy, s, table.keyspace_name, table.table_name),
read_schema_partition_for_table(proxy, columns(), table.keyspace_name, table.table_name),
read_schema_partition_for_table(proxy, dropped_columns(), table.keyspace_name, table.table_name),
read_schema_partition_for_table(proxy, indexes(), table.keyspace_name, table.table_name),
read_schema_partition_for_table(proxy, scylla_tables(), table.keyspace_name, table.table_name)).then(
[] (mutation cf_m, mutation col_m, mutation dropped_m, mutation idx_m, mutation st_m) {
return schema_mutations{std::move(cf_m), std::move(col_m), std::move(idx_m), std::move(dropped_m), std::move(st_m)};
});
#if 0
// FIXME:
@@ -1680,7 +1749,6 @@ static future<schema_mutations> read_table_mutations(distributed<service::storag
throw new RuntimeException(e);
}
#endif
});
}
future<schema_ptr> create_table_from_name(distributed<service::storage_proxy>& proxy, const sstring& keyspace, const sstring& table)
@@ -1771,7 +1839,7 @@ static void prepare_builder_from_table_row(schema_builder& builder, const query:
builder.set_min_compaction_threshold(std::stoi(map["min_threshold"]));
}
if (map.count("enabled")) {
// TODO: enable/disable?
builder.set_compaction_enabled(boost::algorithm::iequals(map["enabled"], "true"));
}
builder.set_compaction_strategy_options(map);
@@ -1870,9 +1938,8 @@ schema_ptr create_table_from_mutations(schema_mutations sm, std::experimental::o
prepare_builder_from_table_row(builder, table_row);
for (auto&& cdef : column_defs) {
builder.with_column(cdef);
}
v3_columns columns(std::move(column_defs), is_dense, is_compound);
columns.apply_to(builder);
std::vector<index_metadata> index_defs;
if (sm.indices_mutation()) {
@@ -1909,7 +1976,8 @@ static void add_column_to_schema_mutation(schema_ptr table,
api::timestamp_type timestamp,
mutation& m)
{
auto ckey = clustering_key::from_exploded(*m.schema(), {utf8_type->decompose(table->cf_name()), column.name()});
auto ckey = clustering_key::from_exploded(*m.schema(), {utf8_type->decompose(table->cf_name()),
utf8_type->decompose(column.name_as_text())});
auto order = "NONE";
if (column.is_clustering_key()) {
@@ -2003,13 +2071,19 @@ static void drop_index_from_schema_mutation(schema_ptr table, const index_metada
mutations.push_back(std::move(m));
}
static void drop_column_from_schema_mutation(schema_ptr table, const column_definition& column, long timestamp, std::vector<mutation>& mutations) {
schema_ptr s = columns();
auto pkey = partition_key::from_singular(*s, table->ks_name());
auto ckey = clustering_key::from_exploded(*s, {utf8_type->decompose(table->cf_name()), column.name()});
static void drop_column_from_schema_mutation(
schema_ptr schema_table,
schema_ptr table,
const sstring& column_name,
long timestamp,
std::vector<mutation>& mutations)
{
auto pkey = partition_key::from_singular(*schema_table, table->ks_name());
auto ckey = clustering_key::from_exploded(*schema_table, {utf8_type->decompose(table->cf_name()),
utf8_type->decompose(column_name)});
mutation m{pkey, s};
m.partition().apply_delete(*s, ckey, tombstone(timestamp, gc_clock::now()));
mutation m{pkey, schema_table};
m.partition().apply_delete(*schema_table, ckey, tombstone(timestamp, gc_clock::now()));
mutations.emplace_back(m);
}
@@ -2153,7 +2227,7 @@ static schema_mutations make_view_mutations(view_ptr view, api::timestamp_type t
mutation indices_mutation(pkey, indexes());
if (with_columns) {
for (auto&& column : view->all_columns()) {
for (auto&& column : view->v3().all_columns()) {
add_column_to_schema_mutation(view, column, timestamp, columns_mutation);
}
@@ -2165,7 +2239,10 @@ static schema_mutations make_view_mutations(view_ptr view, api::timestamp_type t
}
}
return schema_mutations{std::move(m), std::move(columns_mutation), std::move(indices_mutation), std::move(dropped_columns_mutation)};
auto scylla_tables_mutation = make_scylla_tables_mutation(view, timestamp);
return schema_mutations{std::move(m), std::move(columns_mutation), std::move(indices_mutation), std::move(dropped_columns_mutation),
std::move(scylla_tables_mutation)};
}
schema_mutations make_schema_mutations(schema_ptr s, api::timestamp_type timestamp, bool with_columns)
@@ -2459,10 +2536,33 @@ data_type parse_type(sstring str)
std::vector<schema_ptr> all_tables() {
return {
keyspaces(), tables(), columns(), dropped_columns(), triggers(),
keyspaces(), tables(), scylla_tables(), columns(), dropped_columns(), triggers(),
views(), indexes(), types(), functions(), aggregates(),
};
}
namespace legacy {
table_schema_version schema_mutations::digest() const {
md5_hasher h;
db::schema_tables::feed_hash_for_schema_digest(h, _columnfamilies);
db::schema_tables::feed_hash_for_schema_digest(h, _columns);
return utils::UUID_gen::get_name_UUID(h.finalize());
}
future<schema_mutations> read_table_mutations(distributed<service::storage_proxy>& proxy,
sstring keyspace_name, sstring table_name, schema_ptr s)
{
return read_schema_partition_for_table(proxy, s, keyspace_name, table_name)
.then([&proxy, keyspace_name, table_name] (mutation cf_m) {
return read_schema_partition_for_table(proxy, db::system_keyspace::legacy::columns(), keyspace_name, table_name)
.then([cf_m = std::move(cf_m)] (mutation col_m) {
return schema_mutations{std::move(cf_m), std::move(col_m)};
});
});
}
} // namespace legacy
} // namespace schema_tables
} // namespace schema

View File

@@ -64,6 +64,7 @@ namespace v3 {
static constexpr auto NAME = "system_schema";
static constexpr auto KEYSPACES = "keyspaces";
static constexpr auto TABLES = "tables";
static constexpr auto SCYLLA_TABLES = "scylla_tables";
static constexpr auto COLUMNS = "columns";
static constexpr auto DROPPED_COLUMNS = "dropped_columns";
static constexpr auto TRIGGERS = "triggers";
@@ -77,16 +78,43 @@ schema_ptr columns();
schema_ptr dropped_columns();
schema_ptr indexes();
schema_ptr tables();
schema_ptr scylla_tables();
schema_ptr views();
}
namespace legacy {
class schema_mutations {
mutation _columnfamilies;
mutation _columns;
public:
schema_mutations(mutation columnfamilies, mutation columns)
: _columnfamilies(std::move(columnfamilies))
, _columns(std::move(columns))
{ }
table_schema_version digest() const;
};
future<schema_mutations> read_table_mutations(distributed<service::storage_proxy>& proxy,
sstring keyspace_name, sstring table_name, schema_ptr s);
}
using namespace v3;
// Change on non-backwards compatible changes of schema mutations.
// Replication of schema between nodes with different version is inhibited.
extern const sstring version;
extern std::vector<const char*> ALL;
std::vector<schema_ptr> all_tables();
// saves/creates keyspace "ks" and all of its tables, first deleting any old schema entries (they will be rewritten)
future<> save_system_schema(const sstring & ks);
// saves/creates "system_schema" keyspace
future<> save_system_keyspace_schema();
future<utils::UUID> calculate_schema_digest(distributed<service::storage_proxy>& proxy);
@@ -153,15 +181,11 @@ data_type parse_type(sstring str);
sstring serialize_index_kind(index_metadata_kind kind);
index_metadata_kind deserialize_index_kind(sstring kind);
mutation compact_for_schema_digest(const mutation& m);
template<typename Hasher>
void feed_hash_for_schema_digest(Hasher& h, const mutation& m) {
// Cassandra is skipping tombstones from digest calculation
// to avoid disagreements due to tombstone GC.
// See https://issues.apache.org/jira/browse/CASSANDRA-6862.
// We achieve similar effect with compact_for_compaction().
mutation m_compacted(m);
m_compacted.partition().compact_for_compaction(*m.schema(), always_gc, gc_clock::time_point::max());
feed_hash(h, m_compacted);
feed_hash(h, compact_for_schema_digest(m));
}
} // namespace schema_tables
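The tombstone-skipping digest above can be illustrated with a toy model (hypothetical `cell`/`partition` types and an FNV hash, not Scylla's `md5_hasher`/`mutation`): each node may have GC'd a different set of expired tombstones, so hashing raw partitions would make otherwise-identical schemas disagree; hashing the compacted view keeps the digest stable, as in CASSANDRA-6862.

```cpp
#include <cstdint>
#include <map>
#include <string>

// Toy stand-in for feed_hash_for_schema_digest: drop GC-able entries
// before hashing so nodes that GC'd tombstones at different times still
// agree on the digest.
struct cell {
    std::string value;
    bool expired_tombstone = false;
};
using partition = std::map<std::string, cell>;

std::uint64_t schema_digest(const partition& p) {
    std::uint64_t h = 1469598103934665603ull;        // FNV-1a offset basis
    for (const auto& e : p) {
        if (e.second.expired_tombstone) {
            continue;                                // "compact" away GC-able data
        }
        for (char ch : e.first + '\0' + e.second.value) {
            h = (h ^ static_cast<unsigned char>(ch)) * 1099511628211ull;
        }
    }
    return h;
}
```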

View File

@@ -1044,6 +1044,9 @@ future<> setup(distributed<database>& db, distributed<cql3::query_processor>& qp
return check_health();
}).then([] {
return db::schema_tables::save_system_keyspace_schema();
}).then([] {
// #2514 - make sure "system" is written to system_schema.keyspaces.
return db::schema_tables::save_system_schema(NAME);
}).then([] {
return netw::get_messaging_service().invoke_on_all([] (auto& ms){
return ms.init_local_preferred_ip_cache();

View File

@@ -62,6 +62,8 @@ namespace cql3 {
class query_processor;
}
bool is_system_keyspace(const sstring& ks_name);
namespace db {
namespace system_keyspace {
@@ -120,6 +122,18 @@ extern schema_ptr hints();
extern schema_ptr batchlog();
extern schema_ptr built_indexes(); // TODO (from Cassandra): make private
namespace legacy {
schema_ptr keyspaces();
schema_ptr column_families();
schema_ptr columns();
schema_ptr triggers();
schema_ptr usertypes();
schema_ptr functions();
schema_ptr aggregates();
}
table_schema_version generate_schema_version(utils::UUID table_id);
// Only for testing.

View File

@@ -260,6 +260,27 @@ unsigned shard_of(const token& t) {
return global_partitioner().shard_of(t);
}
stdx::optional<dht::token_range>
selective_token_range_sharder::next() {
if (_done) {
return {};
}
while (_range.overlaps(dht::token_range(_start_boundary, {}), dht::token_comparator())
&& !(_start_boundary && _start_boundary->value() == maximum_token())) {
auto end_token = _partitioner.token_for_next_shard(_start_token, _next_shard);
auto candidate = dht::token_range(std::move(_start_boundary), range_bound<dht::token>(end_token, false));
auto intersection = _range.intersection(std::move(candidate), dht::token_comparator());
_start_token = _partitioner.token_for_next_shard(end_token, _shard);
_start_boundary = range_bound<dht::token>(_start_token);
if (intersection) {
return *intersection;
}
}
_done = true;
return {};
}
stdx::optional<ring_position_range_and_shard>
ring_position_range_sharder::next(const schema& s) {
if (_done) {
@@ -462,14 +483,13 @@ int ring_position_comparator::operator()(ring_position_view lh, ring_position_vi
}
}
int ring_position_comparator::operator()(ring_position_view lh, sstables::key_view rh) const {
auto rh_token = global_partitioner().get_token(rh);
auto token_cmp = tri_compare(*lh._token, rh_token);
int ring_position_comparator::operator()(ring_position_view lh, sstables::decorated_key_view rh) const {
auto token_cmp = tri_compare(*lh._token, rh.token());
if (token_cmp) {
return token_cmp;
}
if (lh._key) {
auto rel = rh.tri_compare(s, *lh._key);
auto rel = rh.key().tri_compare(s, *lh._key);
if (rel) {
return -rel;
}
@@ -477,7 +497,7 @@ int ring_position_comparator::operator()(ring_position_view lh, sstables::key_vi
return lh._weight;
}
int ring_position_comparator::operator()(sstables::key_view a, ring_position_view b) const {
int ring_position_comparator::operator()(sstables::decorated_key_view a, ring_position_view b) const {
return -(*this)(b, a);
}

View File

@@ -55,6 +55,7 @@
namespace sstables {
class key_view;
class decorated_key_view;
}
@@ -547,8 +548,8 @@ struct ring_position_comparator {
const schema& s;
ring_position_comparator(const schema& s_) : s(s_) {}
int operator()(ring_position_view, ring_position_view) const;
int operator()(ring_position_view, sstables::key_view) const;
int operator()(sstables::key_view, ring_position_view) const;
int operator()(ring_position_view, sstables::decorated_key_view) const;
int operator()(sstables::decorated_key_view, ring_position_view) const;
};
// "less" comparator giving the same order as ring_position_comparator
@@ -671,6 +672,29 @@ split_ranges_to_shards(const dht::token_range_vector& ranges, const schema& s);
std::vector<partition_range> split_range_to_single_shard(const schema& s, const dht::partition_range& pr, shard_id shard);
std::vector<partition_range> split_range_to_single_shard(const i_partitioner& partitioner, const schema& s, const dht::partition_range& pr, shard_id shard);
class selective_token_range_sharder {
const i_partitioner& _partitioner;
dht::token_range _range;
shard_id _shard;
bool _done = false;
shard_id _next_shard;
dht::token _start_token;
stdx::optional<range_bound<dht::token>> _start_boundary;
public:
explicit selective_token_range_sharder(dht::token_range range, shard_id shard)
: selective_token_range_sharder(global_partitioner(), std::move(range), shard) {}
selective_token_range_sharder(const i_partitioner& partitioner, dht::token_range range, shard_id shard)
: _partitioner(partitioner)
, _range(std::move(range))
, _shard(shard)
, _next_shard(_shard + 1 == _partitioner.shard_count() ? 0 : _shard + 1)
, _start_token(_range.start() ? _range.start()->value() : minimum_token())
, _start_boundary(_partitioner.shard_of(_start_token) == shard ?
_range.start() : range_bound<dht::token>(_partitioner.token_for_next_shard(_start_token, shard))) {
}
stdx::optional<dht::token_range> next();
};
} // dht
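The iteration in `selective_token_range_sharder::next()` can be sketched with a toy model (integer "tokens" and round-robin ownership `token % shard_count` — the real `i_partitioner` and `token_for_next_shard()` are more involved): walk an inclusive range and yield only the sub-ranges owned by one shard.

```cpp
#include <utility>
#include <vector>

// Simplified stand-in: collect the contiguous runs of tokens in
// [first, last] that belong to `shard` under round-robin ownership.
std::vector<std::pair<int, int>> subranges_for_shard(int first, int last,
                                                     int shard, int shard_count) {
    std::vector<std::pair<int, int>> out;
    for (int t = first; t <= last; ++t) {
        if (t % shard_count != shard) {
            continue;                      // token owned by another shard
        }
        int lo = t;
        while (t + 1 <= last && (t + 1) % shard_count == shard) {
            ++t;                           // extend the run owned by this shard
        }
        out.emplace_back(lo, t);
    }
    return out;
}
```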
namespace std {

View File

@@ -79,13 +79,14 @@ if [ $LOCALRPM -eq 1 ]; then
cd ../..
cp build/scylla-jmx/build/rpms/scylla-jmx-`cat build/scylla-jmx/build/SCYLLA-VERSION-FILE`-`cat build/scylla-jmx/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-jmx.noarch.rpm
fi
if [ ! -f dist/ami/files/scylla-tools.noarch.rpm ]; then
if [ ! -f dist/ami/files/scylla-tools.noarch.rpm ] || [ ! -f dist/ami/files/scylla-tools-core.noarch.rpm ]; then
cd build
git clone --depth 1 https://github.com/scylladb/scylla-tools-java.git
cd scylla-tools-java
sh -x -e dist/redhat/build_rpm.sh
cd ../..
cp build/scylla-tools-java/build/rpms/scylla-tools-`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-tools.noarch.rpm
cp build/scylla-tools-java/build/rpms/scylla-tools-core-`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-tools-core.noarch.rpm
fi
else
sudo apt-get install -y git

View File

@@ -4,6 +4,10 @@
. /usr/lib/scylla/scylla_lib.sh
if [ ! -f /sys/devices/system/cpu/cpufreq/policy0/scaling_governor ]; then
echo "This computer doesn't support CPU scaling configuration."
exit 0
fi
if is_debian_variant; then
apt-get install -y cpufrequtils
service cpufrequtils stop

View File

@@ -104,7 +104,11 @@ else
mdadm --create --verbose --force --run $RAID --level=0 -c1024 --raid-devices=$NR_DISK $DISKS
mkfs.xfs $RAID -f -K
fi
mdadm --detail --scan > /etc/mdadm.conf
if is_debian_variant; then
mdadm --detail --scan > /etc/mdadm/mdadm.conf
else
mdadm --detail --scan > /etc/mdadm.conf
fi
mkdir -p "$MOUNT_AT"
mount -t xfs -o noatime $RAID "$MOUNT_AT"
@@ -122,3 +126,7 @@ if [ $FSTAB -ne 0 ]; then
UUID=`blkid $RAID | awk '{print $2}'`
echo "$UUID $MOUNT_AT xfs noatime 0 0" >> /etc/fstab
fi
if is_debian_variant; then
update-initramfs -u
fi

View File

@@ -1,6 +1,6 @@
[Unit]
Description=Scylla Server
After=network.target
After=network-online.target
Wants=scylla-jmx.service
Wants=scylla-housekeeping-restart.timer
Wants=scylla-housekeeping-daily.timer

View File

@@ -129,9 +129,11 @@ sed -i -e "s/@@CODENAME@@/$TARGET/g" debian/changelog
cp dist/debian/rules.in debian/rules
cp dist/debian/control.in debian/control
cp dist/debian/scylla-server.install.in debian/scylla-server.install
cp dist/debian/scylla-conf.preinst.in debian/scylla-conf.preinst
sed -i -e "s/@@VERSION@@/$SCYLLA_VERSION/g" debian/scylla-conf.preinst
if [ "$TARGET" = "jessie" ]; then
cp dist/debian/scylla-server.cron.d debian/
sed -i -e "s/@@REVISION@@/1/g" debian/changelog
sed -i -e "s/@@REVISION@@/1~$TARGET/g" debian/changelog
sed -i -e "s/@@DH_INSTALLINIT@@//g" debian/rules
sed -i -e "s/@@COMPILER@@/g++-5/g" debian/rules
sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++-5, libunwind-dev/g" debian/control
@@ -145,7 +147,7 @@ if [ "$TARGET" = "jessie" ]; then
sed -i -e "s#@@SCRIPTS_DELAY_FSTRIM@@#dist/debian/scripts/scylla_delay_fstrim usr/lib/scylla#g" debian/scylla-server.install
elif [ "$TARGET" = "stretch" ] || [ "$TARGET" = "buster" ] || [ "$TARGET" = "sid" ]; then
cp dist/debian/scylla-server.cron.d debian/
sed -i -e "s/@@REVISION@@/1/g" debian/changelog
sed -i -e "s/@@REVISION@@/1~$TARGET/g" debian/changelog
sed -i -e "s/@@DH_INSTALLINIT@@//g" debian/rules
sed -i -e "s/@@COMPILER@@/g++/g" debian/rules
sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++, libunwind8-dev/g" debian/control
@@ -159,7 +161,7 @@ elif [ "$TARGET" = "stretch" ] || [ "$TARGET" = "buster" ] || [ "$TARGET" = "sid
sed -i -e "s#@@SCRIPTS_DELAY_FSTRIM@@#dist/debian/scripts/scylla_delay_fstrim usr/lib/scylla#g" debian/scylla-server.install
elif [ "$TARGET" = "trusty" ]; then
cp dist/debian/scylla-server.cron.d debian/
sed -i -e "s/@@REVISION@@/0ubuntu1/g" debian/changelog
sed -i -e "s/@@REVISION@@/0ubuntu1~$TARGET/g" debian/changelog
sed -i -e "s/@@DH_INSTALLINIT@@/--upstart-only/g" debian/rules
sed -i -e "s/@@COMPILER@@/g++-5/g" debian/rules
sed -i -e "s/@@BUILD_DEPENDS@@/g++-5, libunwind8-dev/g" debian/control
@@ -172,7 +174,7 @@ elif [ "$TARGET" = "trusty" ]; then
sed -i -e "s#@@SCRIPTS_FSTRIM@@#dist/debian/scripts/scylla_fstrim usr/lib/scylla#g" debian/scylla-server.install
sed -i -e "s#@@SCRIPTS_DELAY_FSTRIM@@#dist/debian/scripts/scylla_delay_fstrim usr/lib/scylla#g" debian/scylla-server.install
elif [ "$TARGET" = "xenial" ] || [ "$TARGET" = "yakkety" ] || [ "$TARGET" = "zesty" ] || [ "$TARGET" = "artful" ]; then
sed -i -e "s/@@REVISION@@/0ubuntu1/g" debian/changelog
sed -i -e "s/@@REVISION@@/0ubuntu1~$TARGET/g" debian/changelog
sed -i -e "s/@@DH_INSTALLINIT@@//g" debian/rules
sed -i -e "s/@@COMPILER@@/g++/g" debian/rules
sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++, libunwind-dev/g" debian/control

View File

@@ -7,7 +7,8 @@ KVER=$(uname -r)
if [[ $KVER =~ 3\.13\.0\-([0-9]+)-generic ]]; then
echo "kernel $KVER detected, skipping sysctl..."
else
sysctl -p/etc/sysctl.d/99-scylla-sched.conf
# expect failures in virtualized environments
sysctl -p/etc/sysctl.d/99-scylla-sched.conf || :
fi
#DEBHELPER#

View File

@@ -3,12 +3,22 @@
set -e
if [ "$1" = configure ]; then
adduser --system \
--quiet \
--home /var/lib/scylla \
--no-create-home \
--disabled-password \
--group scylla
getent passwd scylla || NOUSR=1
getent group scylla || NOGRP=1
# user does not exist: adduser --group creates both the user and the group as needed
if [ $NOUSR ]; then
adduser --system \
--quiet \
--home /var/lib/scylla \
--no-create-home \
--disabled-password \
--group scylla
# user exists but the group does not: create the group and add the user to it
elif [ $NOGRP ]; then
addgroup --system scylla
adduser scylla scylla
fi
chown -R scylla:scylla /var/lib/scylla
chown -R scylla:scylla /var/lib/scylla-housekeeping
fi

dist/debian/scylla-conf.preinst.in (new file)
View File

@@ -0,0 +1,28 @@
#!/bin/bash
ver=$(dpkg -l|grep scylla-server|awk '{print $3}'|sed -e "s/-.*$//")
if [ -n "$ver" ]; then
ver_fmt=$(echo $ver | awk -F. '{printf "%d%02d%02d", $1,$2,$3}')
if [ $ver_fmt -lt 10703 ]; then
# for <scylla-1.2
if [ ! -f /usr/lib/scylla/scylla_config_get.py ]; then
echo
echo "Error: Upgrading from scylla-$ver to scylla-@@VERSION@@ is not supported."
echo "Please upgrade to scylla-1.7.3 or later, before upgrading to @@VERSION@@."
echo
exit 1
fi
commitlog_directory=$(/usr/lib/scylla/scylla_config_get.py -g commitlog_directory)
commitlog_files=$(ls $commitlog_directory | wc -l)
if [ $commitlog_files -ne 0 ]; then
echo
echo "Error: Upgrading from scylla-$ver to scylla-@@VERSION@@ is not supported when commitlog is not clean."
echo "Please upgrade to scylla-1.7.3 or later, before upgrading to @@VERSION@@."
echo "Also make sure $commitlog_directory is empty."
echo
exit 1
fi
fi
fi
#DEBHELPER#
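The `awk -F. '{printf "%d%02d%02d", ...}'` trick in the preinst script above packs "maj.min.patch" into one integer so a plain numeric comparison orders versions (valid while every component stays below 100); "1.7.3" becomes 10703, the threshold the script tests. The same packing in C++:

```cpp
#include <cstdio>

// Pack a dotted version string into a comparable integer key,
// mirroring the awk "%d%02d%02d" formatting in scylla-conf.preinst.in.
int version_key(const char* v) {
    int maj = 0, min = 0, patch = 0;
    std::sscanf(v, "%d.%d.%d", &maj, &min, &patch);
    return maj * 10000 + min * 100 + patch;
}
```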

View File

@@ -7,7 +7,7 @@ ENV container docker
VOLUME [ "/sys/fs/cgroup" ]
#install scylla
RUN curl http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo -o /etc/yum.repos.d/scylla.repo
RUN curl http://downloads.scylladb.com/rpm/centos/scylla-2.0.repo -o /etc/yum.repos.d/scylla.repo
RUN yum -y install epel-release
RUN yum -y clean expire-cache
RUN yum -y update

View File

@@ -70,5 +70,7 @@ class ScyllaSetup:
if self._experimental == "1":
args += [ "--experimental=on" ]
args += ["--blocked-reactor-notify-ms 999999999"]
with open("/etc/scylla.d/docker.conf", "w") as cqlshrc:
cqlshrc.write("SCYLLA_DOCKER_ARGS=\"%s\"\n" % " ".join(args))

View File

@@ -7,7 +7,7 @@ Group: Applications/Databases
License: AGPLv3
URL: http://www.scylladb.com/
Source0: %{name}-@@VERSION@@-@@RELEASE@@.tar
Requires: scylla-server scylla-jmx scylla-tools scylla-kernel-conf
Requires: scylla-server = @@VERSION@@ scylla-jmx = @@VERSION@@ scylla-tools = @@VERSION@@ scylla-kernel-conf = @@VERSION@@
Obsoletes: scylla-server < 1.1
%description

View File

@@ -59,8 +59,8 @@ enum class application_state {
TOKENS,
SUPPORTED_FEATURES,
CACHE_HITRATES,
SCHEMA_TABLES_VERSION,
// pad to allow adding new states to existing cluster
X3,
X4,
X5,
X6,

View File

@@ -43,6 +43,7 @@
#include "gms/endpoint_state.hh"
#include "gms/application_state.hh"
#include "gms/inet_address.hh"
#include "service/storage_service.hh"
#include "log.hh"
#include <iostream>
#include <chrono>
@@ -56,37 +57,13 @@ constexpr std::chrono::milliseconds failure_detector::DEFAULT_MAX_PAUSE;
using clk = arrival_window::clk;
static clk::duration get_initial_value() {
#if 0
String newvalue = System.getProperty("cassandra.fd_initial_value_ms");
if (newvalue == null)
{
return Gossiper.intervalInMillis * 2;
}
else
{
logger.info("Overriding FD INITIAL_VALUE to {}ms", newvalue);
return Integer.parseInt(newvalue);
}
#endif
warn(unimplemented::cause::GOSSIP);
return std::chrono::seconds(2);
auto& cfg = service::get_local_storage_service().db().local().get_config();
return std::chrono::milliseconds(cfg.fd_initial_value_ms());
}
clk::duration arrival_window::get_max_interval() {
#if 0
sstring newvalue = System.getProperty("cassandra.fd_max_interval_ms");
if (newvalue == null)
{
return failure_detector.INITIAL_VALUE_NANOS;
}
else
{
logger.info("Overriding FD MAX_INTERVAL to {}ms", newvalue);
return TimeUnit.NANOSECONDS.convert(Integer.parseInt(newvalue), TimeUnit.MILLISECONDS);
}
#endif
warn(unimplemented::cause::GOSSIP);
return get_initial_value();
auto& cfg = service::get_local_storage_service().db().local().get_config();
return std::chrono::milliseconds(cfg.fd_max_interval_ms());
}
void arrival_window::add(clk::time_point value, const gms::inet_address& ep) {
@@ -95,7 +72,7 @@ void arrival_window::add(clk::time_point value, const gms::inet_address& ep) {
if (inter_arrival_time <= get_max_interval()) {
_arrival_intervals.add(inter_arrival_time.count());
} else {
logger.debug("failure_detector: Ignoring interval time of {} for {}", inter_arrival_time.count(), ep);
logger.debug("failure_detector: Ignoring interval time of {} for {}, mean={}, size={}", inter_arrival_time.count(), ep, mean(), size());
}
} else {
// We use a very large initial interval since the "right" average depends on the cluster size
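The arrival-window bookkeeping in the hunk above can be sketched in a simplified, self-contained form (the real `arrival_window` is bounded and feeds the phi accrual failure detector, and the maximum now comes from `fd_max_interval_ms` in the config): inter-arrival gaps above the configured maximum are ignored rather than poisoning the mean.

```cpp
#include <deque>
#include <numeric>

// Simplified stand-in for arrival_window: record heartbeat gaps,
// discarding outliers above max_interval_ms as the hunk above does.
struct arrival_window {
    long max_interval_ms;
    std::deque<long> intervals;
    long last_ms = -1;

    void add(long now_ms) {
        if (last_ms >= 0) {
            long gap = now_ms - last_ms;
            if (gap <= max_interval_ms) {
                intervals.push_back(gap);   // normal heartbeat gap
            }                               // else: ignored, as logged above
        }
        last_ms = now_ms;
    }

    double mean() const {
        if (intervals.empty()) return 0.0;
        return std::accumulate(intervals.begin(), intervals.end(), 0.0)
             / intervals.size();
    }
};
```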

View File

@@ -87,6 +87,8 @@ public:
// see CASSANDRA-2597 for an explanation of the math at work here.
double phi(clk::time_point tnow);
size_t size() { return _arrival_intervals.size(); }
friend std::ostream& operator<<(std::ostream& os, const arrival_window& w);
};

View File

@@ -590,8 +590,11 @@ void gossiper::run() {
/* Gossip to some random live members */
// TODO: For now, we choose 10th of all the nodes in the cluster.
auto nr_live_nodes = std::max(size_t(1), endpoint_state_map.size() / 10);
nr_live_nodes = std::min(nr_live_nodes, _live_endpoints.size());
std::unordered_set<gms::inet_address> live_nodes;
while (live_nodes.size() < nr_live_nodes && !_live_endpoints.empty()) {
logger.debug("nr_live_nodes={}, endpoint_state_map.size()={}, live_endpoints.size={}",
nr_live_nodes, endpoint_state_map.size(), _live_endpoints.size());
while (live_nodes.size() < nr_live_nodes && nr_live_nodes <= _live_endpoints.size()) {
if (!_live_endpoints_just_added.empty()) {
auto ep = _live_endpoints_just_added.front();
_live_endpoints_just_added.pop_front();
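The target-fanout arithmetic from `gossiper::run()` above, as a simplified stand-in: gossip to roughly a tenth of the known endpoints, but at least one, capped by the currently live set.

```cpp
#include <algorithm>
#include <cstddef>

// Sketch of the nr_live_nodes computation: max(1, n/10), never more
// than the number of live endpoints available to pick from.
std::size_t gossip_fanout(std::size_t known_endpoints, std::size_t live_endpoints) {
    std::size_t nr = std::max<std::size_t>(1, known_endpoints / 10);
    return std::min(nr, live_endpoints);
}
```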

View File

@@ -27,8 +27,9 @@ class schema_mutations {
canonical_mutation columnfamilies_canonical_mutation();
canonical_mutation columns_canonical_mutation();
bool is_view()[[version 1.6]];
std::experimental::optional<canonical_mutation> indices_canonical_mutation()[[version 1.9]];
std::experimental::optional<canonical_mutation> dropped_columns_canonical_mutation()[[version 1.9]];
std::experimental::optional<canonical_mutation> indices_canonical_mutation()[[version 2.0]];
std::experimental::optional<canonical_mutation> dropped_columns_canonical_mutation()[[version 2.0]];
std::experimental::optional<canonical_mutation> scylla_tables_canonical_mutation()[[version 2.0]];
};
class schema stub [[writable]] {

View File

@@ -182,6 +182,9 @@ public:
static TopLevel from_exploded(const schema& s, const std::vector<bytes>& v) {
return from_exploded(v);
}
static TopLevel from_exploded_view(const std::vector<bytes_view>& v) {
return from_exploded(v);
}
// We don't allow optional values, but provide this method as an efficient adaptor
static TopLevel from_optional_exploded(const schema& s, const std::vector<bytes_opt>& v) {

main.cc
View File

@@ -59,6 +59,8 @@ thread_local disk_error_signal_type commit_error;
thread_local disk_error_signal_type general_disk_error;
seastar::metrics::metric_groups app_metrics;
using namespace std::chrono_literals;
namespace bpo = boost::program_options;
static boost::filesystem::path relative_conf_dir(boost::filesystem::path path) {
@@ -277,7 +279,10 @@ int main(int ac, char** av) {
}
runtime::init_uptime();
std::setvbuf(stdout, nullptr, _IOLBF, 1000);
app_template app;
app_template::config app_cfg;
app_cfg.name = "Scylla";
app_cfg.default_task_quota = 500us;
app_template app(std::move(app_cfg));
auto opt_add = app.add_options();
auto cfg = make_lw_shared<db::config>();
@@ -534,7 +539,7 @@ int main(int ac, char** av) {
db::system_keyspace::minimal_setup(db, qp);
// schema migration, if needed, is also done on shard 0
db::legacy_schema_migrator::migrate(qp.local()).get();
db::legacy_schema_migrator::migrate(proxy, qp.local()).get();
supervisor::notify("loading sstables");
@@ -625,13 +630,13 @@ int main(int ac, char** av) {
lb->start_broadcasting();
service::get_local_storage_service().set_load_broadcaster(lb);
engine().at_exit([lb = std::move(lb)] () mutable { return lb->stop_broadcasting(); });
supervisor::notify("starting cf cache hit rate calculator");
cf_cache_hitrate_calculator.start(std::ref(db), std::ref(cf_cache_hitrate_calculator)).get();
engine().at_exit([&cf_cache_hitrate_calculator] { return cf_cache_hitrate_calculator.stop(); });
cf_cache_hitrate_calculator.local().run_on(engine().cpu_id());
supervisor::notify("starting native transport");
gms::get_local_gossiper().wait_for_gossip_to_settle();
gms::get_local_gossiper().wait_for_gossip_to_settle().get();
api::set_server_gossip_settle(ctx).get();
supervisor::notify("starting cf cache hit rate calculator");
supervisor::notify("starting native transport");
service::get_local_storage_service().start_native_transport().get();
if (start_thrift) {
service::get_local_storage_service().start_rpc_server().get();

View File

@@ -29,11 +29,13 @@
#include "sstables/sstables.hh"
#include <seastar/core/future.hh>
#include <seastar/core/file.hh>
#include <seastar/core/thread.hh>
future<>
write_memtable_to_sstable(memtable& mt,
sstables::shared_sstable sst,
bool backup = false,
const io_priority_class& pc = default_priority_class(),
bool leave_unsealed = false);
bool leave_unsealed = false,
seastar::thread_scheduling_group* tsg = nullptr);

View File

@@ -835,7 +835,7 @@ future<> messaging_service::send_definitions_update(msg_addr id, std::vector<fro
return send_message_oneway(this, messaging_verb::DEFINITIONS_UPDATE, std::move(id), std::move(fm));
}
void messaging_service::register_migration_request(std::function<future<std::vector<frozen_mutation>> ()>&& func) {
void messaging_service::register_migration_request(std::function<future<std::vector<frozen_mutation>> (const rpc::client_info&)>&& func) {
register_handler(this, netw::messaging_verb::MIGRATION_REQUEST, std::move(func));
}
void messaging_service::unregister_migration_request() {

View File

@@ -288,7 +288,7 @@ public:
future<> send_definitions_update(msg_addr id, std::vector<frozen_mutation> fm);
// Wrapper for MIGRATION_REQUEST
void register_migration_request(std::function<future<std::vector<frozen_mutation>> ()>&& func);
void register_migration_request(std::function<future<std::vector<frozen_mutation>> (const rpc::client_info&)>&& func);
void unregister_migration_request();
future<std::vector<frozen_mutation>> send_migration_request(msg_addr id);

View File

@@ -932,15 +932,6 @@ rows_entry::equal(const schema& s, const rows_entry& other) const {
return equal(s, other, s);
}
position_in_partition_view rows_entry::position() const {
if (_flags._last) {
return position_in_partition_view::after_all_clustered_rows();
} else {
return position_in_partition_view(
position_in_partition_view::clustering_row_tag_t(), _key);
}
}
bool
rows_entry::equal(const schema& s, const rows_entry& other, const schema& other_schema) const {
position_in_partition::equal_compare eq(s);
@@ -2119,7 +2110,7 @@ public:
mutation_partition::mutation_partition(mutation_partition::incomplete_tag, const schema& s, tombstone t)
: _tombstone(t)
, _static_row_continuous(false)
, _static_row_continuous(!s.has_static_columns())
, _rows()
, _row_tombstones(s)
{

View File

@@ -712,7 +712,15 @@ public:
const deletable_row& row() const {
return _row;
}
position_in_partition_view position() const;
position_in_partition_view position() const {
if (_flags._last) {
return position_in_partition_view::after_all_clustered_rows();
} else {
return position_in_partition_view(
position_in_partition_view::clustering_row_tag_t(), _key);
}
}
is_continuous continuous() const { return is_continuous(_flags._continuous); }
void set_continuous(bool value) { _flags._continuous = value; }
void set_continuous(is_continuous value) { set_continuous(bool(value)); }

View File

@@ -105,7 +105,7 @@ partition_slice_builder::with_regular_column(bytes name) {
throw std::runtime_error(sprint("No such column: %s", _schema.regular_column_name_type()->to_string(name)));
}
if (!def->is_regular()) {
throw std::runtime_error(sprint("Column is not regular: %s", _schema.regular_column_name_type()->to_string(name)));
throw std::runtime_error(sprint("Column is not regular: %s", _schema.column_name_type(*def)->to_string(name)));
}
_regular_columns->push_back(def->id);
return *this;

View File

@@ -168,12 +168,14 @@ public:
const clustering_key& key() const { return _current_row[0].it->key(); }
// Can be called only when cursor is valid and pointing at a row.
clustering_row row() const {
clustering_row result(key());
for (auto&& v : _current_row) {
result.apply(_schema, *v.it);
mutation_fragment row() const {
auto it = _current_row.begin();
auto mf = mutation_fragment(clustering_row(*it->it));
auto& cr = mf.as_mutable_clustering_row();
for (++it; it != _current_row.end(); ++it) {
cr.apply(_schema, *it->it);
}
return result;
return mf;
}
// Can be called when cursor is pointing at a row, even when invalid.
@@ -198,8 +200,8 @@ bool partition_snapshot_row_cursor::previous_row_in_latest_version_has_key(const
}
auto prev_it = _current_row[0].it;
--prev_it;
clustering_key_prefix::tri_compare tri_comp(_schema);
return tri_comp(prev_it->key(), key) == 0;
clustering_key_prefix::equality eq(_schema);
return eq(prev_it->key(), key);
}
inline

View File

@@ -352,10 +352,10 @@ public:
return *this;
}
}
template<typename Transformer, typename U = typename std::result_of<Transformer(T)>::type>
static stdx::optional<typename wrapping_range<U>::bound> transform_bound(optional<bound> b, Transformer&& transformer) {
template<typename Bound, typename Transformer, typename U = typename std::result_of<Transformer(T)>::type>
static stdx::optional<typename wrapping_range<U>::bound> transform_bound(Bound&& b, Transformer&& transformer) {
if (b) {
return { { transformer(std::move(*b).value()), b->is_inclusive() } };
return { { transformer(std::forward<Bound>(b).value().value()), b->is_inclusive() } };
};
return {};
}
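The hunk above switches `transform_bound` to a forwarding reference because taking `optional<bound>` by value forces a copy even when the caller can give up ownership. A simplified stand-in (using `std::optional<std::string>` for the bound payload; not Scylla's actual `wrapping_range` code) shows the same forwarding pattern:

```cpp
#include <optional>
#include <string>
#include <utility>

// Forwarding-reference version: when the caller passes an rvalue,
// std::forward<Opt>(o).value() is an rvalue too, so f can steal the
// payload instead of copying it.
template <typename Opt, typename F>
auto transform_opt(Opt&& o, F&& f)
    -> std::optional<decltype(f(std::forward<Opt>(o).value()))> {
    if (o) {
        return f(std::forward<Opt>(o).value());
    }
    return std::nullopt;
}
```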

View File

@@ -71,7 +71,11 @@ public:
_range = std::move(*new_range);
_last_key = {};
}
if (_reader) {
++_cache._tracker._stats.underlying_recreations;
}
auto& snap = _cache.snapshot_for_phase(phase);
_reader = {}; // See issue #2644
_reader = _cache.create_underlying_reader(_read_context, snap, _range);
_reader_creation_phase = phase;
}
@@ -90,8 +94,14 @@ public:
_range = std::move(range);
_last_key = { };
_new_last_key = { };
if (_reader && _reader_creation_phase == phase) {
return _reader->fast_forward_to(_range);
if (_reader) {
if (_reader_creation_phase == phase) {
++_cache._tracker._stats.underlying_partition_skips;
return _reader->fast_forward_to(_range);
} else {
++_cache._tracker._stats.underlying_recreations;
_reader = {}; // See issue #2644
}
}
_reader = _cache.create_underlying_reader(_read_context, snapshot, _range);
_reader_creation_phase = phase;
@@ -121,6 +131,7 @@ class read_context final : public enable_lw_shared_from_this<read_context> {
mutation_reader::forwarding _fwd_mr;
bool _range_query;
autoupdating_underlying_reader _underlying;
uint64_t _underlying_created = 0;
// When reader enters a partition, it must be set up for reading that
// partition from the underlying mutation source (_sm) in one of two ways:
@@ -155,7 +166,18 @@ public:
, _fwd_mr(fwd_mr)
, _range_query(!range.is_singular() || !range.start()->value().has_key())
, _underlying(_cache, *this)
{ }
{
++_cache._tracker._stats.reads;
}
~read_context() {
++_cache._tracker._stats.reads_done;
if (_underlying_created) {
_cache._stats.reads_with_misses.mark();
++_cache._tracker._stats.reads_with_misses;
} else {
_cache._stats.reads_with_no_misses.mark();
}
}
read_context(const read_context&) = delete;
row_cache& cache() { return _cache; }
const schema_ptr& schema() const { return _schema; }
@@ -169,6 +191,7 @@ public:
autoupdating_underlying_reader& underlying() { return _underlying; }
row_cache::phase_type phase() const { return _phase; }
const dht::decorated_key& key() const { return _sm->decorated_key(); }
void on_underlying_created() { ++_underlying_created; }
private:
future<> create_sm();
future<> ensure_sm_created() {
@@ -198,9 +221,17 @@ public:
// Fast forwards the underlying streamed_mutation to given range.
future<> fast_forward_to(position_range range) {
return ensure_sm_created().then([this, range = std::move(range)] () mutable {
++_cache._tracker._stats.underlying_row_skips;
return _sm->fast_forward_to(std::move(range));
});
}
// Returns the underlying streamed_mutation.
// The caller has to ensure that the streamed mutation was already created
// (e.g. the most recent call to enter_partition(const dht::decorated_key&, ...)
// was followed by a call to fast_forward_to()).
streamed_mutation& get_streamed_mutation() noexcept {
return *_sm;
}
// Gets the next fragment from the underlying streamed_mutation
future<mutation_fragment_opt> get_next_fragment() {
return ensure_sm_created().then([this] {

View File

@@ -41,11 +41,6 @@
static logging::logger rlogger("repair");
struct failed_range {
sstring cf;
::dht::token_range range;
};
class repair_info {
public:
seastar::sharded<database>& db;
@@ -56,7 +51,7 @@ public:
shard_id shard;
std::vector<sstring> data_centers;
std::vector<sstring> hosts;
std::vector<failed_range> failed_ranges;
size_t nr_failed_ranges = 0;
// Map of peer -> <cf, ranges>
std::unordered_map<gms::inet_address, std::unordered_map<sstring, dht::token_range_vector>> ranges_need_repair_in;
std::unordered_map<gms::inet_address, std::unordered_map<sstring, dht::token_range_vector>> ranges_need_repair_out;
@@ -132,14 +127,11 @@ public:
});
}
void check_failed_ranges() {
if (failed_ranges.empty()) {
rlogger.info("repair {} on shard {} completed successfully", id, shard);
if (nr_failed_ranges) {
rlogger.info("repair {} on shard {} failed - {} ranges failed", id, shard, nr_failed_ranges);
throw std::runtime_error(sprint("repair %d on shard %d failed to do checksum for %d sub ranges", id, shard, nr_failed_ranges));
} else {
rlogger.info("repair {} on shard {} failed - {} ranges failed", id, shard, failed_ranges.size());
for (auto& frange: failed_ranges) {
rlogger.info("repair cf {} range {} failed", frange.cf, frange.range);
}
throw std::runtime_error(sprint("repair %d on shard %d failed to do checksum for %d sub ranges", id, shard, failed_ranges.size()));
rlogger.info("repair {} on shard {} completed successfully", id, shard);
}
}
future<> request_transfer_ranges(const sstring& cf,
@@ -504,6 +496,19 @@ static future<partition_checksum> checksum_range_shard(database &db,
});
}
// It is counter-productive to allow a large number of range checksum
// operations to proceed in parallel (on the same shard), because the read
// operation can already parallelize itself as much as needed, and doing
// multiple reads in parallel just adds a lot of memory overheads.
// So checksum_parallelism_semaphore is used to limit this parallelism,
// and should be set to 1, or another small number.
//
// Note that checksum_parallelism_semaphore applies not just in the
// repair master, but also in the slave: The repair slave may receive many
// checksum requests in parallel, but will only work on one or a few
// (checksum_parallelism_semaphore) at once.
static thread_local semaphore checksum_parallelism_semaphore(2);
// Calculate the checksum of the data held on all shards of a column family,
// in the given token range.
// In practice, we only need to consider one or two shards which intersect the
@@ -526,7 +531,9 @@ future<partition_checksum> checksum_range(seastar::sharded<database> &db,
auto& prs = shard_range.second;
return db.invoke_on(shard, [keyspace, cf, prs = std::move(prs), hash_version] (database& db) mutable {
return do_with(std::move(keyspace), std::move(cf), std::move(prs), [&db, hash_version] (auto& keyspace, auto& cf, auto& prs) {
return checksum_range_shard(db, keyspace, cf, prs, hash_version);
return seastar::with_semaphore(checksum_parallelism_semaphore, 1, [&db, hash_version, &keyspace, &cf, &prs] {
return checksum_range_shard(db, keyspace, cf, prs, hash_version);
});
});
}).then([&result] (partition_checksum sum) {
result.add(sum);
@@ -537,14 +544,15 @@ future<partition_checksum> checksum_range(seastar::sharded<database> &db,
});
}
// We don't need to wait for one checksum to finish before we start the
// next, but doing too many of these operations in parallel also doesn't
// make sense, so we limit the number of concurrent ongoing checksum
// requests with a semaphore.
//
// FIXME: We shouldn't use a magic number here, but rather bind it to
// some resource. Otherwise we'll be doing too little in some machines,
// and too much in others.
// parallelism_semaphore limits the number of parallel ongoing checksum
// comparisons. This could mean, for example, that this number of checksum
// requests have been sent to other nodes and we are waiting for them to
// return so we can compare those to our own checksums. This limit can be
// set fairly high because the outstanding comparisons take only a few
// resources. In particular, we do NOT do this number of file reads in
// parallel because file reads have large memory overheads (read buffers,
// partitions, etc.) - the number of concurrent reads is further limited
// by an additional semaphore checksum_parallelism_semaphore (see above).
//
// FIXME: This would be better off in a repair service, or even a per-shard
// repair instance holding all repair state. However, since we are anyway
@@ -576,7 +584,6 @@ static future<uint64_t> estimate_partitions(seastar::sharded<database>& db, cons
static future<> repair_cf_range(repair_info& ri,
sstring cf, ::dht::token_range range,
const std::vector<gms::inet_address>& neighbors) {
ri.ranges_index++;
if (neighbors.empty()) {
// Nothing to do in this case...
return make_ready_future<>();
@@ -584,8 +591,6 @@ static future<> repair_cf_range(repair_info& ri,
return estimate_partitions(ri.db, ri.keyspace, cf, range).then([&ri, cf, range, &neighbors] (uint64_t estimated_partitions) {
range_splitter ranges(range, estimated_partitions, ri.target_partitions);
rlogger.info("Repair {} out of {} ranges, id={}, shard={}, keyspace={}, cf={}, range={}, target_partitions={}, estimated_partitions={}",
ri.ranges_index, ri.ranges.size(), ri.id, ri.shard, ri.keyspace, cf, range, ri.target_partitions, estimated_partitions);
return do_with(seastar::gate(), true, std::move(cf), std::move(ranges),
[&ri, &neighbors] (auto& completion, auto& success, const auto& cf, auto& ranges) {
return do_until([&ranges] () { return !ranges.has_next(); },
@@ -626,7 +631,7 @@ static future<> repair_cf_range(repair_info& ri,
utils::fb_utilities::get_broadcast_address()),
checksums[i].get_exception());
success = false;
ri.failed_ranges.push_back(failed_range{cf, range});
ri.nr_failed_ranges++;
// Do not break out of the loop here, so we can log
// (and discard) all the exceptions.
} else if (i > 0) {
@@ -751,7 +756,7 @@ static future<> repair_cf_range(repair_info& ri,
// any case, we need to remember that the repair failed to
// tell the caller.
success = false;
ri.failed_ranges.push_back(failed_range{cf, range});
ri.nr_failed_ranges++;
rlogger.warn("Failed sync of range {}: {}", range, eptr);
}).finally([&completion] {
parallelism_semaphore.signal(1);
@@ -997,8 +1002,22 @@ static future<> repair_ranges(repair_info ri) {
// repair all the ranges in sequence
return do_for_each(ri.ranges, [&ri] (auto&& range) {
#endif
check_in_shutdown();
return repair_range(ri, range);
ri.ranges_index++;
rlogger.info("Repair {} out of {} ranges, id={}, shard={}, keyspace={}, table={}, range={}",
ri.ranges_index, ri.ranges.size(), ri.id, ri.shard, ri.keyspace, ri.cfs, range);
return do_with(dht::selective_token_range_sharder(range, ri.shard), [&ri] (auto& sharder) {
return repeat([&ri, &sharder] () {
check_in_shutdown();
auto range_shard = sharder.next();
if (range_shard) {
return repair_range(ri, *range_shard).then([] {
return make_ready_future<stop_iteration>(stop_iteration::no);
});
} else {
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
});
});
}).then([&ri] {
// Do streaming for the remaining ranges we do not stream in
// repair_cf_range
@@ -1013,27 +1032,6 @@ static future<> repair_ranges(repair_info ri) {
});
}
static void split_and_add(std::vector<::dht::token_range>& ranges,
const dht::token_range& range) {
// The use of minimum_token() here twice is not a typo - because wrap-
// around token ranges are supported by midpoint(), the beyond-maximum
// token can also be represented by minimum_token().
auto midpoint = dht::global_partitioner().midpoint(
range.start() ? range.start()->value() : dht::minimum_token(),
range.end() ? range.end()->value() : dht::minimum_token());
// This shouldn't happen, but if the range included just one token, we
// can't split further (split() may actually fail with assertion failure)
if ((range.start() && midpoint == range.start()->value()) ||
(range.end() && midpoint == range.end()->value())) {
ranges.push_back(range);
return;
}
auto halves = range.split(midpoint, dht::token_comparator());
ranges.push_back(halves.first);
ranges.push_back(halves.second);
}
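The `split_and_add` shown in the hunk above halves a token range at its midpoint; this commit removes it (together with the splitting loop in `do_repair_start` further below) in favor of a per-shard range sharder. For reference, the removed split-until-enough-ranges logic can be sketched over plain half-open integer ranges — `int_range` and `split_for_shards` are illustrative stand-ins, not Scylla APIs, and wrap-around token ranges are ignored:

```cpp
#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

using int_range = std::pair<long, long>; // [start, end) — simplified token_range

// Split a range at its midpoint, as split_and_add did; if the range
// cannot be split any further (a single element), keep it whole.
void split_and_add(std::vector<int_range>& out, const int_range& r) {
    long mid = r.first + (r.second - r.first) / 2;
    if (mid == r.first || mid == r.second) { // can't split further
        out.push_back(r);
        return;
    }
    out.push_back({r.first, mid});
    out.push_back({mid, r.second});
}

// Keep splitting until there are at least `shards` ranges, or no range
// can be split anymore — mirroring the removed loop in do_repair_start.
std::vector<int_range> split_for_shards(std::vector<int_range> ranges, size_t shards) {
    while (ranges.size() < shards) {
        size_t sz = ranges.size();
        std::vector<int_range> tosplit;
        ranges.swap(tosplit);
        for (const auto& r : tosplit) {
            split_and_add(ranges, r);
        }
        if (ranges.size() == sz) {
            break; // nothing could be split
        }
    }
    return ranges;
}
```

Note the total span is preserved across splits, and an unsplittable single-element range passes through unchanged.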
// repair_start() can run on any cpu; It runs on cpu0 the function
// do_repair_start(). The benefit of always running that function on the same
// CPU is that it allows us to keep some state (like a list of ongoing
@@ -1125,35 +1123,12 @@ static int do_repair_start(seastar::sharded<database>& db, sstring keyspace,
cfs = list_column_families(db.local(), keyspace);
}
// Split the ranges so that we have more number of ranges than smp::count
// Note, the split is not guaranteed when the range cannot be split anymore.
dht::token_range_vector tosplit;
while (ranges.size() < smp::count) {
size_t sz = ranges.size();
tosplit.clear();
ranges.swap(tosplit);
for (const auto& range : tosplit) {
split_and_add(ranges, range);
}
if (sz == ranges.size()) {
// We can not split the ranges anymore
break;
}
}
std::map<shard_id, dht::token_range_vector> shard_ranges_map;
unsigned idx = 0;
for (auto& range : ranges) {
shard_ranges_map[idx++ % smp::count].push_back(std::move(range));
}
std::vector<future<>> repair_results;
repair_results.reserve(shard_ranges_map.size());
repair_results.reserve(smp::count);
for (auto& x : shard_ranges_map) {
shard_id shard = x.first;
auto& ranges = x.second;
auto f = db.invoke_on(shard, [keyspace, cfs, id, ranges = std::move(ranges),
for (auto shard : boost::irange(unsigned(0), smp::count)) {
auto f = db.invoke_on(shard, [keyspace, cfs, id, ranges,
data_centers = options.data_centers, hosts = options.hosts] (database& localdb) mutable {
return repair_ranges(repair_info(service::get_local_storage_service().db(),
std::move(keyspace), std::move(ranges), std::move(cfs),


@@ -46,6 +46,7 @@ thread_local seastar::thread_scheduling_group row_cache::_update_thread_scheduli
mutation_reader
row_cache::create_underlying_reader(read_context& ctx, mutation_source& src, const dht::partition_range& pr) {
ctx.on_underlying_created();
return src(_schema, pr, ctx.slice(), ctx.pc(), ctx.trace_state(), streamed_mutation::forwarding::yes);
}
@@ -74,7 +75,7 @@ cache_tracker::cache_tracker() {
}
evict_last(_lru);
--_stats.partitions;
++_stats.evictions;
++_stats.partition_evictions;
++_stats.modification_count;
return memory::reclaiming_result::reclaimed_something;
} catch (std::bad_alloc&) {
@@ -98,15 +99,24 @@ cache_tracker::setup_metrics() {
_metrics.add_group("cache", {
sm::make_gauge("bytes_used", sm::description("current bytes used by the cache out of the total size of memory"), [this] { return _region.occupancy().used_space(); }),
sm::make_gauge("bytes_total", sm::description("total size of memory for the cache"), [this] { return _region.occupancy().total_space(); }),
sm::make_derive("total_operations_hits", sm::description("total number of operation hits"), _stats.hits),
sm::make_derive("total_operations_misses", sm::description("total number of operation misses"), _stats.misses),
sm::make_derive("total_operations_insertions", sm::description("total number of operation insert"), _stats.insertions),
sm::make_derive("total_operations_concurrent_misses_same_key", sm::description("total number of operation with misses same key"), _stats.concurrent_misses_same_key),
sm::make_derive("total_operations_merges", sm::description("total number of operation merged"), _stats.merges),
sm::make_derive("total_operations_evictions", sm::description("total number of operation eviction"), _stats.evictions),
sm::make_derive("total_operations_removals", sm::description("total number of operation removals"), _stats.removals),
sm::make_derive("total_operations_mispopulations", sm::description("number of entries not inserted by reads"), _stats.mispopulations),
sm::make_gauge("objects_partitions", sm::description("total number of partition objects"), _stats.partitions)
sm::make_derive("partition_hits", sm::description("number of partitions needed by reads and found in cache"), _stats.partition_hits),
sm::make_derive("partition_misses", sm::description("number of partitions needed by reads and missing in cache"), _stats.partition_misses),
sm::make_derive("partition_insertions", sm::description("total number of partitions added to cache"), _stats.partition_insertions),
sm::make_derive("row_hits", sm::description("total number of rows needed by reads and found in cache"), _stats.row_hits),
sm::make_derive("row_misses", sm::description("total number of rows needed by reads and missing in cache"), _stats.row_misses),
sm::make_derive("row_insertions", sm::description("total number of rows added to cache"), _stats.row_insertions),
sm::make_derive("concurrent_misses_same_key", sm::description("total number of operation with misses same key"), _stats.concurrent_misses_same_key),
sm::make_derive("partition_merges", sm::description("total number of partitions merged"), _stats.partition_merges),
sm::make_derive("partition_evictions", sm::description("total number of evicted partitions"), _stats.partition_evictions),
sm::make_derive("partition_removals", sm::description("total number of invalidated partitions"), _stats.partition_removals),
sm::make_derive("mispopulations", sm::description("number of entries not inserted by reads"), _stats.mispopulations),
sm::make_gauge("partitions", sm::description("total number of cached partitions"), _stats.partitions),
sm::make_derive("reads", sm::description("number of started reads"), _stats.reads),
sm::make_derive("reads_with_misses", sm::description("number of reads which had to read from sstables"), _stats.reads_with_misses),
sm::make_gauge("active_reads", sm::description("number of currently active reads"), [this] { return _stats.active_reads(); }),
sm::make_derive("sstable_reader_recreations", sm::description("number of times sstable reader was recreated due to memtable flush"), _stats.underlying_recreations),
sm::make_derive("sstable_partition_skips", sm::description("number of times sstable reader was fast forwarded across partitions"), _stats.underlying_partition_skips),
sm::make_derive("sstable_row_skips", sm::description("number of times sstable reader was fast forwarded within a partition"), _stats.underlying_row_skips),
});
}
@@ -127,7 +137,7 @@ void cache_tracker::clear() {
};
clear(_lru);
});
_stats.removals += _stats.partitions;
_stats.partition_removals += _stats.partitions;
_stats.partitions = 0;
++_stats.modification_count;
}
@@ -141,7 +151,7 @@ void cache_tracker::touch(cache_entry& e) {
}
void cache_tracker::insert(cache_entry& entry) {
++_stats.insertions;
++_stats.partition_insertions;
++_stats.partitions;
++_stats.modification_count;
_lru.push_front(entry);
@@ -149,20 +159,28 @@ void cache_tracker::insert(cache_entry& entry) {
void cache_tracker::on_erase() {
--_stats.partitions;
++_stats.removals;
++_stats.partition_removals;
++_stats.modification_count;
}
void cache_tracker::on_merge() {
++_stats.merges;
++_stats.partition_merges;
}
void cache_tracker::on_hit() {
++_stats.hits;
void cache_tracker::on_partition_hit() {
++_stats.partition_hits;
}
void cache_tracker::on_miss() {
++_stats.misses;
void cache_tracker::on_partition_miss() {
++_stats.partition_misses;
}
void cache_tracker::on_row_hit() {
++_stats.row_hits;
}
void cache_tracker::on_row_miss() {
++_stats.row_misses;
}
void cache_tracker::on_mispopulate() {
@@ -348,14 +366,30 @@ void cache_tracker::clear_continuity(cache_entry& ce) {
ce.set_continuous(false);
}
void row_cache::on_hit() {
_stats.hits.mark();
_tracker.on_hit();
void row_cache::on_partition_hit() {
_tracker.on_partition_hit();
}
void row_cache::on_miss() {
void row_cache::on_partition_miss() {
_tracker.on_partition_miss();
}
void row_cache::on_row_hit() {
_stats.hits.mark();
_tracker.on_row_hit();
}
void row_cache::on_mispopulate() {
_tracker.on_mispopulate();
}
void row_cache::on_row_miss() {
_stats.misses.mark();
_tracker.on_miss();
_tracker.on_row_miss();
}
void row_cache::on_row_insert() {
++_tracker._stats.row_insertions;
}
class range_populating_reader {
@@ -369,6 +403,7 @@ private:
}
void handle_end_of_stream() {
if (!can_set_continuity()) {
_cache.on_mispopulate();
return;
}
if (!_reader.range().end() || !_reader.range().end()->is_inclusive()) {
@@ -379,11 +414,15 @@ private:
if (it == _cache._partitions.begin()) {
if (!_last_key->_key) {
it->set_continuous(true);
} else {
_cache.on_mispopulate();
}
} else {
auto prev = std::prev(it);
if (prev->key().equal(*_cache._schema, *_last_key->_key)) {
it->set_continuous(true);
} else {
_cache.on_mispopulate();
}
}
}
@@ -403,17 +442,17 @@ public:
handle_end_of_stream();
return std::move(smopt);
}
_cache.on_miss();
_cache.on_partition_miss();
if (_reader.creation_phase() == _cache.phase_of(smopt->decorated_key())) {
return _cache._read_section(_cache._tracker.region(), [&] {
cache_entry& e = _cache.find_or_create(smopt->decorated_key(), smopt->partition_tombstone(), _reader.creation_phase(),
can_set_continuity() ? &*_last_key : nullptr);
_last_key = smopt->decorated_key();
_last_key = row_cache::previous_entry_pointer(smopt->decorated_key());
return e.read(_cache, _read_context, std::move(*smopt), _reader.creation_phase());
});
} else {
_cache._tracker.on_mispopulate();
_last_key = smopt->decorated_key();
_last_key = row_cache::previous_entry_pointer(smopt->decorated_key());
return read_directly_from_underlying(std::move(*smopt), _read_context);
}
}
@@ -424,7 +463,7 @@ public:
if (!pr.start()) {
_last_key = row_cache::previous_entry_pointer();
} else if (!pr.start()->is_inclusive() && pr.start()->value().has_key()) {
_last_key = pr.start()->value().as_decorated_key();
_last_key = row_cache::previous_entry_pointer(pr.start()->value().as_decorated_key());
} else {
// Inclusive start bound, cannot set continuity flag.
_last_key = {};
@@ -448,7 +487,7 @@ private:
streamed_mutation read_from_entry(cache_entry& ce) {
_cache.upgrade_entry(ce);
_cache._tracker.touch(ce);
_cache.on_hit();
_cache.on_partition_hit();
return ce.read(_cache, *_read_context);
}
@@ -469,7 +508,7 @@ private:
}
cache_entry& e = _primary.entry();
auto sm = read_from_entry(e);
_lower_bound = {e.key(), false};
_lower_bound = dht::partition_range::bound{e.key(), false};
// Delay the call to next() so that we don't see stale continuity on next invocation.
_advance_primary = true;
return streamed_mutation_opt(std::move(sm));
@@ -478,7 +517,7 @@ private:
cache_entry& e = _primary.entry();
_secondary_range = dht::partition_range(_lower_bound ? std::move(_lower_bound) : _pr->start(),
dht::partition_range::bound{e.key(), false});
_lower_bound = {e.key(), true};
_lower_bound = dht::partition_range::bound{e.key(), true};
_secondary_in_progress = true;
return stdx::nullopt;
} else {
@@ -487,7 +526,7 @@ private:
if (!range) {
return stdx::nullopt;
}
_lower_bound = {dht::ring_position::max()};
_lower_bound = dht::partition_range::bound{dht::ring_position::max()};
_secondary_range = std::move(*range);
_secondary_in_progress = true;
return stdx::nullopt;
@@ -570,10 +609,10 @@ row_cache::make_reader(schema_ptr s,
cache_entry& e = *i;
_tracker.touch(e);
upgrade_entry(e);
on_hit();
on_partition_hit();
return make_reader_returning(e.read(*this, *ctx));
} else {
on_miss();
on_partition_miss();
return make_mutation_reader<single_partition_populating_reader>(*this, std::move(ctx));
}
});
@@ -629,6 +668,8 @@ cache_entry& row_cache::do_find_or_create_entry(const dht::decorated_key& key,
|| (previous->_key && i != _partitions.begin()
&& std::prev(i)->key().equal(*_schema, *previous->_key))) {
i->set_continuous(true);
} else {
on_mispopulate();
}
return *i;
@@ -642,6 +683,7 @@ cache_entry& row_cache::find_or_create(const dht::decorated_key& key, tombstone
_tracker.insert(*entry);
return _partitions.insert(i, *entry);
}, [&] (auto i) { // visit
_tracker.on_miss_already_populated();
cache_entry& e = *i;
e.partition().open_version(*e.schema(), phase).partition().apply(t);
_tracker.touch(e);
@@ -760,7 +802,7 @@ future<> row_cache::do_update(memtable& m, Updater updater) {
if (m.partitions.empty()) {
_prev_snapshot_pos = {};
} else {
_prev_snapshot_pos = m.partitions.begin()->key();
_prev_snapshot_pos = dht::ring_position(m.partitions.begin()->key());
}
});
STAP_PROBE1(scylla, row_cache_update_one_batch_end, quota_before - quota);
@@ -790,13 +832,12 @@ future<> row_cache::update(memtable& m, partition_presence_checker is_present) {
entry.partition().apply_to_incomplete(*_schema, std::move(mem_e.partition()), *mem_e.schema());
_tracker.touch(entry);
_tracker.on_merge();
} else if (is_present(mem_e.key()) == partition_presence_checker_result::definitely_doesnt_exist) {
} else if (cache_i->continuous() || is_present(mem_e.key()) == partition_presence_checker_result::definitely_doesnt_exist) {
cache_entry* entry = current_allocator().construct<cache_entry>(
mem_e.schema(), std::move(mem_e.key()), std::move(mem_e.partition()));
entry->set_continuous(cache_i->continuous());
_tracker.insert(*entry);
_partitions.insert(cache_i, *entry);
} else {
_tracker.clear_continuity(*cache_i);
}
});
}
@@ -815,6 +856,10 @@ future<> row_cache::update_invalidating(memtable& m) {
});
}
void row_cache::refresh_snapshot() {
_underlying = _snapshot_source();
}
void row_cache::touch(const dht::decorated_key& dk) {
_read_section(_tracker.region(), [&] {
with_linearized_managed_bytes([&] {


@@ -185,23 +185,35 @@ public:
using lru_type = bi::list<cache_entry,
bi::member_hook<cache_entry, cache_entry::lru_link_type, &cache_entry::_lru_link>,
bi::constant_time_size<false>>; // we need this to have bi::auto_unlink on hooks.
private:
// We will try to evict a large partition after that many normal evictions
const uint32_t _normal_large_eviction_ratio = 1000;
// Number of normal evictions to perform before we try to evict large partition
uint32_t _normal_eviction_count = _normal_large_eviction_ratio;
public:
friend class row_cache;
friend class cache::read_context;
friend class cache::autoupdating_underlying_reader;
friend class cache::cache_streamed_mutation;
struct stats {
uint64_t hits;
uint64_t misses;
uint64_t insertions;
uint64_t partition_hits;
uint64_t partition_misses;
uint64_t row_hits;
uint64_t row_misses;
uint64_t partition_insertions;
uint64_t row_insertions;
uint64_t concurrent_misses_same_key;
uint64_t merges;
uint64_t evictions;
uint64_t removals;
uint64_t partition_merges;
uint64_t partition_evictions;
uint64_t partition_removals;
uint64_t partitions;
uint64_t modification_count;
uint64_t mispopulations;
uint64_t underlying_recreations;
uint64_t underlying_partition_skips;
uint64_t underlying_row_skips;
uint64_t reads;
uint64_t reads_with_misses;
uint64_t reads_done;
uint64_t active_reads() const {
return reads - reads_done;
}
};
private:
stats _stats{};
@@ -219,8 +231,10 @@ public:
void clear_continuity(cache_entry& ce);
void on_erase();
void on_merge();
void on_hit();
void on_miss();
void on_partition_hit();
void on_partition_miss();
void on_row_hit();
void on_row_miss();
void on_miss_already_populated();
void on_mispopulate();
allocation_strategy& allocator();
@@ -263,6 +277,8 @@ public:
struct stats {
utils::timed_rate_moving_average hits;
utils::timed_rate_moving_average misses;
utils::timed_rate_moving_average reads_with_misses;
utils::timed_rate_moving_average reads_with_no_misses;
};
private:
cache_tracker& _tracker;
@@ -313,8 +329,12 @@ private:
logalloc::allocating_section _read_section;
mutation_reader create_underlying_reader(cache::read_context&, mutation_source&, const dht::partition_range&);
mutation_reader make_scanning_reader(const dht::partition_range&, lw_shared_ptr<cache::read_context>);
void on_hit();
void on_miss();
void on_partition_hit();
void on_partition_miss();
void on_row_hit();
void on_row_miss();
void on_row_insert();
void on_mispopulate();
void upgrade_entry(cache_entry&);
void invalidate_locked(const dht::decorated_key&);
void invalidate_unwrapped(const dht::partition_range&);
@@ -422,6 +442,10 @@ public:
// as few elements as possible.
future<> update_invalidating(memtable&);
// Refreshes snapshot. Must only be used if logical state in the underlying data
// source hasn't changed.
void refresh_snapshot();
// Moves given partition to the front of LRU if present in cache.
void touch(const dht::decorated_key&);
@@ -449,7 +473,7 @@ public:
// If it did, use invalidate() instead.
void evict(const dht::partition_range& = query::full_partition_range);
auto num_entries() const {
size_t partitions() const {
return _partitions.size();
}
const cache_tracker& get_cache_tracker() const {

schema.cc

@@ -105,6 +105,97 @@ schema::make_column_specification(const column_definition& def) {
return ::make_shared<cql3::column_specification>(_raw._ks_name, _raw._cf_name, std::move(id), def.type);
}
v3_columns::v3_columns(std::vector<column_definition> cols, bool is_dense, bool is_compound)
: _is_dense(is_dense)
, _is_compound(is_compound)
, _columns(std::move(cols))
{
for (column_definition& def : _columns) {
_columns_by_name[def.name()] = &def;
}
}
v3_columns v3_columns::from_v2_schema(const schema& s) {
data_type static_column_name_type = utf8_type;
std::vector<column_definition> cols;
if (s.is_static_compact_table()) {
if (s.has_static_columns()) {
throw std::runtime_error(
sprint("v2 static compact table should not have static columns: %s.%s", s.ks_name(), s.cf_name()));
}
if (s.clustering_key_size()) {
throw std::runtime_error(
sprint("v2 static compact table should not have clustering columns: %s.%s", s.ks_name(), s.cf_name()));
}
static_column_name_type = s.regular_column_name_type();
for (auto& c : s.all_columns()) {
// Note that for "static" no-clustering compact storage we use static for the defined columns
if (c.kind == column_kind::regular_column) {
auto new_def = c;
new_def.kind = column_kind::static_column;
cols.push_back(new_def);
} else {
cols.push_back(c);
}
}
schema_builder::default_names names(s._raw);
cols.emplace_back(to_bytes(names.clustering_name()), static_column_name_type, column_kind::clustering_key, 0);
cols.emplace_back(to_bytes(names.compact_value_name()), s.make_legacy_default_validator(), column_kind::regular_column, 0);
} else {
cols = s.all_columns();
}
for (column_definition& def : cols) {
data_type name_type = def.is_static() ? static_column_name_type : utf8_type;
auto id = ::make_shared<cql3::column_identifier>(def.name(), name_type);
def.column_specification = ::make_shared<cql3::column_specification>(s.ks_name(), s.cf_name(), std::move(id), def.type);
}
return v3_columns(std::move(cols), s.is_dense(), s.is_compound());
}
void v3_columns::apply_to(schema_builder& builder) const {
if (is_static_compact()) {
for (auto& c : _columns) {
if (c.kind == column_kind::regular_column) {
builder.set_default_validation_class(c.type);
} else if (c.kind == column_kind::static_column) {
auto new_def = c;
new_def.kind = column_kind::regular_column;
builder.with_column(new_def);
} else if (c.kind == column_kind::clustering_key) {
builder.set_regular_column_name_type(c.type);
} else {
builder.with_column(c);
}
}
} else {
for (auto& c : _columns) {
if (is_compact() && c.kind == column_kind::regular_column) {
builder.set_default_validation_class(c.type);
}
builder.with_column(c);
}
}
}
bool v3_columns::is_static_compact() const {
return !_is_dense && !_is_compound;
}
bool v3_columns::is_compact() const {
return _is_dense || !_is_compound;
}
const std::unordered_map<bytes, const column_definition*>& v3_columns::columns_by_name() const {
return _columns_by_name;
}
const std::vector<column_definition>& v3_columns::all_columns() const {
return _columns;
}
void schema::rebuild() {
_partition_key_type = make_lw_shared<compound_type<>>(get_column_types(partition_key_columns()));
_clustering_key_type = make_lw_shared<compound_prefix>(get_column_types(clustering_key_columns()));
@@ -117,10 +208,10 @@ void schema::rebuild() {
}
static_assert(row_column_ids_are_ordered_by_name::value, "row columns don't need to be ordered by name");
if (!std::is_sorted(regular_columns().begin(), regular_columns().end(), column_definition::name_comparator())) {
if (!std::is_sorted(regular_columns().begin(), regular_columns().end(), column_definition::name_comparator(regular_column_name_type()))) {
throw std::runtime_error("Regular columns should be sorted by name");
}
if (!std::is_sorted(static_columns().begin(), static_columns().end(), column_definition::name_comparator())) {
if (!std::is_sorted(static_columns().begin(), static_columns().end(), column_definition::name_comparator(static_column_name_type()))) {
throw std::runtime_error("Static columns should be sorted by name");
}
@@ -137,7 +228,7 @@ void schema::rebuild() {
}
thrift()._compound = is_compound();
thrift()._is_dynamic = static_columns_count() == 0;
thrift()._is_dynamic = clustering_key_size() > 0;
if (is_counter()) {
for (auto&& cdef : boost::range::join(static_columns(), regular_columns())) {
@@ -152,6 +243,8 @@ void schema::rebuild() {
}
}
}
_v3_columns = v3_columns::from_v2_schema(*this);
}
const column_mapping& schema::get_column_mapping() const {
@@ -189,24 +282,15 @@ schema::schema(const raw_schema& raw, stdx::optional<raw_view_info> raw_view_inf
}())
, _regular_columns_by_name(serialized_compare(_raw._regular_column_name_type))
{
struct name_compare {
data_type type;
name_compare(data_type type) : type(type) {}
bool operator()(const column_definition& cd1, const column_definition& cd2) const {
return type->less(cd1.name(), cd2.name());
}
};
std::sort(
_raw._columns.begin() + column_offset(column_kind::static_column),
_raw._columns.begin()
+ column_offset(column_kind::regular_column),
name_compare(utf8_type));
column_definition::name_comparator(static_column_name_type()));
std::sort(
_raw._columns.begin()
+ column_offset(column_kind::regular_column),
_raw._columns.end(), name_compare(regular_column_name_type()));
_raw._columns.end(), column_definition::name_comparator(regular_column_name_type()));
std::sort(_raw._columns.begin(),
_raw._columns.begin() + column_offset(column_kind::clustering_key),
@@ -360,6 +444,7 @@ bool operator==(const schema& x, const schema& y)
&& x._raw._speculative_retry == y._raw._speculative_retry
&& x._raw._compaction_strategy == y._raw._compaction_strategy
&& x._raw._compaction_strategy_options == y._raw._compaction_strategy_options
&& x._raw._compaction_enabled == y._raw._compaction_enabled
&& x._raw._caching_options == y._raw._caching_options
&& x._raw._dropped_columns == y._raw._dropped_columns
&& x._raw._collections == y._raw._collections
@@ -478,11 +563,10 @@ std::ostream& operator<<(std::ostream& os, const schema& s) {
os << ",compactionStrategyOptions={";
n = 0;
for (auto& p : s._raw._compaction_strategy_options) {
if (n++ != 0) {
os << ", ";
}
os << p.first << "=" << p.second;
os << ", ";
}
os << "enabled=" << std::boolalpha << s._raw._compaction_enabled;
os << "}";
os << ",compressionParameters={";
n = 0;
@@ -500,7 +584,6 @@ std::ostream& operator<<(std::ostream& os, const schema& s) {
os << ",minIndexInterval=" << s._raw._min_index_interval;
os << ",maxIndexInterval=" << s._raw._max_index_interval;
os << ",speculativeRetry=" << s._raw._speculative_retry.to_sstring();
os << ",droppedColumns={}";
os << ",triggers=[]";
os << ",isDense=" << std::boolalpha << s._raw._is_dense;
os << ",version=" << s.version();
@@ -642,11 +725,7 @@ schema_builder& schema_builder::without_column(bytes name)
return column.name() == name;
});
assert(it != _raw._columns.end());
auto now = api::new_timestamp();
auto ret = _raw._dropped_columns.emplace(it->name_as_text(), schema::dropped_column{it->type, now});
if (!ret.second) {
ret.first->second.timestamp = std::max(ret.first->second.timestamp, now);
}
without_column(it->name_as_text(), it->type, api::new_timestamp());
_raw._columns.erase(it);
return *this;
}
@@ -658,8 +737,9 @@ schema_builder& schema_builder::without_column(sstring name, api::timestamp_type
schema_builder& schema_builder::without_column(sstring name, data_type type, api::timestamp_type timestamp)
{
auto ret = _raw._dropped_columns.emplace(name, schema::dropped_column{type, timestamp});
if (!ret.second) {
ret.first->second.timestamp = std::max(ret.first->second.timestamp, timestamp);
if (!ret.second && ret.first->second.timestamp < timestamp) {
ret.first->second.type = type;
ret.first->second.timestamp = timestamp;
}
return *this;
}
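The hunk above changes `without_column` so that when a column with the same name is dropped again, only the record with the newest timestamp wins, and the type is updated along with it (previously only the timestamp was bumped). The emplace-then-update-if-newer pattern can be sketched with a plain `std::map` — the `dropped_column` fields here are simplified stand-ins for the real `data_type` and `api::timestamp_type`:

```cpp
#include <cassert>
#include <cstdint>
#include <map>
#include <string>

struct dropped_column {
    std::string type;  // simplified: the real code stores a data_type
    int64_t timestamp; // the real code uses api::timestamp_type
};

std::map<std::string, dropped_column> dropped_columns;

// Record a dropped column, keeping only the newest record — the
// behavior the hunk above introduces.
void record_dropped(const std::string& name, const std::string& type, int64_t ts) {
    auto ret = dropped_columns.emplace(name, dropped_column{type, ts});
    if (!ret.second && ret.first->second.timestamp < ts) {
        ret.first->second.type = type;  // newer drop wins, type included
        ret.first->second.timestamp = ts;
    }
}
```

An older drop of the same column is ignored entirely; a newer one replaces both fields of the existing entry.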
@@ -751,10 +831,8 @@ sstring schema_builder::default_names::compact_value_name() {
void schema_builder::prepare_dense_schema(schema::raw_schema& raw) {
auto is_dense = raw._is_dense;
auto is_compound = raw._is_compound;
auto is_static_compact = !is_dense && !is_compound;
auto is_compact_table = is_dense || !is_compound;
if (is_compact_table) {
auto count_kind = [&raw](column_kind kind) {
return std::count_if(raw._columns.begin(), raw._columns.end(), [kind](const column_definition& c) {
@@ -764,37 +842,7 @@ void schema_builder::prepare_dense_schema(schema::raw_schema& raw) {
default_names names(raw);
if (is_static_compact) {
/**
* In origin v3 the general cql-ification of the "storage engine" means
* that "static compact" tables are expressed as all defined columns static,
* but with synthetic clustering + regular columns.
* We unfortunately need to play along with this, both because we want
* schema tables on disk to be compatible (and they are explicit).
* More to the point, we are, at least until we upgrade to version "m"
* sstables, stuck with having origins java tools reading our schema tables
* for table schemas (this btw applies to db drivers too, though maybe a little
* less), and it asserts badly if we don't uphold the origin table tweaks.
*
* So transform away...
*
*/
if (!count_kind(column_kind::static_column)) {
assert(!count_kind(column_kind::clustering_key));
for (auto& c : raw._columns) {
// Note that for "static" no-clustering compact storage we use static for the defined columns
if (c.kind == column_kind::regular_column) {
c.kind = column_kind::static_column;
}
}
// Compact tables always have a clustering and a single regular value.
raw._columns.emplace_back(to_bytes(names.clustering_name()),
utf8_type, column_kind::clustering_key, 0);
raw._columns.emplace_back(to_bytes(names.compact_value_name()),
raw._is_counter ? counter_type : bytes_type,
column_kind::regular_column, 0);
}
} else if (is_dense) {
if (is_dense) {
auto regular_cols = count_kind(column_kind::regular_column);
// In Origin, dense CFs always have at least one regular column
if (regular_cols == 0) {
@@ -838,6 +886,10 @@ schema_ptr schema_builder::build() {
new_raw._version = utils::UUID_gen::get_time_UUID();
}
if (new_raw._is_counter) {
new_raw._default_validation_class = counter_type;
}
if (_compact_storage) {
// Dense means that no part of the comparator stores a CQL column name. This means
// COMPACT STORAGE with at least one columnAliases (otherwise it's a thrift "static" CF).
@@ -1032,7 +1084,10 @@ schema::static_upper_bound(const bytes& name) const {
}
data_type
schema::column_name_type(const column_definition& def) const {
return def.kind == column_kind::regular_column ? _raw._regular_column_name_type : utf8_type;
if (def.kind == column_kind::regular_column) {
return _raw._regular_column_name_type;
}
return utf8_type;
}
const column_definition&
@@ -1043,6 +1098,14 @@ schema::regular_column_at(column_id id) const {
return _raw._columns.at(column_offset(column_kind::regular_column) + id);
}
const column_definition&
schema::clustering_column_at(column_id id) const {
if (id >= clustering_key_size()) {
throw std::out_of_range(sprint("clustering column id %d >= %d", id, clustering_key_size()));
}
return _raw._columns.at(column_offset(column_kind::clustering_key) + id);
}
const column_definition&
schema::static_column_at(column_id id) const {
if (id > static_columns_count()) {
@@ -1119,12 +1182,8 @@ schema::select_order_range schema::all_columns_in_select_order() const {
_raw._columns.begin() + (is_static_compact_table ?
column_offset(column_kind::clustering_key) :
column_offset(column_kind::static_column)));
auto ck_v_range =
(is_static_compact_table || no_non_pk_columns) ?
static_columns() :
const_iterator_range_type(
static_columns().begin(),
all_columns().end());
auto ck_v_range = no_non_pk_columns ? static_columns()
: const_iterator_range_type(static_columns().begin(), all_columns().end());
return boost::range::join(pk_range, ck_v_range);
}
@@ -1163,23 +1222,7 @@ std::vector<sstring> schema::index_names() const {
}
data_type schema::make_legacy_default_validator() const {
if (is_counter()) {
return counter_type;
}
if (is_compact_table()) {
// See CFMetaData.
if (is_super()) {
for (auto& c : regular_columns()) {
if (c.name().empty()) {
return c.type;
}
}
assert(false && "Invalid super column table definition, no 'dynamic' map column");
} else {
return regular_columns().begin()->type;
}
}
return bytes_type;
return _raw._default_validation_class;
}
bool schema::is_synced() const {

View File

@@ -193,8 +193,10 @@ public:
class column_definition final {
public:
struct name_comparator {
bool operator()(const column_definition& d1, const column_definition& d2) const {
return d1.name() < d2.name();
data_type type;
name_comparator(data_type type) : type(type) {}
bool operator()(const column_definition& cd1, const column_definition& cd2) const {
return type->less(cd1.name(), cd2.name());
}
};
private:
@@ -234,6 +236,7 @@ public:
bool is_clustering_key() const { return kind == column_kind::clustering_key; }
bool is_primary_key() const { return kind == column_kind::partition_key || kind == column_kind::clustering_key; }
bool is_atomic() const { return _is_atomic; }
bool is_multi_cell() const { return !_is_atomic; }
bool is_counter() const { return _is_counter; }
const sstring& name_as_text() const;
const bytes& name() const;
@@ -378,12 +381,40 @@ std::ostream& operator<<(std::ostream& os, const raw_view_info& view);
class view_info;
// Represents a column set which is compatible with Cassandra 3.x.
//
// This layout differs from the layout Scylla uses in schema/schema_builder for static compact tables.
// For such tables, Scylla expects all columns to be of regular type and no clustering columns,
// whereas in v3 those columns are static and there is a clustering column with type matching the
// cell name comparator and a regular column with type matching the default validator.
// See issues #2555 and #1474.
class v3_columns {
bool _is_dense = false;
bool _is_compound = false;
std::vector<column_definition> _columns;
std::unordered_map<bytes, const column_definition*> _columns_by_name;
public:
v3_columns(std::vector<column_definition> columns, bool is_dense, bool is_compound);
v3_columns() = default;
v3_columns(v3_columns&&) = default;
v3_columns& operator=(v3_columns&&) = default;
v3_columns(const v3_columns&) = delete;
static v3_columns from_v2_schema(const schema&);
public:
const std::vector<column_definition>& all_columns() const;
const std::unordered_map<bytes, const column_definition*>& columns_by_name() const;
bool is_static_compact() const;
bool is_compact() const;
void apply_to(schema_builder&) const;
};
/*
* Effectively immutable.
* Not safe to access across cores because of shared_ptr's.
* Use global_schema_ptr for safe across-shard access.
*/
class schema final : public enable_lw_shared_from_this<schema> {
friend class v3_columns;
public:
struct dropped_column {
data_type type;
@@ -406,6 +437,7 @@ private:
sstring _comment;
gc_clock::duration _default_time_to_live = gc_clock::duration::zero();
data_type _regular_column_name_type;
data_type _default_validation_class = bytes_type;
double _bloom_filter_fp_chance = 0.01;
compression_parameters _compressor_params;
bool _is_dense = false;
@@ -426,6 +458,7 @@ private:
// we will use by default - when we have the choice.
sstables::compaction_strategy_type _compaction_strategy = sstables::compaction_strategy_type::size_tiered;
std::map<sstring, sstring> _compaction_strategy_options;
bool _compaction_enabled = true;
caching_options _caching_options;
table_schema_version _version;
std::unordered_map<sstring, dropped_column> _dropped_columns;
@@ -434,6 +467,7 @@ private:
};
raw_schema _raw;
thrift_schema _thrift;
v3_columns _v3_columns;
mutable schema_registry_entry* _registry_entry = nullptr;
std::unique_ptr<::view_info> _view_info;
@@ -570,14 +604,22 @@ public:
return _raw._memtable_flush_period;
}
sstables::compaction_strategy_type compaction_strategy() const {
sstables::compaction_strategy_type configured_compaction_strategy() const {
return _raw._compaction_strategy;
}
sstables::compaction_strategy_type compaction_strategy() const {
return _raw._compaction_enabled ? _raw._compaction_strategy : sstables::compaction_strategy_type::null;
}
const std::map<sstring, sstring>& compaction_strategy_options() const {
return _raw._compaction_strategy_options;
}
bool compaction_enabled() const {
return _raw._compaction_enabled;
}
const ::speculative_retry& speculative_retry() const {
return _raw._speculative_retry;
}
@@ -597,6 +639,7 @@ public:
const_iterator static_lower_bound(const bytes& name) const;
const_iterator static_upper_bound(const bytes& name) const;
data_type column_name_type(const column_definition& def) const;
const column_definition& clustering_column_at(column_id id) const;
const column_definition& regular_column_at(column_id id) const;
const column_definition& static_column_at(column_id id) const;
bool is_last_partition_key(const column_definition& def) const;
@@ -662,6 +705,9 @@ public:
const data_type& regular_column_name_type() const {
return _raw._regular_column_name_type;
}
const data_type& static_column_name_type() const {
return utf8_type;
}
const std::unique_ptr<::view_info>& view_info() const {
return _view_info;
}
@@ -689,6 +735,10 @@ public:
// recent as this version.
bool is_synced() const;
bool equal_columns(const schema&) const;
public:
const v3_columns& v3() const {
return _v3_columns;
}
};
bool operator==(const schema&, const schema&);

View File

@@ -50,6 +50,10 @@ public:
_raw._regular_column_name_type = t;
return *this;
}
schema_builder& set_default_validation_class(const data_type& t) {
_raw._default_validation_class = t;
return *this;
}
const data_type& regular_column_name_type() const {
return _raw._regular_column_name_type;
}
@@ -128,6 +132,15 @@ public:
return _raw._max_compaction_threshold;
}
schema_builder& set_compaction_enabled(bool enabled) {
_raw._compaction_enabled = enabled;
return *this;
}
bool compaction_enabled() const {
return _raw._compaction_enabled;
}
schema_builder& set_min_index_interval(int32_t t) {
_raw._min_index_interval = t;
return *this;
@@ -246,6 +259,10 @@ public:
schema_builder& with_index(const index_metadata& im);
schema_builder& without_index(const sstring& name);
default_names get_default_names() const {
return default_names(_raw);
}
// Equivalent to with(cp).build()
schema_ptr build(compact_storage cp);

View File

@@ -28,11 +28,13 @@ schema_mutations::schema_mutations(canonical_mutation columnfamilies,
canonical_mutation columns,
bool is_view,
stdx::optional<canonical_mutation> indices,
stdx::optional<canonical_mutation> dropped_columns)
stdx::optional<canonical_mutation> dropped_columns,
stdx::optional<canonical_mutation> scylla_tables)
: _columnfamilies(columnfamilies.to_mutation(is_view ? db::schema_tables::views() : db::schema_tables::tables()))
, _columns(columns.to_mutation(db::schema_tables::columns()))
, _indices(indices ? stdx::optional<mutation>{indices.value().to_mutation(db::schema_tables::indexes())} : stdx::nullopt)
, _dropped_columns(dropped_columns ? stdx::optional<mutation>{dropped_columns.value().to_mutation(db::schema_tables::dropped_columns())} : stdx::nullopt)
, _scylla_tables(scylla_tables ? stdx::optional<mutation>{scylla_tables.value().to_mutation(db::schema_tables::scylla_tables())} : stdx::nullopt)
{}
void schema_mutations::copy_to(std::vector<mutation>& dst) const {
@@ -44,9 +46,25 @@ void schema_mutations::copy_to(std::vector<mutation>& dst) const {
if (_dropped_columns) {
dst.push_back(_dropped_columns.value());
}
if (_scylla_tables) {
dst.push_back(_scylla_tables.value());
}
}
table_schema_version schema_mutations::digest() const {
if (_scylla_tables) {
auto rs = query::result_set(*_scylla_tables);
if (!rs.empty()) {
auto&& row = rs.row(0);
if (row.has("version")) {
auto val = row.get<utils::UUID>("version");
if (val) {
return *val;
}
}
}
}
md5_hasher h;
db::schema_tables::feed_hash_for_schema_digest(h, _columnfamilies);
db::schema_tables::feed_hash_for_schema_digest(h, _columns);
@@ -56,14 +74,29 @@ table_schema_version schema_mutations::digest() const {
if (_dropped_columns && !_dropped_columns.value().partition().empty()) {
db::schema_tables::feed_hash_for_schema_digest(h, _dropped_columns.value());
}
if (_scylla_tables) {
db::schema_tables::feed_hash_for_schema_digest(h, _scylla_tables.value());
}
return utils::UUID_gen::get_name_UUID(h.finalize());
}
static stdx::optional<mutation> compact(const stdx::optional<mutation>& m) {
if (!m) {
return m;
}
return db::schema_tables::compact_for_schema_digest(*m);
}
static stdx::optional<mutation> compact(const mutation& m) {
return db::schema_tables::compact_for_schema_digest(m);
}
bool schema_mutations::operator==(const schema_mutations& other) const {
return _columnfamilies == other._columnfamilies
&& _columns == other._columns
&& _indices == other._indices
&& _dropped_columns == other._dropped_columns
return compact(_columnfamilies) == compact(other._columnfamilies)
&& compact(_columns) == compact(other._columns)
&& compact(_indices) == compact(other._indices)
&& compact(_dropped_columns) == compact(other._dropped_columns)
&& compact(_scylla_tables) == compact(other._scylla_tables)
;
}

View File

@@ -27,23 +27,28 @@
#include "canonical_mutation.hh"
// Commutative representation of table schema
// Equality ignores tombstones.
class schema_mutations {
mutation _columnfamilies;
mutation _columns;
stdx::optional<mutation> _indices;
stdx::optional<mutation> _dropped_columns;
stdx::optional<mutation> _scylla_tables;
public:
schema_mutations(mutation columnfamilies, mutation columns, stdx::optional<mutation> indices, stdx::optional<mutation> dropped_columns)
schema_mutations(mutation columnfamilies, mutation columns, stdx::optional<mutation> indices, stdx::optional<mutation> dropped_columns,
stdx::optional<mutation> scylla_tables)
: _columnfamilies(std::move(columnfamilies))
, _columns(std::move(columns))
, _indices(std::move(indices))
, _dropped_columns(std::move(dropped_columns))
, _scylla_tables(std::move(scylla_tables))
{ }
schema_mutations(canonical_mutation columnfamilies,
canonical_mutation columns,
bool is_view,
stdx::optional<canonical_mutation> indices,
stdx::optional<canonical_mutation> dropped_columns);
stdx::optional<canonical_mutation> dropped_columns,
stdx::optional<canonical_mutation> scylla_tables);
schema_mutations(schema_mutations&&) = default;
schema_mutations& operator=(schema_mutations&&) = default;
@@ -60,6 +65,10 @@ public:
return _columns;
}
const stdx::optional<mutation>& scylla_tables() const {
return _scylla_tables;
}
const stdx::optional<mutation>& indices_mutation() const {
return _indices;
}
@@ -87,6 +96,12 @@ public:
}
return {};
}
stdx::optional<canonical_mutation> scylla_tables_canonical_mutation() const {
if (_scylla_tables) {
return canonical_mutation(_scylla_tables.value());
}
return {};
}
bool is_view() const;

View File

@@ -48,7 +48,17 @@ schema_registry_entry::schema_registry_entry(table_schema_version v, schema_regi
, _version(v)
, _registry(r)
, _sync_state(sync_state::NOT_SYNCED)
{ }
{
_erase_timer.set_callback([this] {
slogger.debug("Dropping {}", _version);
assert(!_schema);
try {
_registry._entries.erase(_version);
} catch (...) {
slogger.error("Failed to erase schema version {}: {}", _version, std::current_exception());
}
});
}
schema_ptr schema_registry::learn(const schema_ptr& s) {
if (s->registry_entry()) {
@@ -173,6 +183,7 @@ schema_ptr schema_registry_entry::get_schema() {
if (s->version() != _version) {
throw std::runtime_error(sprint("Unfrozen schema version doesn't match entry version (%s): %s", _version, *s));
}
_erase_timer.cancel();
s->_registry_entry = this;
_schema = &*s;
return s;
@@ -184,12 +195,7 @@ schema_ptr schema_registry_entry::get_schema() {
void schema_registry_entry::detach_schema() noexcept {
slogger.trace("Deactivating {}", _version);
_schema = nullptr;
// TODO: keep the entry for a while (timer)
try {
_registry._entries.erase(_version);
} catch (...) {
slogger.error("Failed to erase schema version {}: {}", _version, std::current_exception());
}
_erase_timer.arm(_registry.grace_period());
}
frozen_schema schema_registry_entry::frozen() const {
@@ -273,9 +279,9 @@ schema_ptr global_schema_ptr::get() const {
s = local_schema_registry().get_or_load(e.version(), [&e](table_schema_version) {
return e.frozen();
});
if (e.is_synced()) {
s->registry_entry()->mark_synced();
}
}
if (e.is_synced()) {
s->registry_entry()->mark_synced();
}
return s;
}

View File

@@ -55,6 +55,8 @@ public:
// In addition to the above the entry is controlled by lw_shared_ptr<> to cope with races between loaders.
//
class schema_registry_entry : public enable_lw_shared_from_this<schema_registry_entry> {
using erase_clock = seastar::lowres_clock;
enum class state {
INITIAL, LOADING, LOADED
};
@@ -74,6 +76,7 @@ class schema_registry_entry : public enable_lw_shared_from_this<schema_registry_
enum class sync_state { NOT_SYNCED, SYNCING, SYNCED };
sync_state _sync_state;
shared_promise<> _synced_promise; // valid when _sync_state == SYNCING
timer<erase_clock> _erase_timer;
friend class schema_registry;
public:
@@ -110,6 +113,11 @@ class schema_registry {
std::unordered_map<table_schema_version, lw_shared_ptr<schema_registry_entry>> _entries;
friend class schema_registry_entry;
schema_registry_entry& get_entry(table_schema_version) const;
// Duration for which unused entries are kept alive to avoid
// too frequent re-requests and syncs.
schema_registry_entry::erase_clock::duration grace_period() const {
return std::chrono::seconds(1);
}
public:
// Looks up schema by version or loads it using supplied loader.
schema_ptr get_or_load(table_schema_version, const schema_loader&);

View File

@@ -190,7 +190,7 @@ class scylla_column_families(gdb.Command):
db = find_db(shard)
cfs = db['_column_families']
for (key, value) in list_unordered_map(cfs):
value = value['_p']['_value'] # it's a lw_shared_ptr
value = value['_p'].reinterpret_cast(gdb.lookup_type('column_family').pointer()).dereference() # it's a lw_shared_ptr
schema = value['_schema']['_p'].reinterpret_cast(gdb.lookup_type('schema').pointer())
name = str(schema['_raw']['_ks_name']) + '/' + str(schema['_raw']['_cf_name'])
schema_version = str(schema['_raw']['_version'])

Submodule seastar updated: 0ab7ae5b88...d67c344513

View File

@@ -133,7 +133,7 @@ future<> service::client_state::has_access(const sstring& ks, auth::permission p
// prevent system keyspace modification
auto name = ks;
std::transform(name.begin(), name.end(), name.begin(), ::tolower);
if (name == db::system_keyspace::NAME) {
if (is_system_keyspace(name)) {
throw exceptions::unauthorized_exception(ks + " keyspace is not user-modifiable.");
}

View File

@@ -86,7 +86,12 @@ void migration_manager::init_messaging_service()
});
return netw::messaging_service::no_wait();
});
ms.register_migration_request([this] () {
ms.register_migration_request([this] (const rpc::client_info& cinfo) {
auto src = netw::messaging_service::get_source(cinfo);
if (!has_compatible_schema_tables_version(src.addr)) {
mlogger.debug("Ignoring schema request from incompatible node: {}", src);
return make_ready_future<std::vector<frozen_mutation>>(std::vector<frozen_mutation>());
}
return db::schema_tables::convert_schema_to_mutations(get_storage_proxy()).finally([p = get_local_shared_storage_proxy()] {
// keep local proxy alive
});
@@ -133,12 +138,15 @@ bool migration_manager::is_ready_for_bootstrap() {
if (endpoint == utils::fb_utilities::get_broadcast_address() || !eps.is_alive()) {
continue;
}
mlogger.debug("Checking schema state for {}.", endpoint);
auto schema = eps.get_application_state(gms::application_state::SCHEMA);
if (!schema) {
mlogger.debug("Schema state not yet available for {}.", endpoint);
return false;
}
utils::UUID remote_version{schema->value};
if (our_version != remote_version) {
mlogger.debug("Schema mismatch for {} ({} != {}).", endpoint, our_version, remote_version);
return false;
} else {
match = true;
@@ -220,15 +228,18 @@ future<> migration_manager::merge_schema_from(netw::messaging_service::msg_addr
});
}
bool migration_manager::should_pull_schema_from(const gms::inet_address& endpoint)
{
/*
* Don't request schema from nodes with a different or unknown major version (may have incompatible schema)
* Don't request schema from fat clients
*/
auto& ms = netw::get_local_messaging_service();
return ms.knows_version(endpoint)
&& ms.get_raw_version(endpoint) == netw::messaging_service::current_version
bool migration_manager::has_compatible_schema_tables_version(const gms::inet_address& endpoint) {
auto& gossiper = gms::get_local_gossiper();
auto ep_state = gossiper.get_endpoint_state_for_endpoint(endpoint);
if (!ep_state) {
return false;
}
auto&& version_opt = ep_state->get_application_state(gms::application_state::SCHEMA_TABLES_VERSION);
return version_opt && version_opt->value == db::schema_tables::version;
}
bool migration_manager::should_pull_schema_from(const gms::inet_address& endpoint) {
return has_compatible_schema_tables_version(endpoint)
&& !gms::get_local_gossiper().is_gossip_only_member(endpoint);
}

View File

@@ -94,6 +94,7 @@ public:
future<> notify_drop_view(const view_ptr& view);
bool should_pull_schema_from(const gms::inet_address& endpoint);
bool has_compatible_schema_tables_version(const gms::inet_address& endpoint);
future<> announce_keyspace_update(lw_shared_ptr<keyspace_metadata> ksm, bool announce_locally = false);

View File

@@ -127,7 +127,7 @@ future<lowres_clock::duration> cache_hitrate_calculator::recalculate_hitrates()
return boost::copy_range<std::unordered_map<utils::UUID, stat>>(db.get_column_families() | boost::adaptors::filtered(non_system_filter) |
boost::adaptors::transformed([] (const std::pair<utils::UUID, lw_shared_ptr<column_family>>& cf) {
auto& stats = cf.second->get_row_cache().stats();
return std::make_pair(cf.first, stat{float(stats.hits.rate().rates[0]), float(stats.misses.rate().rates[0])});
return std::make_pair(cf.first, stat{float(stats.reads_with_no_misses.rate().rates[0]), float(stats.reads_with_misses.rate().rates[0])});
}));
};

View File

@@ -2075,6 +2075,7 @@ private:
break;
}
}
assert(last_partition);
return get_last_row(s, *last_partition, is_reversed);
}
@@ -2300,6 +2301,10 @@ public:
v.emplace_back(r.from, stdx::optional<partition>(), r.reached_end, true);
}
}
boost::sort(v, [] (const version& x, const version& y) {
return x.from < y.from;
});
} while(true);
std::vector<mutation_and_live_row_count> reconciled_partitions;
@@ -2308,7 +2313,10 @@ public:
// reconcile all versions
boost::range::transform(boost::make_iterator_range(versions.begin(), versions.end()), std::back_inserter(reconciled_partitions),
[this, schema, original_per_partition_limit] (std::vector<version>& v) {
auto m = boost::accumulate(v, mutation(v.front().par->mut().key(*schema), schema), [this, schema] (mutation& m, const version& ver) {
auto it = boost::range::find_if(v, [] (auto&& ver) {
return bool(ver.par);
});
auto m = boost::accumulate(v, mutation(it->par->mut().key(*schema), schema), [this, schema] (mutation& m, const version& ver) {
if (ver.par) {
m.partition().apply(*schema, ver.par->mut().partition(), *schema);
}

View File

@@ -301,6 +301,7 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
app_states.emplace(gms::application_state::RELEASE_VERSION, value_factory.release_version());
app_states.emplace(gms::application_state::SUPPORTED_FEATURES, value_factory.supported_features(features));
app_states.emplace(gms::application_state::CACHE_HITRATES, value_factory.cache_hitrates(""));
app_states.emplace(gms::application_state::SCHEMA_TABLES_VERSION, versioned_value(db::schema_tables::version));
slogger.info("Starting up server gossip");
auto& gossiper = gms::get_local_gossiper();
@@ -2269,7 +2270,7 @@ void storage_service::flush_column_families() {
auto& local_db = ss.db().local();
auto non_system_cfs = local_db.get_column_families() | boost::adaptors::filtered([] (auto& uuid_and_cf) {
auto cf = uuid_and_cf.second;
return cf->schema()->ks_name() != db::system_keyspace::NAME;
return !is_system_keyspace(cf->schema()->ks_name());
});
// count CFs first
auto total_cfs = boost::distance(non_system_cfs);
@@ -2289,7 +2290,7 @@ void storage_service::flush_column_families() {
auto& local_db = ss.db().local();
auto system_cfs = local_db.get_column_families() | boost::adaptors::filtered([] (auto& uuid_and_cf) {
auto cf = uuid_and_cf.second;
return cf->schema()->ks_name() == db::system_keyspace::NAME;
return is_system_keyspace(cf->schema()->ks_name());
});
return parallel_for_each(system_cfs, [&ss] (auto&& uuid_and_cf) {
auto cf = uuid_and_cf.second;

View File

@@ -174,12 +174,14 @@ protected:
uint64_t _estimated_partitions = 0;
std::vector<unsigned long> _ancestors;
db::replay_position _rp;
seastar::thread_scheduling_group* _tsg;
protected:
compaction(column_family& cf, std::vector<shared_sstable> sstables, uint64_t max_sstable_size, uint32_t sstable_level)
compaction(column_family& cf, std::vector<shared_sstable> sstables, uint64_t max_sstable_size, uint32_t sstable_level, seastar::thread_scheduling_group* tsg)
: _cf(cf)
, _sstables(std::move(sstables))
, _max_sstable_size(max_sstable_size)
, _sstable_level(sstable_level)
, _tsg(tsg)
{
_cf.get_compaction_manager().register_compaction(_info);
}
@@ -211,6 +213,12 @@ public:
virtual ~compaction() {
_cf.get_compaction_manager().deregister_compaction(_info);
}
seastar::thread_attributes thread_attributes() {
seastar::thread_attributes attr;
attr.scheduling_group = _tsg;
return attr;
}
private:
::mutation_reader setup() {
std::vector<::mutation_reader> readers;
@@ -339,8 +347,8 @@ class regular_compaction : public compaction {
stdx::optional<sstable_writer> _writer;
public:
regular_compaction(column_family& cf, std::vector<shared_sstable> sstables, std::function<shared_sstable()> creator,
uint64_t max_sstable_size, uint32_t sstable_level)
: compaction(cf, std::move(sstables), max_sstable_size, sstable_level)
uint64_t max_sstable_size, uint32_t sstable_level, seastar::thread_scheduling_group* tsg)
: compaction(cf, std::move(sstables), max_sstable_size, sstable_level, tsg)
, _creator(std::move(creator))
, _set(cf.get_sstable_set())
, _selector(_set.make_incremental_selector())
@@ -407,8 +415,8 @@ public:
class cleanup_compaction final : public regular_compaction {
public:
cleanup_compaction(column_family& cf, std::vector<shared_sstable> sstables, std::function<shared_sstable()> creator,
uint64_t max_sstable_size, uint32_t sstable_level)
: regular_compaction(cf, std::move(sstables), std::move(creator), max_sstable_size, sstable_level)
uint64_t max_sstable_size, uint32_t sstable_level, seastar::thread_scheduling_group* tsg)
: regular_compaction(cf, std::move(sstables), std::move(creator), max_sstable_size, sstable_level, tsg)
{
_info->type = compaction_type::Cleanup;
}
@@ -444,8 +452,8 @@ class resharding_compaction final : public compaction {
std::function<shared_sstable(shard_id)> _sstable_creator;
public:
resharding_compaction(std::vector<shared_sstable> sstables, column_family& cf, std::function<shared_sstable(shard_id)> creator,
uint64_t max_sstable_size, uint32_t sstable_level)
: compaction(cf, std::move(sstables), max_sstable_size, sstable_level)
uint64_t max_sstable_size, uint32_t sstable_level, seastar::thread_scheduling_group* tsg)
: compaction(cf, std::move(sstables), max_sstable_size, sstable_level, tsg)
, _output_sstables(smp::count)
, _sstable_creator(std::move(creator))
{
@@ -494,7 +502,8 @@ public:
};
future<std::vector<shared_sstable>> compaction::run(std::unique_ptr<compaction> c) {
return seastar::async([c = std::move(c)] () mutable {
auto attr = c->thread_attributes();
return seastar::async(std::move(attr), [c = std::move(c)] () mutable {
auto reader = c->setup();
auto cr = c->get_compacting_sstable_writer();
@@ -527,21 +536,21 @@ static std::unique_ptr<compaction> make_compaction(bool cleanup, Params&&... par
future<std::vector<shared_sstable>>
compact_sstables(std::vector<shared_sstable> sstables, column_family& cf, std::function<shared_sstable()> creator,
uint64_t max_sstable_size, uint32_t sstable_level, bool cleanup) {
uint64_t max_sstable_size, uint32_t sstable_level, bool cleanup, seastar::thread_scheduling_group *tsg) {
if (sstables.empty()) {
throw std::runtime_error(sprint("Called compaction with empty set on behalf of {}.{}", cf.schema()->ks_name(), cf.schema()->cf_name()));
}
auto c = make_compaction(cleanup, cf, std::move(sstables), std::move(creator), max_sstable_size, sstable_level);
auto c = make_compaction(cleanup, cf, std::move(sstables), std::move(creator), max_sstable_size, sstable_level, tsg);
return compaction::run(std::move(c));
}
future<std::vector<shared_sstable>>
reshard_sstables(std::vector<shared_sstable> sstables, column_family& cf, std::function<shared_sstable(shard_id)> creator,
uint64_t max_sstable_size, uint32_t sstable_level) {
uint64_t max_sstable_size, uint32_t sstable_level, seastar::thread_scheduling_group* tsg) {
if (sstables.empty()) {
throw std::runtime_error(sprint("Called resharding with empty set on behalf of {}.{}", cf.schema()->ks_name(), cf.schema()->cf_name()));
}
auto c = std::make_unique<resharding_compaction>(std::move(sstables), cf, std::move(creator), max_sstable_size, sstable_level);
auto c = std::make_unique<resharding_compaction>(std::move(sstables), cf, std::move(creator), max_sstable_size, sstable_level, tsg);
return compaction::run(std::move(c));
}

View File

@@ -112,13 +112,15 @@ namespace sstables {
// cleaning operation, and compaction history will not be updated.
future<std::vector<shared_sstable>> compact_sstables(std::vector<shared_sstable> sstables,
column_family& cf, std::function<shared_sstable()> creator,
uint64_t max_sstable_size, uint32_t sstable_level, bool cleanup = false);
uint64_t max_sstable_size, uint32_t sstable_level, bool cleanup = false,
seastar::thread_scheduling_group* tsg = nullptr);
// Compacts a set of N shared sstables into M sstables. For every shard involved,
// i.e. which owns any of the sstables, a new unshared sstable is created.
future<std::vector<shared_sstable>> reshard_sstables(std::vector<shared_sstable> sstables,
column_family& cf, std::function<shared_sstable(shard_id)> creator,
uint64_t max_sstable_size, uint32_t sstable_level);
uint64_t max_sstable_size, uint32_t sstable_level,
seastar::thread_scheduling_group* tsg = nullptr);
// Return the most interesting bucket applying the size-tiered strategy.
std::vector<sstables::shared_sstable>

View File

@@ -148,19 +148,19 @@ public:
index_comparator(const schema& s) : _tri_cmp(s) {}
bool operator()(const summary_entry& e, dht::ring_position_view rp) const {
return _tri_cmp(e.get_key(), rp) < 0;
return _tri_cmp(e.get_decorated_key(), rp) < 0;
}
bool operator()(const index_entry& e, dht::ring_position_view rp) const {
return _tri_cmp(e.get_key(), rp) < 0;
return _tri_cmp(e.get_decorated_key(), rp) < 0;
}
bool operator()(dht::ring_position_view rp, const summary_entry& e) const {
return _tri_cmp(e.get_key(), rp) > 0;
return _tri_cmp(e.get_decorated_key(), rp) > 0;
}
bool operator()(dht::ring_position_view rp, const index_entry& e) const {
return _tri_cmp(e.get_key(), rp) > 0;
return _tri_cmp(e.get_decorated_key(), rp) > 0;
}
};
@@ -497,8 +497,8 @@ public:
return make_ready_future<bool>(false);
}
return read_partition_data().then([this, key] {
dht::ring_position_comparator cmp(*_sstable->_schema);
return cmp(key, partition_key()) == 0;
index_comparator cmp(*_sstable->_schema);
return cmp(key, current_partition_entry()) == 0;
});
});
}

View File

@@ -26,6 +26,7 @@
#include "database_fwd.hh"
#include "keys.hh"
#include "compound_compat.hh"
#include "dht/i_partitioner.hh"
namespace sstables {
@@ -35,12 +36,12 @@ public:
explicit key_view(bytes_view b) : _bytes(b) {}
key_view() : _bytes() {}
std::vector<bytes> explode(const schema& s) const {
std::vector<bytes_view> explode(const schema& s) const {
return composite_view(_bytes, s.partition_key_size() > 1).explode();
}
partition_key to_partition_key(const schema& s) const {
return partition_key::from_exploded(s, explode(s));
return partition_key::from_exploded_view(explode(s));
}
bool operator==(const key_view& k) const { return k._bytes == _bytes; }
@@ -105,10 +106,10 @@ public:
return make_key(s, pk);
}
partition_key to_partition_key(const schema& s) const {
return partition_key::from_exploded(s, explode(s));
return partition_key::from_exploded_view(explode(s));
}
std::vector<bytes> explode(const schema& s) const {
std::vector<bytes_view> explode(const schema& s) const {
return composite_view(_bytes, is_compound(s)).explode();
}
@@ -142,4 +143,20 @@ inline key maximum_key() {
return key(key::kind::after_all_keys);
};
class decorated_key_view {
const dht::token& _token;
key_view _partition_key;
public:
decorated_key_view(const dht::token& token, key_view partition_key) noexcept
: _token(token), _partition_key(partition_key) { }
const dht::token& token() const {
return _token;
}
key_view key() const {
return _partition_key;
}
};
}

View File

@@ -37,10 +37,10 @@
namespace sstables {
static inline bytes pop_back(std::vector<bytes>& vec) {
static inline bytes_view pop_back(std::vector<bytes_view>& vec) {
auto b = std::move(vec.back());
vec.pop_back();
return std::move(b);
return b;
}
class sstable_streamed_mutation;
@@ -104,11 +104,11 @@ public:
struct column {
bool is_static;
bytes_view col_name;
std::vector<bytes> clustering;
std::vector<bytes_view> clustering;
// see is_collection. collections have an extra element aside from the name.
// This will be non-zero size if this is a collection, and zero size otherwise.
bytes collection_extra_data;
bytes cell;
bytes_view collection_extra_data;
bytes_view cell;
const column_definition *cdef;
bool is_present;
@@ -148,7 +148,7 @@ public:
return col;
}
std::vector<bytes> extract_clustering_key(const schema& schema) {
std::vector<bytes_view> extract_clustering_key(const schema& schema) {
return composite_view(col_name, schema.is_compound()).explode();
}
column(const schema& schema, bytes_view col, api::timestamp_type timestamp)
@@ -157,7 +157,7 @@ public:
, clustering(extract_clustering_key(schema))
, collection_extra_data(is_collection(schema) ? pop_back(clustering) : bytes()) // collections are not supported with COMPACT STORAGE, so this is fine
, cell(!schema.is_dense() ? pop_back(clustering) : (*(schema.regular_begin())).name()) // dense: cell name is not provided. It is the only regular column
, cdef(schema.get_column_definition(cell))
, cdef(schema.get_column_definition(to_bytes(cell)))
, is_present(cdef && timestamp > cdef->dropped_at())
{
@@ -168,12 +168,6 @@ public:
}
}
}
// See schema::prepare_dense_schema. We can, using v3 schemas, have columns we consider "static" without
// the table being compound, i.e. no clustering. We can ignore prefixes, but we still need to produce
// static for mutations.
if (cdef && !is_static && cdef->is_static() && schema.is_static_compact_table()) {
is_static = true;
}
if (is_present && is_static != cdef->is_static()) {
throw malformed_sstable_exception(seastar::format("Mismatch between {} cell and {} column definition",
is_static ? "static" : "non-static", cdef->is_static() ? "static" : "non-static"));
@@ -227,7 +221,7 @@ private:
if (!_pending_collection || _pending_collection->is_new_collection(cdef)) {
flush_pending_collection(*_schema);
if (!cdef->type->is_multi_cell()) {
if (!cdef->is_multi_cell()) {
throw malformed_sstable_exception("frozen set should behave like a cell\n");
}
_pending_collection = collection_mutation(cdef);
@@ -403,12 +397,12 @@ public:
return ret;
}
proceed flush_if_needed(bool is_static, const exploded_clustering_prefix& ecp) {
proceed flush_if_needed(bool is_static, const std::vector<bytes_view>& ecp) {
auto pos = [&] {
if (is_static) {
return position_in_partition(position_in_partition::static_row_tag_t());
} else {
auto ck = clustering_key_prefix::from_clustering_prefix(*_schema, ecp);
auto ck = clustering_key_prefix::from_exploded_view(ecp);
return position_in_partition(position_in_partition::clustering_row_tag_t(), std::move(ck));
}
}();
@@ -460,8 +454,7 @@ public:
struct column col(*_schema, col_name, timestamp);
auto clustering_prefix = exploded_clustering_prefix(std::move(col.clustering));
auto ret = flush_if_needed(col.is_static, clustering_prefix);
auto ret = flush_if_needed(col.is_static, col.clustering);
if (_skip_in_progress) {
return ret;
}
@@ -506,11 +499,11 @@ public:
auto ac = make_atomic_cell(timestamp, value, ttl, expiration);
bool is_multi_cell = col.collection_extra_data.size();
if (is_multi_cell != col.cdef->type->is_multi_cell()) {
if (is_multi_cell != col.cdef->is_multi_cell()) {
return;
}
if (is_multi_cell) {
update_pending_collection(col.cdef, std::move(col.collection_extra_data), std::move(ac));
update_pending_collection(col.cdef, to_bytes(col.collection_extra_data), std::move(ac));
return;
}
@@ -535,8 +528,7 @@ public:
}
proceed consume_deleted_cell(column &col, int64_t timestamp, gc_clock::time_point ttl) {
auto clustering_prefix = exploded_clustering_prefix(std::move(col.clustering));
auto ret = flush_if_needed(col.is_static, clustering_prefix);
auto ret = flush_if_needed(col.is_static, col.clustering);
if (_skip_in_progress) {
return ret;
}
@@ -553,12 +545,12 @@ public:
auto ac = atomic_cell::make_dead(timestamp, ttl);
bool is_multi_cell = col.collection_extra_data.size();
if (is_multi_cell != col.cdef->type->is_multi_cell()) {
if (is_multi_cell != col.cdef->is_multi_cell()) {
return ret;
}
if (is_multi_cell) {
update_pending_collection(col.cdef, std::move(col.collection_extra_data), std::move(ac));
update_pending_collection(col.cdef, to_bytes(col.collection_extra_data), std::move(ac));
} else if (col.is_static) {
_in_progress->as_mutable_static_row().set_cell(*col.cdef, atomic_cell_or_collection(std::move(ac)));
} else {
@@ -580,7 +572,7 @@ public:
return proceed::yes;
}
auto key = composite_view(column::fix_static_name(*_schema, col_name)).explode();
auto ck = clustering_key_prefix::from_exploded(std::move(key));
auto ck = clustering_key_prefix::from_exploded_view(key);
auto ret = flush_if_needed(std::move(ck));
if (!_skip_in_progress) {
_in_progress->as_mutable_clustering_row().apply(shadowable_tombstone(tombstone(deltime)));
@@ -636,9 +628,9 @@ public:
// Still, it is enough to check if we're dealing with a collection, since any other tombstone
// won't have a full clustering prefix (otherwise it isn't a range)
if (start.size() <= _schema->clustering_key_size()) {
auto start_ck = clustering_key_prefix::from_exploded(std::move(start));
auto start_ck = clustering_key_prefix::from_exploded_view(start);
auto start_kind = start_marker_to_bound_kind(start_col);
auto end = clustering_key_prefix::from_exploded(composite_view(column::fix_static_name(*_schema, end_col)).explode());
auto end = clustering_key_prefix::from_exploded_view(composite_view(column::fix_static_name(*_schema, end_col)).explode());
auto end_kind = end_marker_to_bound_kind(end_col);
if (range_tombstone::is_single_clustering_row_tombstone(*_schema, start_ck, start_kind, end, end_kind)) {
auto ret = flush_if_needed(std::move(start_ck));
@@ -664,9 +656,9 @@ public:
}
} else {
auto&& column = pop_back(start);
auto cdef = _schema->get_column_definition(column);
if (cdef && cdef->type->is_multi_cell() && deltime.marked_for_delete_at > cdef->dropped_at()) {
auto ret = flush_if_needed(cdef->is_static(), exploded_clustering_prefix(std::move(start)));
auto cdef = _schema->get_column_definition(to_bytes(column));
if (cdef && cdef->is_multi_cell() && deltime.marked_for_delete_at > cdef->dropped_at()) {
auto ret = flush_if_needed(cdef->is_static(), start);
if (!_skip_in_progress) {
update_pending_collection(cdef, tombstone(deltime));
}
@@ -841,12 +833,14 @@ public:
sstable_streamed_mutation(sstable_streamed_mutation&&) = delete;
virtual future<> fill_buffer() final override {
_ds->_consumer.push_ready_fragments();
if (is_buffer_full() || is_end_of_stream()) {
return make_ready_future<>();
}
return _ds->_consumer.maybe_skip().then([this] {
return _ds->_context.read();
return do_until([this] { return !is_buffer_empty() || is_end_of_stream(); }, [this] {
_ds->_consumer.push_ready_fragments();
if (is_buffer_full() || is_end_of_stream()) {
return make_ready_future<>();
}
return _ds->_consumer.maybe_skip().then([this] {
return _ds->_context.read();
});
});
}


@@ -649,6 +649,7 @@ future<> parse(random_access_reader& in, summary& s) {
buf.trim_front(keysize);
// FIXME: This is a le read. We should make this explicit
entry.position = *(reinterpret_cast<const net::packed<uint64_t> *>(buf.get()));
entry.token = dht::global_partitioner().get_token(entry.get_key());
return make_ready_future<>();
});
@@ -987,7 +988,7 @@ future<> sstable::read_simple(T& component, const io_priority_class& pc) {
auto f = make_checked_file(_read_error_handler, fi);
auto r = make_lw_shared<file_random_access_reader>(std::move(f), size, sstable_buffer_size);
auto fut = parse(*r, component);
return fut.finally([r = std::move(r)] {
return fut.finally([r] {
return r->close();
}).then([r] {});
});
@@ -1266,6 +1267,18 @@ future<foreign_sstable_open_info> sstable::get_open_info() & {
});
}
static composite::eoc bound_kind_to_start_marker(bound_kind start_kind) {
return start_kind == bound_kind::excl_start
? composite::eoc::end
: composite::eoc::start;
}
static composite::eoc bound_kind_to_end_marker(bound_kind end_kind) {
return end_kind == bound_kind::excl_end
? composite::eoc::start
: composite::eoc::end;
}
static void output_promoted_index_entry(bytes_ostream& promoted_index,
const bytes& first_col,
const bytes& last_col,
@@ -1322,8 +1335,9 @@ static bytes serialize_colname(const composite& clustering_key,
// (which might be gone later).
void sstable::maybe_flush_pi_block(file_writer& out,
const composite& clustering_key,
const std::vector<bytes_view>& column_names) {
bytes colname = serialize_colname(clustering_key, column_names, composite::eoc::none);
const std::vector<bytes_view>& column_names,
composite::eoc marker) {
bytes colname = serialize_colname(clustering_key, column_names, marker);
if (_pi_write.block_first_colname.empty()) {
// This is the first column in the partition, or first column since we
// closed a promoted-index block. Remember its name and position -
@@ -1355,7 +1369,9 @@ void sstable::maybe_flush_pi_block(file_writer& out,
auto start = composite::from_clustering_element(*_pi_write.schemap, rt.start);
auto end = composite::from_clustering_element(*_pi_write.schemap, rt.end);
write_range_tombstone(out,
start, rt.start_kind, end, rt.end_kind, {}, rt.tomb);
start, bound_kind_to_start_marker(rt.start_kind),
end, bound_kind_to_end_marker(rt.end_kind),
{}, rt.tomb);
}
}
_pi_write.block_next_start_offset = out.offset() + _pi_write.desired_block_size;
@@ -1533,24 +1549,18 @@ void sstable::write_row_tombstone(file_writer& out, const composite& key, const
void sstable::write_range_tombstone(file_writer& out,
const composite& start,
bound_kind start_kind,
composite::eoc start_marker,
const composite& end,
bound_kind end_kind,
composite::eoc end_marker,
std::vector<bytes_view> suffix,
const tombstone t) {
if (!t) {
return;
}
auto start_marker = start_kind == bound_kind::excl_start
? composite::eoc::end
: composite::eoc::start;
write_column_name(out, start, suffix, start_marker);
column_mask mask = column_mask::range_tombstone;
write(out, mask);
auto end_marker = end_kind == bound_kind::excl_end
? composite::eoc::start
: composite::eoc::end;
write_column_name(out, end, suffix, end_marker);
write_deletion_time(out, t);
}
@@ -1721,10 +1731,10 @@ static void prepare_compression(compression& c, const schema& schema) {
c.init_full_checksum();
}
static void maybe_add_summary_entry(summary& s, bytes_view key, uint64_t offset) {
static void maybe_add_summary_entry(summary& s, const dht::token& token, bytes_view key, uint64_t offset) {
// Maybe add summary entry into in-memory representation of summary file.
if ((s.keys_written++ % s.header.min_index_interval) == 0) {
s.entries.push_back({ bytes(key.data(), key.size()), offset });
s.entries.push_back({ token, bytes(key.data(), key.size()), offset });
}
}
@@ -1830,6 +1840,7 @@ components_writer::components_writer(sstable& sst, const schema& s, file_writer&
, _schema(s)
, _out(out)
, _index(index_file_writer(sst, pc))
, _index_needs_close(true)
, _max_sstable_size(cfg.max_sstable_size)
, _tombstone_written(false)
{
@@ -1847,7 +1858,7 @@ void components_writer::consume_new_partition(const dht::decorated_key& dk) {
_partition_key = key::from_partition_key(_schema, dk.key());
maybe_add_summary_entry(_sst._components->summary, bytes_view(*_partition_key), _index.offset());
maybe_add_summary_entry(_sst._components->summary, dk.token(), bytes_view(*_partition_key), _index.offset());
_sst._components->filter->add(bytes_view(*_partition_key));
_sst._collector.add_key(bytes_view(*_partition_key));
@@ -1915,9 +1926,11 @@ stop_iteration components_writer::consume(range_tombstone&& rt) {
// already closed by rt.start, so the accumulator doesn't grow boundless.
_sst._pi_write.tombstone_accumulator->apply(rt);
auto start = composite::from_clustering_element(_schema, std::move(rt.start));
auto start_marker = bound_kind_to_start_marker(rt.start_kind);
auto end = composite::from_clustering_element(_schema, std::move(rt.end));
_sst.maybe_flush_pi_block(_out, start, {});
_sst.write_range_tombstone(_out, std::move(start), rt.start_kind, std::move(end), rt.end_kind, {}, rt.tomb);
auto end_marker = bound_kind_to_end_marker(rt.end_kind);
_sst.maybe_flush_pi_block(_out, start, {}, start_marker);
_sst.write_range_tombstone(_out, std::move(start), start_marker, std::move(end), end_marker, {}, rt.tomb);
return stop_iteration::no;
}
@@ -1959,6 +1972,7 @@ stop_iteration components_writer::consume_end_of_partition() {
void components_writer::consume_end_of_stream() {
seal_summary(_sst._components->summary, std::move(_first_key), std::move(_last_key)); // what if there is only one partition? what if it is empty?
_index_needs_close = false;
_index.close().get();
if (_sst.has_component(sstable::component_type::CompressionInfo)) {
@@ -1970,6 +1984,16 @@ void components_writer::consume_end_of_stream() {
_sst._schema, _sst.get_first_decorated_key(), _sst.get_last_decorated_key());
}
components_writer::~components_writer() {
if (_index_needs_close) {
try {
_index.close().get();
} catch (...) {
sstlog.error("components_writer failed to close file: {}", std::current_exception());
}
}
}
future<>
sstable::read_scylla_metadata(const io_priority_class& pc) {
if (_components->scylla_metadata) {
@@ -2090,7 +2114,9 @@ future<> sstable::write_components(::mutation_reader mr,
if (cfg.replay_position) {
_collector.set_replay_position(cfg.replay_position.value());
}
return seastar::async([this, mr = std::move(mr), estimated_partitions, schema = std::move(schema), cfg, &pc] () mutable {
seastar::thread_attributes attr;
attr.scheduling_group = cfg.thread_scheduling_group;
return seastar::async(std::move(attr), [this, mr = std::move(mr), estimated_partitions, schema = std::move(schema), cfg, &pc] () mutable {
auto wr = get_writer(*schema, estimated_partitions, cfg, pc);
consume_flattened_in_thread(mr, wr);
});
@@ -2112,7 +2138,8 @@ future<> sstable::generate_summary(const io_priority_class& pc) {
return true;
}
void consume_entry(index_entry&& ie, uint64_t offset) {
maybe_add_summary_entry(_summary, ie.get_key_bytes(), offset);
auto token = dht::global_partitioner().get_token(ie.get_key());
maybe_add_summary_entry(_summary, token, ie.get_key_bytes(), offset);
if (!first_key) {
first_key = key(to_bytes(ie.get_key_bytes()));
} else {


@@ -53,6 +53,10 @@
#include "sstables/shared_index_lists.hh"
#include "db/commitlog/replay_position.hh"
namespace seastar {
class thread_scheduling_group;
}
namespace sstables {
extern logging::logger sstlog;
@@ -131,6 +135,7 @@ struct sstable_writer_config {
bool backup = false;
bool leave_unsealed = false;
stdx::optional<db::replay_position> replay_position;
seastar::thread_scheduling_group* thread_scheduling_group = nullptr;
};
class sstable : public enable_lw_shared_from_this<sstable> {
@@ -492,7 +497,8 @@ private:
void maybe_flush_pi_block(file_writer& out,
const composite& clustering_key,
const std::vector<bytes_view>& column_names);
const std::vector<bytes_view>& column_names,
composite::eoc marker = composite::eoc::none);
schema_ptr _schema;
sstring _dir;
@@ -597,9 +603,9 @@ private:
void write_cell(file_writer& out, atomic_cell_view cell, const column_definition& cdef);
void write_column_name(file_writer& out, const composite& clustering_key, const std::vector<bytes_view>& column_names, composite::eoc marker = composite::eoc::none);
void write_column_name(file_writer& out, bytes_view column_names);
void write_range_tombstone(file_writer& out, const composite& start, bound_kind start_kind, const composite& end, bound_kind stop_kind, std::vector<bytes_view> suffix, const tombstone t);
void write_range_tombstone(file_writer& out, const composite& start, composite::eoc start_marker, const composite& end, composite::eoc end_marker, std::vector<bytes_view> suffix, const tombstone t);
void write_range_tombstone(file_writer& out, const composite& start, const composite& end, std::vector<bytes_view> suffix, const tombstone t) {
write_range_tombstone(out, start, bound_kind::incl_start, end, bound_kind::incl_end, std::move(suffix), std::move(t));
write_range_tombstone(out, start, composite::eoc::start, end, composite::eoc::end, std::move(suffix), std::move(t));
}
void write_collection(file_writer& out, const composite& clustering_key, const column_definition& cdef, collection_mutation_view collection);
void write_row_tombstone(file_writer& out, const composite& key, const row_tombstone t);
@@ -766,6 +772,7 @@ class components_writer {
const schema& _schema;
file_writer& _out;
file_writer _index;
bool _index_needs_close;
uint64_t _max_sstable_size;
bool _tombstone_written;
// Remember first and last keys, which we need for the summary file.
@@ -781,6 +788,12 @@ private:
}
public:
components_writer(sstable& sst, const schema& s, file_writer& out, uint64_t estimated_partitions, const sstable_writer_config&, const io_priority_class& pc);
~components_writer();
components_writer(components_writer&& o) : _sst(o._sst), _schema(o._schema), _out(o._out), _index(std::move(o._index)),
_index_needs_close(o._index_needs_close), _max_sstable_size(o._max_sstable_size), _tombstone_written(o._tombstone_written),
_first_key(std::move(o._first_key)), _last_key(std::move(o._last_key)), _partition_key(std::move(o._partition_key)) {
o._index_needs_close = false;
}
void consume_new_partition(const dht::decorated_key& dk);
void consume(tombstone t);


@@ -118,6 +118,7 @@ public:
class index_entry {
temporary_buffer<char> _key;
mutable stdx::optional<dht::token> _token;
uint64_t _position;
temporary_buffer<char> _promoted_index_bytes;
stdx::optional<promoted_index> _promoted_index;
@@ -131,6 +132,13 @@ public:
return key_view{get_key_bytes()};
}
decorated_key_view get_decorated_key() const {
if (!_token) {
_token.emplace(dht::global_partitioner().get_token(get_key()));
}
return decorated_key_view(*_token, get_key());
}
uint64_t position() const {
return _position;
}
@@ -164,6 +172,7 @@ public:
};
struct summary_entry {
dht::token token;
bytes key;
uint64_t position;
@@ -171,6 +180,10 @@ struct summary_entry {
return key_view{key};
}
decorated_key_view get_decorated_key() const {
return decorated_key_view(token, get_key());
}
bool operator==(const summary_entry& x) const {
return position == x.position && key == x.key;
}


@@ -466,7 +466,10 @@ protected:
std::vector<future<>> more_data;
for (auto& rd : _next_readers) {
if (rd->is_buffer_empty() && !rd->is_end_of_stream()) {
more_data.emplace_back(rd->fill_buffer());
auto f = rd->fill_buffer();
if (!f.available() || f.failed()) {
more_data.emplace_back(std::move(f));
}
}
}
if (!more_data.empty()) {


@@ -225,6 +225,7 @@ public:
}
return *this;
}
[[gnu::always_inline]]
~mutation_fragment() {
if (_data) {
destroy_data();
@@ -717,6 +718,30 @@ public:
}
};
// Consumes mutation fragments until StopCondition is true.
// The consumer will stop iff StopCondition returns true, in particular
// reaching the end of stream alone won't stop the reader.
template<typename StopCondition, typename ConsumeMutationFragment, typename ConsumeEndOfStream>
GCC6_CONCEPT(requires requires(StopCondition stop, ConsumeMutationFragment consume_mf, ConsumeEndOfStream consume_eos, mutation_fragment mf) {
{ stop() } -> bool;
{ consume_mf(std::move(mf)) } -> void;
{ consume_eos() } -> future<>;
})
future<> consume_mutation_fragments_until(streamed_mutation& sm, StopCondition&& stop,
ConsumeMutationFragment&& consume_mf, ConsumeEndOfStream&& consume_eos) {
return do_until([stop] { return stop(); }, [&sm, stop, consume_mf, consume_eos] {
while (!sm.is_buffer_empty()) {
consume_mf(sm.pop_mutation_fragment());
if (stop()) {
return make_ready_future<>();
}
}
if (sm.is_end_of_stream()) {
return consume_eos();
}
return sm.fill_buffer();
});
}
GCC6_CONCEPT(
// F gets a stream element as an argument and returns the new value which replaces that element


@@ -213,6 +213,12 @@ SEASTAR_TEST_CASE(test_commitlog_discard_completed_segments){
BOOST_REQUIRE(nn <= names.size());
BOOST_REQUIRE(dn <= nn);
});
}).then([&log] {
return log.shutdown().then([&log] {
return log.list_existing_segments().then([] (auto descs) {
BOOST_REQUIRE(descs.empty());
});
});
});
});
}


@@ -296,13 +296,18 @@ BOOST_AUTO_TEST_CASE(test_composite_from_exploded) {
}
BOOST_AUTO_TEST_CASE(test_composite_view_explode) {
auto to_owning_vector = [] (std::vector<bytes_view> bvs) {
return boost::copy_range<std::vector<bytes>>(bvs | boost::adaptors::transformed([] (auto bv) {
return bytes(bv.begin(), bv.end());
}));
};
{
BOOST_REQUIRE_EQUAL(composite_view(composite(bytes({'\x00', '\x03', 'e', 'l', '1', '\x00'}))).explode(),
BOOST_REQUIRE_EQUAL(to_owning_vector(composite_view(composite(bytes({'\x00', '\x03', 'e', 'l', '1', '\x00'}))).explode()),
std::vector<bytes>({bytes({'e', 'l', '1'})}));
}
{
BOOST_REQUIRE_EQUAL(composite_view(composite(bytes({'e', 'l', '1'}), false)).explode(),
BOOST_REQUIRE_EQUAL(to_owning_vector(composite_view(composite(bytes({'e', 'l', '1'}), false)).explode()),
std::vector<bytes>({bytes({'e', 'l', '1'})}));
}
}


@@ -1960,27 +1960,11 @@ SEASTAR_TEST_CASE(test_compact_storage) {
});
return e.execute_cql("create table tcs4 (p1 int PRIMARY KEY, c1 int, c2 int) with compact storage;").discard_result();
}).then([&e] {
// from v3 style schema, tcs4 will be given a synthetic clustering, and insert of only PK is not
// legal. Must include a value.
return make_ready_future().then([&e] {
return e.execute_cql("insert into tcs4 (p1) values (1);");
}).then_wrapped([](auto f) {
try {
f.get();
BOOST_FAIL("Should not reach");
} catch (...) {
// ok
}
return make_ready_future();
});
}).then([&e] {
return e.execute_cql("insert into tcs4 (p1, c1) values (1, 2);").discard_result();
return e.execute_cql("insert into tcs4 (p1) values (1);").discard_result();
}).then([&e] {
return e.execute_cql("select * from tcs4;");
}).then([&e] (auto msg) {
assert_that(msg).is_rows().with_rows({
{ int32_type->decompose(1), int32_type->decompose(2), {} },
});
assert_that(msg).is_rows().with_rows({ });
});
});
}


@@ -31,9 +31,9 @@ public:
{ }
// If ck_ranges is passed, verifies only that information relevant for ck_ranges matches.
mutation_assertion& is_equal_to(const mutation& other, const query::clustering_row_ranges& ck_ranges = {}) {
if (!ck_ranges.empty()) {
mutation_assertion(_m.sliced(ck_ranges)).is_equal_to(other.sliced(ck_ranges));
mutation_assertion& is_equal_to(const mutation& other, stdx::optional<query::clustering_row_ranges> ck_ranges = {}) {
if (ck_ranges) {
mutation_assertion(_m.sliced(*ck_ranges)).is_equal_to(other.sliced(*ck_ranges));
return *this;
}
if (_m != other) {


@@ -51,7 +51,7 @@ public:
return *this;
}
reader_assertions& produces(mutation m, const query::clustering_row_ranges& ck_ranges = {}) {
reader_assertions& produces(mutation m, stdx::optional<query::clustering_row_ranges> ck_ranges = {}) {
BOOST_TEST_MESSAGE(sprint("Expecting %s", m));
auto mo = read_next();
BOOST_REQUIRE(bool(mo));


@@ -925,8 +925,8 @@ BOOST_AUTO_TEST_CASE(test_split_range_single_shard) {
// tests for range_split() utility function in repair/range_split.hh
static int test_split(int N, int K) {
auto t1 = token_from_long(0x6000'0000'0000'0000);
auto t2 = token_from_long(0x9000'0000'0000'0000);
auto t1 = token_from_long(0x2000'0000'0000'0000);
auto t2 = token_from_long(0x5000'0000'0000'0000);
dht::token_range r{range_bound<dht::token>(t1), range_bound<dht::token>(t2)};
auto splitter = range_splitter(r, N, K);
int c = 0;
@@ -955,3 +955,92 @@ BOOST_AUTO_TEST_CASE(test_split_1) {
// 7.8125, so expect 2^7 = 128 ranges:
BOOST_REQUIRE(test_split(1000, 11) == 128);
}
static
void
test_something_with_some_interesting_ranges_and_partitioners_with_token_range(std::function<void (const dht::i_partitioner&, const schema&, const dht::token_range&)> func_to_test) {
auto s = schema_builder("ks", "cf")
.with_column("c1", int32_type, column_kind::partition_key)
.with_column("c2", int32_type, column_kind::partition_key)
.with_column("v", int32_type)
.build();
auto some_murmur3_partitioners = {
dht::murmur3_partitioner(1, 0),
dht::murmur3_partitioner(7, 4),
dht::murmur3_partitioner(4, 0),
dht::murmur3_partitioner(32, 8), // More, and we OOM since memory isn't configured
};
auto some_random_partitioners = {
dht::random_partitioner(1),
dht::random_partitioner(3),
};
auto t1 = token_from_long(int64_t(-0x7fff'ffff'ffff'fffe));
auto t2 = token_from_long(int64_t(-1));
auto t3 = token_from_long(int64_t(1));
auto t4 = token_from_long(int64_t(0x7fff'ffff'ffff'fffe));
auto make_bound = [] (dht::token t) {
return range_bound<dht::token>(std::move(t));
};
auto some_murmur3_ranges = {
dht::token_range::make_open_ended_both_sides(),
dht::token_range::make_starting_with(make_bound(t1)),
dht::token_range::make_starting_with(make_bound(t2)),
dht::token_range::make_starting_with(make_bound(t3)),
dht::token_range::make_starting_with(make_bound(t4)),
dht::token_range::make_ending_with(make_bound(t1)),
dht::token_range::make_ending_with(make_bound(t2)),
dht::token_range::make_ending_with(make_bound(t3)),
dht::token_range::make_ending_with(make_bound(t4)),
dht::token_range(make_bound(t2), make_bound(t3)),
dht::token_range(make_bound(t1), make_bound(t4)),
};
for (auto&& part : some_murmur3_partitioners) {
for (auto&& range : some_murmur3_ranges) {
func_to_test(part, *s, range);
}
}
for (auto&& part : some_random_partitioners) {
func_to_test(part, *s, dht::token_range::make_open_ended_both_sides());
}
}
static
void
do_test_selective_token_range_sharder(const dht::i_partitioner& part, const schema& s, const dht::token_range& range) {
dht::set_global_partitioner(part.name());
bool debug = false;
for (auto shard : boost::irange(0u, part.shard_count())) {
auto sharder = dht::selective_token_range_sharder(part, range, shard);
auto range_shard = sharder.next();
while (range_shard) {
if (range_shard->start() && range_shard->start()->is_inclusive()) {
auto start_shard = part.shard_of(range_shard->start()->value());
if (debug) {
std::cout << " start_shard " << start_shard << " shard " << shard << " range " << range_shard << "\n";
}
BOOST_REQUIRE(start_shard == shard);
}
if (range_shard->end() && range_shard->end()->is_inclusive()) {
auto end_shard = part.shard_of(range_shard->end()->value());
if (debug) {
std::cout << " end_shard " << end_shard << " shard " << shard << " range " << range_shard << "\n";
}
BOOST_REQUIRE(end_shard == shard);
}
auto midpoint = part.midpoint(
range_shard->start() ? range_shard->start()->value() : dht::minimum_token(),
range_shard->end() ? range_shard->end()->value() : dht::minimum_token());
auto mid_shard = part.shard_of(midpoint);
if (debug) {
std::cout << " mid " << mid_shard << " shard " << shard << " range " << range_shard << "\n";
}
BOOST_REQUIRE(mid_shard == shard);
range_shard = sharder.next();
}
}
}
BOOST_AUTO_TEST_CASE(test_selective_token_range_sharder) {
return test_something_with_some_interesting_ranges_and_partitioners_with_token_range(do_test_selective_token_range_sharder);
}


@@ -102,9 +102,9 @@ struct test_result {
uint64_t index_misses() const { return after.index.misses - before.index.misses; }
uint64_t index_blocks() const { return after.index.blocks - before.index.blocks; }
uint64_t cache_hits() const { return after.cache.hits - before.cache.hits; }
uint64_t cache_misses() const { return after.cache.misses - before.cache.misses; }
uint64_t cache_insertions() const { return after.cache.insertions - before.cache.insertions; }
uint64_t cache_hits() const { return after.cache.partition_hits - before.cache.partition_hits; }
uint64_t cache_misses() const { return after.cache.partition_misses - before.cache.partition_misses; }
uint64_t cache_insertions() const { return after.cache.partition_insertions - before.cache.partition_insertions; }
float cpu_utilization() const {
auto busy_delta = after.busy_time.count() - before.busy_time.count();


@@ -132,7 +132,7 @@ int main(int argc, char** argv) {
auto fill_cache_to_the_top = [&] {
std::cout << "Filling up memory with evictable data\n";
while (true) {
auto evictions_before = tracker.get_stats().evictions;
auto evictions_before = tracker.get_stats().partition_evictions;
// Ensure that entries matching memtable partitions are evicted
// last, we want to hit the merge path in row_cache::update()
for (auto&& key : keys) {
@@ -141,7 +141,7 @@ int main(int argc, char** argv) {
auto m = make_small_mutation();
cache_stuffing.push_back(m.decorated_key());
cache.populate(m);
if (tracker.get_stats().evictions > evictions_before) {
if (tracker.get_stats().partition_evictions > evictions_before) {
break;
}
}


@@ -1024,17 +1024,6 @@ static std::vector<mutation> updated_ring(std::vector<mutation>& mutations) {
return result;
}
static mutation_source make_mutation_source(std::vector<lw_shared_ptr<memtable>>& memtables) {
return mutation_source([&memtables] (schema_ptr s, const dht::partition_range& pr,
const query::partition_slice& slice, const io_priority_class& pc, tracing::trace_state_ptr trace, streamed_mutation::forwarding fwd) {
std::vector<mutation_reader> readers;
for (auto&& mt : memtables) {
readers.emplace_back(mt->make_reader(s, pr, slice, pc, trace, fwd));
}
return make_combined_reader(std::move(readers));
});
}
SEASTAR_TEST_CASE(test_continuity_flag_and_invalidate_race) {
return seastar::async([] {
auto s = make_schema();
@@ -1753,3 +1742,40 @@ SEASTAR_TEST_CASE(test_tombstone_merging_in_partial_partition) {
}
});
}
SEASTAR_TEST_CASE(test_query_only_static_row) {
return seastar::async([] {
simple_schema s;
auto cache_mt = make_lw_shared<memtable>(s.schema());
auto pkeys = s.make_pkeys(1);
mutation m1(pkeys[0], s.schema());
s.add_static_row(m1, "s1");
s.add_row(m1, s.make_ckey(0), "v1");
s.add_row(m1, s.make_ckey(1), "v2");
cache_mt->apply(m1);
cache_tracker tracker;
row_cache cache(s.schema(), snapshot_source_from_snapshot(cache_mt->as_data_source()), tracker);
// fully populate cache
{
auto prange = dht::partition_range::make_ending_with(dht::ring_position(m1.decorated_key()));
assert_that(cache.make_reader(s.schema(), prange, query::full_slice))
.produces(m1)
.produces_end_of_stream();
}
// query just a static row
{
auto slice = partition_slice_builder(*s.schema())
.with_ranges({ })
.build();
auto prange = dht::partition_range::make_ending_with(dht::ring_position(m1.decorated_key()));
assert_that(cache.make_reader(s.schema(), prange, slice))
.produces(m1, slice.row_ranges(*s.schema(), m1.key()))
.produces_end_of_stream();
}
});
}


@@ -26,6 +26,7 @@
#include <seastar/util/defer.hh>
#include "tests/cql_test_env.hh"
#include "tests/cql_assertions.hh"
#include "tests/mutation_source_test.hh"
#include "tests/result_set_assertions.hh"
#include "service/migration_manager.hh"
@@ -68,7 +69,6 @@ SEASTAR_TEST_CASE(test_schema_is_updated_in_keyspace) {
return seastar::async([&] {
auto builder = schema_builder("tests", "table")
.with_column("pk", bytes_type, column_kind::partition_key)
.with_column("ck", bytes_type, column_kind::clustering_key)
.with_column("v1", bytes_type);
e.execute_cql("create keyspace tests with replication = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 };").get();
@@ -130,6 +130,53 @@ SEASTAR_TEST_CASE(test_tombstones_are_ignored_in_version_calculation) {
});
}
SEASTAR_TEST_CASE(test_concurrent_column_addition) {
return do_with_cql_env([](cql_test_env& e) {
return seastar::async([&] {
e.execute_cql("create keyspace tests with replication = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 };").get();
service::migration_manager& mm = service::get_local_migration_manager();
auto s0 = schema_builder("ks", "table")
.with_column("pk", bytes_type, column_kind::partition_key)
.with_column("v1", bytes_type)
.build();
auto s1 = schema_builder("ks", "table")
.with_column("pk", bytes_type, column_kind::partition_key)
.with_column("v1", bytes_type)
.with_column("v3", bytes_type)
.build();
auto s2 = schema_builder("ks", "table", stdx::make_optional(s1->id()))
.with_column("pk", bytes_type, column_kind::partition_key)
.with_column("v1", bytes_type)
.with_column("v2", bytes_type)
.build();
mm.announce_new_column_family(s1, false).get();
auto old_version = e.db().local().find_schema(s1->id())->version();
// Apply s0 -> s2 change.
{
auto&& keyspace = e.db().local().find_keyspace(s0->ks_name()).metadata();
auto muts = db::schema_tables::make_update_table_mutations(keyspace, s0, s2,
api::new_timestamp(), false).get0();
mm.announce(std::move(muts), true).get();
}
auto new_schema = e.db().local().find_schema(s1->id());
BOOST_REQUIRE(new_schema->get_column_definition(to_bytes("v1")) != nullptr);
BOOST_REQUIRE(new_schema->get_column_definition(to_bytes("v2")) != nullptr);
BOOST_REQUIRE(new_schema->get_column_definition(to_bytes("v3")) != nullptr);
BOOST_REQUIRE(new_schema->version() != old_version);
BOOST_REQUIRE(new_schema->version() != s2->version());
});
});
}
SEASTAR_TEST_CASE(test_column_is_dropped) {
return do_with_cql_env([](cql_test_env& e) {
return seastar::async([&] {
@@ -146,6 +193,59 @@ SEASTAR_TEST_CASE(test_column_is_dropped) {
});
}
SEASTAR_TEST_CASE(test_combined_column_add_and_drop) {
return do_with_cql_env([](cql_test_env& e) {
return seastar::async([&] {
service::migration_manager& mm = service::get_local_migration_manager();
e.execute_cql("create keyspace tests with replication = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 };").get();
auto s1 = schema_builder("ks", "table1")
.with_column("pk", bytes_type, column_kind::partition_key)
.with_column("v1", bytes_type)
.build();
mm.announce_new_column_family(s1, false).get();
auto&& keyspace = e.db().local().find_keyspace(s1->ks_name()).metadata();
auto s2 = schema_builder("ks", "table1", stdx::make_optional(s1->id()))
.with_column("pk", bytes_type, column_kind::partition_key)
.without_column("v1", bytes_type, api::new_timestamp())
.build();
// Drop v1
{
auto muts = db::schema_tables::make_update_table_mutations(keyspace, s1, s2,
api::new_timestamp(), false).get0();
mm.announce(std::move(muts), true).get();
}
// Add a new v1 and drop it
{
auto s3 = schema_builder("ks", "table1", stdx::make_optional(s1->id()))
.with_column("pk", bytes_type, column_kind::partition_key)
.with_column("v1", list_type_impl::get_instance(int32_type, true))
.build();
auto s4 = schema_builder("ks", "table1", stdx::make_optional(s1->id()))
.with_column("pk", bytes_type, column_kind::partition_key)
.without_column("v1", list_type_impl::get_instance(int32_type, true), api::new_timestamp())
.build();
auto muts = db::schema_tables::make_update_table_mutations(keyspace, s3, s4,
api::new_timestamp(), false).get0();
mm.announce(std::move(muts), true).get();
}
auto new_schema = e.db().local().find_schema(s1->id());
BOOST_REQUIRE(new_schema->get_column_definition(to_bytes("v1")) == nullptr);
assert_that_failed(e.execute_cql("alter table ks.table1 add v1 list<text>;"));
});
});
}
class counting_migration_listener : public service::migration_listener {
public:
int create_keyspace_count = 0;

View File

@@ -0,0 +1,74 @@
/*
* Copyright (C) 2017 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "dht/i_partitioner.hh"
#include "schema.hh"
#include "sstables/index_reader.hh"
class index_reader_assertions {
std::unique_ptr<sstables::index_reader> _r;
public:
index_reader_assertions(std::unique_ptr<sstables::index_reader> r)
: _r(std::move(r))
{ }
index_reader_assertions& has_monotonic_positions(const schema& s) {
auto pos_cmp = position_in_partition::composite_less_compare(s);
auto rp_cmp = dht::ring_position_comparator(s);
auto prev = dht::ring_position::min();
_r->read_partition_data().get();
while (!_r->eof()) {
auto k = _r->current_partition_entry().get_decorated_key();
auto rp = dht::ring_position(k.token(), k.key().to_partition_key(s));
if (!rp_cmp(prev, rp)) {
BOOST_FAIL(sprint("Partitions have invalid order: %s >= %s", prev, rp));
}
prev = rp;
auto* pi = _r->current_partition_entry().get_promoted_index(s);
if (!pi->entries.empty()) {
auto& prev = pi->entries[0];
for (size_t i = 1; i < pi->entries.size(); ++i) {
auto& cur = pi->entries[i];
if (!pos_cmp(prev.end, cur.start)) {
std::cout << "promoted index:\n";
for (auto& e : pi->entries) {
std::cout << " " << e.start << "-" << e.end << ": +" << e.offset << " len=" << e.width << std::endl;
}
BOOST_FAIL(sprint("Index blocks are not monotonic: %s >= %s", prev.end, cur.start));
}
prev = cur;
}
}
_r->advance_to_next_partition().get();
}
return *this;
}
};
inline
index_reader_assertions assert_that(std::unique_ptr<sstables::index_reader> r) {
return { std::move(r) };
}
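The `has_monotonic_positions()` assertion above walks adjacent entries and fails unless every previous position compares strictly before the next one. The same invariant check, reduced to a generic standalone sketch (no Scylla types involved):

```cpp
#include <cassert>
#include <cstddef>
#include <functional>
#include <vector>

// Returns true iff every adjacent pair of elements is strictly
// increasing under the supplied comparator, mirroring the loop in
// index_reader_assertions::has_monotonic_positions().
template <typename T, typename Less = std::less<T>>
bool is_strictly_monotonic(const std::vector<T>& v, Less less = Less{}) {
    for (size_t i = 1; i < v.size(); ++i) {
        if (!less(v[i - 1], v[i])) {
            return false; // out-of-order adjacent pair found
        }
    }
    return true;
}
```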

View File

@@ -1463,11 +1463,12 @@ SEASTAR_TEST_CASE(datafile_generation_39) {
auto key = partition_key::from_exploded(*s, {to_bytes("key1")});
mutation m(key, s);
const column_definition& cl1 = *s->get_column_definition("cl1");
auto c_key = clustering_key::make_empty();
m.set_static_cell(cl1, make_atomic_cell(bytes_type->decompose(data_value(to_bytes("cl1")))));
const column_definition& cl1 = *s->get_column_definition("cl1");
m.set_clustered_cell(c_key, cl1, make_atomic_cell(bytes_type->decompose(data_value(to_bytes("cl1")))));
const column_definition& cl2 = *s->get_column_definition("cl2");
m.set_static_cell(cl2, make_atomic_cell(bytes_type->decompose(data_value(to_bytes("cl2")))));
m.set_clustered_cell(c_key, cl2, make_atomic_cell(bytes_type->decompose(data_value(to_bytes("cl2")))));
mtp->apply(std::move(m));
auto sst = make_lw_shared<sstable>(s, "tests/sstables/tests-temporary", 39, la, big);
@@ -1477,9 +1478,10 @@ SEASTAR_TEST_CASE(datafile_generation_39) {
return sstp->read_row(s, key).then([] (auto sm) {
return mutation_from_streamed_mutation(std::move(sm));
}).then([sstp, s] (auto mutation) {
auto row = mutation->partition().static_row();
match_live_cell(row, *s, "cl1", data_value(data_value(to_bytes("cl1"))));
match_live_cell(row, *s, "cl2", data_value(data_value(to_bytes("cl2"))));
auto& mp = mutation->partition();
auto row = mp.clustered_row(*s, clustering_key::make_empty());
match_live_cell(row.cells(), *s, "cl1", data_value(data_value(to_bytes("cl1"))));
match_live_cell(row.cells(), *s, "cl2", data_value(data_value(to_bytes("cl2"))));
return make_ready_future<>();
});
});
@@ -3610,10 +3612,28 @@ SEASTAR_TEST_CASE(test_skipping_using_index) {
});
}
static void copy_directory(boost::filesystem::path src_dir, boost::filesystem::path dst_dir) {
namespace fs = boost::filesystem;
fs::create_directory(dst_dir);
auto src_dir_components = std::distance(src_dir.begin(), src_dir.end());
using rdi = fs::recursive_directory_iterator;
// Boost 1.55.0 doesn't support range-for on recursive_directory_iterator
// (even though both earlier and later versions do support it)
for (auto&& dirent = rdi{src_dir}; dirent != rdi(); ++dirent) {
auto&& path = dirent->path();
auto new_path = dst_dir;
for (auto i = std::next(path.begin(), src_dir_components); i != path.end(); ++i) {
new_path /= *i;
}
fs::copy(path, new_path);
}
}
SEASTAR_TEST_CASE(test_unknown_component) {
return seastar::async([] {
auto tmp = make_lw_shared<tmpdir>();
auto sstp = reusable_sst(uncompressed_schema(), "tests/sstables/unknown_component", 1).get0();
copy_directory("tests/sstables/unknown_component", std::string(tmp->path) + "/unknown_component");
auto sstp = reusable_sst(uncompressed_schema(), tmp->path + "/unknown_component", 1).get0();
sstp->create_links(tmp->path).get();
// check that create_links() moved unknown component to new dir
BOOST_REQUIRE(file_exists(tmp->path + "/la-1-big-UNKNOWN.txt").get0());
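The `copy_directory()` helper above hand-rolls recursive copying because Boost 1.55.0's iterator lacks range-for support; with C++17's `std::filesystem` (a sketch, not the code the tree actually uses) the whole loop collapses to one call:

```cpp
#include <cassert>
#include <filesystem>

// Recursively copy a directory tree; std::filesystem::copy with
// copy_options::recursive creates dst and descends into subdirectories.
void copy_directory_std(const std::filesystem::path& src,
                        const std::filesystem::path& dst) {
    std::filesystem::copy(src, dst,
                          std::filesystem::copy_options::recursive);
}
```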

View File

@@ -35,6 +35,7 @@
#include "tmpdir.hh"
#include "memtable-sstable.hh"
#include "disk-error-handler.hh"
#include "tests/sstable_assertions.hh"
thread_local disk_error_signal_type commit_error;
thread_local disk_error_signal_type general_disk_error;
@@ -442,9 +443,10 @@ SEASTAR_TEST_CASE(compact_storage_sparse_read) {
return sstp->read_row(s, key).then([] (auto sm) {
return mutation_from_streamed_mutation(std::move(sm));
}).then([sstp, s, &key] (auto mutation) {
auto sr = mutation->partition().static_row();
match_live_cell(sr, *s, "cl1", data_value(to_bytes("cl1")));
match_live_cell(sr, *s, "cl2", data_value(to_bytes("cl2")));
auto& mp = mutation->partition();
auto row = mp.clustered_row(*s, clustering_key::make_empty());
match_live_cell(row.cells(), *s, "cl1", data_value(to_bytes("cl1")));
match_live_cell(row.cells(), *s, "cl2", data_value(to_bytes("cl2")));
return make_ready_future<>();
});
});
@@ -800,3 +802,52 @@ SEASTAR_TEST_CASE(test_non_compound_table_row_is_not_marked_as_static) {
BOOST_REQUIRE(bool(mut));
});
}
SEASTAR_TEST_CASE(test_promoted_index_blocks_are_monotonic) {
return seastar::async([] {
auto dir = make_lw_shared<tmpdir>();
schema_builder builder("ks", "cf");
builder.with_column("p", utf8_type, column_kind::partition_key);
builder.with_column("c1", int32_type, column_kind::clustering_key);
builder.with_column("c2", int32_type, column_kind::clustering_key);
builder.with_column("v", int32_type);
auto s = builder.build();
auto k = partition_key::from_exploded(*s, {to_bytes("key1")});
auto cell = atomic_cell::make_live(1, int32_type->decompose(88), { });
mutation m(k, s);
auto ck = clustering_key::from_exploded(*s, {int32_type->decompose(1), int32_type->decompose(2)});
m.set_clustered_cell(ck, *s->get_column_definition("v"), cell);
ck = clustering_key::from_exploded(*s, {int32_type->decompose(1), int32_type->decompose(4)});
m.set_clustered_cell(ck, *s->get_column_definition("v"), cell);
ck = clustering_key::from_exploded(*s, {int32_type->decompose(1), int32_type->decompose(6)});
m.set_clustered_cell(ck, *s->get_column_definition("v"), cell);
ck = clustering_key::from_exploded(*s, {int32_type->decompose(3), int32_type->decompose(9)});
m.set_clustered_cell(ck, *s->get_column_definition("v"), cell);
m.partition().apply_row_tombstone(*s, range_tombstone(
clustering_key_prefix::from_exploded(*s, {int32_type->decompose(1)}),
bound_kind::excl_start,
clustering_key_prefix::from_exploded(*s, {int32_type->decompose(2)}),
bound_kind::incl_end,
{1, gc_clock::now()}));
auto mt = make_lw_shared<memtable>(s);
mt->apply(std::move(m));
auto sst = make_lw_shared<sstables::sstable>(s,
dir->path,
1 /* generation */,
sstables::sstable::version_types::ka,
sstables::sstable::format_types::big);
sstable_writer_config cfg;
cfg.promoted_index_block_size = 1;
sst->write_components(mt->make_reader(s), 1, s, cfg).get();
sst->load().get();
assert_that(sst->get_index_reader(default_priority_class())).has_monotonic_positions(*s);
});
}

View File

@@ -243,7 +243,8 @@ SEASTAR_TEST_CASE(test_fragmenting_and_freezing_streamed_mutations) {
return make_ready_future<>();
}, 1).get0();
auto expected_fragments = boost::size(m.partition().non_dummy_rows())
auto&& rows = m.partition().non_dummy_rows();
auto expected_fragments = std::distance(rows.begin(), rows.end())
+ m.partition().row_tombstones().size()
+ !m.partition().static_row().empty();
BOOST_REQUIRE_EQUAL(fms.size(), std::max(expected_fragments, size_t(1)));
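The change above swaps `boost::size()` for `std::distance()` because the latter only needs an iterator pair and therefore also works on ranges without a constant-time size. A minimal sketch, using `std::forward_list` as a hypothetical stand-in for a forward-only range like `non_dummy_rows()`:

```cpp
#include <cassert>
#include <cstddef>
#include <forward_list>
#include <iterator>

// Count elements of a forward-only range by walking it; std::distance
// degrades to a linear traversal for non-random-access iterators.
size_t count_elements(const std::forward_list<int>& rows) {
    return std::distance(rows.begin(), rows.end());
}
```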

View File

@@ -28,7 +28,7 @@
// automatically when tmpdir object goes out of scope.
struct tmpdir {
tmpdir() {
char tmp[] = "tmpdir_XXXXXX";
char tmp[] = "/tmp/tmpdir_XXXXXX";
auto * dir = ::mkdtemp(tmp);
if (dir == NULL) {
throw std::runtime_error("Could not create temp dir");
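The fix above prefixes the template with `/tmp/` so the directory no longer lands in the current working directory. A minimal standalone sketch of the same pattern: `mkdtemp()` rewrites the trailing `XXXXXX` in place, so the template must be a writable char array, not a string literal.

```cpp
#include <stdexcept>
#include <stdlib.h>
#include <string>
#include <unistd.h>

// Create a uniquely named directory under /tmp and return its path.
// mkdtemp() replaces the XXXXXX suffix with a unique name in place.
std::string make_tmpdir() {
    char tmpl[] = "/tmp/tmpdir_XXXXXX";
    if (::mkdtemp(tmpl) == nullptr) {
        throw std::runtime_error("Could not create temp dir");
    }
    return tmpl;
}
```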

View File

@@ -324,7 +324,13 @@ public:
auto cmd = slice_pred_to_read_cmd(*schema, predicate);
// KeyRange::count is the number of thrift rows to return, while
// SlicePredicate::slice_range::count limits the number of thrift columns.
cmd->partition_limit = range.count;
if (schema->thrift().is_dynamic()) {
// For dynamic CFs we must limit the number of partitions returned.
cmd->partition_limit = range.count;
} else {
// For static CFs each thrift row maps to a CQL row.
cmd->row_limit = range.count;
}
auto f = _query_state.get_client_state().has_schema_access(*schema, auth::permission::SELECT);
return f.then([schema, cmd, prange = std::move(prange), consistency_level] () mutable {
return service::get_local_storage_proxy().query(
@@ -345,7 +351,6 @@ public:
auto opts = query_opts(s);
std::vector<query::clustering_range> clustering_ranges;
std::vector<column_id> regular_columns;
std::vector<column_id> static_columns;
uint32_t row_limit;
uint32_t partition_limit;
std::unique_ptr<query::specific_ranges> specific_ranges = nullptr;
@@ -365,14 +370,14 @@ public:
// we ask for as many partitions as those that are capable of exhausting the limit and later filter out
// any excess cells.
row_limit = query::max_rows;
partition_limit = (column_limit + s.static_columns_count() - 1) / s.static_columns_count();
partition_limit = (column_limit + s.regular_columns_count() - 1) / s.regular_columns_count();
schema::const_iterator start_col = start_column
? s.static_lower_bound(to_bytes(*start_column))
: s.static_begin();
static_columns = add_columns(start_col, s.static_end(), false);
? s.regular_lower_bound(to_bytes(*start_column))
: s.regular_begin();
regular_columns = add_columns(start_col, s.regular_end(), false);
}
clustering_ranges.emplace_back(query::clustering_range::make_open_ended_both_sides());
auto slice = query::partition_slice(std::move(clustering_ranges), std::move(static_columns), std::move(regular_columns), opts,
auto slice = query::partition_slice(std::move(clustering_ranges), { }, std::move(regular_columns), opts,
std::move(specific_ranges), cql_serialization_format::internal());
return make_lw_shared<query::read_command>(s.id(), s.version(), std::move(slice), row_limit, gc_clock::now(), stdx::nullopt, partition_limit);
}
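The `partition_limit` arithmetic in the hunk above is integer ceiling division: fetch enough partitions that, at `regular_columns_count()` cells each, the column limit can be exhausted. A minimal sketch with hypothetical names:

```cpp
#include <cassert>
#include <cstdint>

// Ceiling division without floating point: the number of partitions
// needed to yield column_limit cells when each partition contributes
// at most per_partition cells. Assumes per_partition > 0.
uint32_t partitions_needed(uint32_t column_limit, uint32_t per_partition) {
    return (column_limit + per_partition - 1) / per_partition;
}
```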
@@ -599,7 +604,6 @@ public:
auto pk = key_from_thrift(s, to_bytes(request.key));
auto dk = dht::global_partitioner().decorate_key(s, pk);
std::vector<column_id> regular_columns;
std::vector<column_id> static_columns;
std::vector<query::clustering_range> clustering_ranges;
auto opts = query_opts(s);
uint32_t row_limit;
@@ -624,10 +628,9 @@ public:
return make_range(cslice.start, cslice.finish);
}, cmp, [&](auto& range) { return range.is_wrap_around(cmp); }, request.reversed);
auto on_range = [&](auto&& range) {
auto start = range.start() ? s.static_lower_bound(range.start()->value()) : s.static_begin();
auto end = range.end() ? s.static_upper_bound(range.end()->value()) : s.static_end();
auto cols = add_columns(start, end, request.reversed);
std::move(cols.begin(), cols.end(), std::back_inserter(static_columns));
auto start = range.start() ? s.regular_lower_bound(range.start()->value()) : s.regular_begin();
auto end = range.end() ? s.regular_upper_bound(range.end()->value()) : s.regular_end();
regular_columns = add_columns(start, end, request.reversed);
};
if (request.reversed) {
std::for_each(ranges.rbegin(), ranges.rend(), on_range);
@@ -635,7 +638,7 @@ public:
std::for_each(ranges.begin(), ranges.end(), on_range);
}
}
auto slice = query::partition_slice(std::move(clustering_ranges), std::move(static_columns), std::move(regular_columns), opts, nullptr);
auto slice = query::partition_slice(std::move(clustering_ranges), {}, std::move(regular_columns), opts, nullptr);
auto cmd = make_lw_shared<query::read_command>(schema->id(), schema->version(), std::move(slice), row_limit);
auto f = _query_state.get_client_state().has_schema_access(*schema, auth::permission::SELECT);
return f.then([dk = std::move(dk), cmd, schema, column_limit = request.count, cl = request.consistency_level] {
@@ -894,7 +897,7 @@ public:
}
auto s = schema_from_thrift(cf_def, cf_def.keyspace, schema->id());
if (schema->thrift().is_dynamic() && !s->thrift().is_dynamic()) {
if (schema->thrift().is_dynamic() != s->thrift().is_dynamic()) {
fail(unimplemented::cause::MIXED_CF);
}
return _query_state.get_client_state().has_schema_access(*schema, auth::permission::ALTER).then([this, s = std::move(s)] {
@@ -1120,16 +1123,12 @@ private:
cf_def.__set_keyspace(s->ks_name());
cf_def.__set_name(s->cf_name());
cf_def.__set_column_type(cf_type_to_sstring(s->type()));
if (s->clustering_key_size()) {
cf_def.__set_comparator_type(class_from_compound_type(*s->clustering_key_type()));
} else {
cf_def.__set_comparator_type(s->regular_column_name_type()->name());
}
cf_def.__set_comparator_type(cell_comparator::to_sstring(*s));
cf_def.__set_comment(s->comment());
cf_def.__set_read_repair_chance(s->read_repair_chance());
std::vector<ColumnDef> columns;
if (!s->thrift().is_dynamic()) {
for (auto&& c : s->static_columns()) {
for (auto&& c : s->regular_columns()) {
ColumnDef c_def;
c_def.__set_name(c.name_as_text());
c_def.__set_validation_class(c.type->name());
@@ -1201,41 +1200,37 @@ private:
builder.with_column(to_bytes(names.partition_key_name()), bytes_type, column_kind::partition_key);
}
data_type regular_column_name_type;
auto default_validator = cf_def.__isset.default_validation_class
? db::marshal::type_parser::parse(to_sstring(cf_def.default_validation_class))
: bytes_type;
if (cf_def.column_metadata.empty()) {
// Dynamic CF
builder.set_is_dense(true);
regular_column_name_type = utf8_type;
auto p = get_types(cf_def.comparator_type);
auto ck_types = std::move(p.first);
builder.set_is_compound(p.second);
for (uint32_t i = 0; i < ck_types.size(); ++i) {
builder.with_column(to_bytes(names.clustering_name()), std::move(ck_types[i]), column_kind::clustering_key);
}
auto&& vtype = cf_def.__isset.default_validation_class
? db::marshal::type_parser::parse(to_sstring(cf_def.default_validation_class))
: bytes_type;
builder.with_column(to_bytes(names.compact_value_name()), std::move(vtype));
builder.with_column(to_bytes(names.compact_value_name()), default_validator);
} else {
// Static CF
builder.set_is_compound(false);
regular_column_name_type = db::marshal::type_parser::parse(to_sstring(cf_def.comparator_type));
auto column_name_type = db::marshal::type_parser::parse(to_sstring(cf_def.comparator_type));
for (const ColumnDef& col_def : cf_def.column_metadata) {
auto col_name = to_bytes(col_def.name);
regular_column_name_type->validate(col_name);
column_name_type->validate(col_name);
builder.with_column(std::move(col_name), db::marshal::type_parser::parse(to_sstring(col_def.validation_class)),
column_kind::static_column);
column_kind::regular_column);
auto index = index_metadata_from_thrift(col_def);
if (index) {
builder.with_index(index.value());
}
}
// CMH composite? Origin seemingly allows composite comparator_type.
builder.with_column(to_bytes(names.clustering_name()), regular_column_name_type, column_kind::clustering_key);
builder.with_column(to_bytes(names.compact_value_name()), db::marshal::type_parser::parse(to_sstring(cf_def.default_validation_class)));
builder.set_regular_column_name_type(column_name_type);
}
builder.set_regular_column_name_type(regular_column_name_type);
builder.set_default_validation_class(default_validator);
if (cf_def.__isset.comment) {
builder.set_comment(cf_def.comment);
}
@@ -1395,8 +1390,8 @@ private:
return { std::move(start_bound), std::move(end_bound) };
}
static std::pair<schema::const_iterator, schema::const_iterator> make_column_range(const schema& s, const std::string& start, const std::string& end) {
auto start_it = start.empty() ? s.static_begin() : s.static_lower_bound(to_bytes(start));
auto end_it = end.empty() ? s.static_end() : s.static_upper_bound(to_bytes(end));
auto start_it = start.empty() ? s.regular_begin() : s.regular_lower_bound(to_bytes(start));
auto end_it = end.empty() ? s.regular_end() : s.regular_upper_bound(to_bytes(end));
if (start_it > end_it) {
throw make_exception<InvalidRequestException>("Range finish must come after start in the order of traversal");
}
@@ -1428,7 +1423,6 @@ private:
auto opts = query_opts(s);
std::vector<query::clustering_range> clustering_ranges;
std::vector<column_id> regular_columns;
std::vector<column_id> static_columns;
uint32_t per_partition_row_limit = query::max_rows;
if (predicate.__isset.column_names) {
thrift_validation::validate_column_names(predicate.column_names);
@@ -1443,9 +1437,9 @@ private:
clustering_ranges.emplace_back(query::clustering_range::make_open_ended_both_sides());
auto&& defs = unique_column_names
| boost::adaptors::transformed([&s](auto&& name) { return s.get_column_definition(to_bytes(name)); })
| boost::adaptors::filtered([](auto* def) { return def && def->is_static(); })
| boost::adaptors::filtered([](auto* def) { return def; })
| boost::adaptors::indirected;
static_columns = add_columns(defs.begin(), defs.end(), false);
regular_columns = add_columns(defs.begin(), defs.end(), false);
}
} else if (predicate.__isset.slice_range) {
auto range = predicate.slice_range;
@@ -1465,12 +1459,12 @@ private:
auto r = make_column_range(s, range.start, range.finish);
// For static CFs, the limit is enforced on the result as we do not implement
// a cell limit in the database engine.
static_columns = add_columns(r.first, r.second, range.reversed);
regular_columns = add_columns(r.first, r.second, range.reversed);
}
} else {
throw make_exception<InvalidRequestException>("SlicePredicate column_names and slice_range may not both be null");
}
auto slice = query::partition_slice(std::move(clustering_ranges), std::move(static_columns), std::move(regular_columns), opts,
auto slice = query::partition_slice(std::move(clustering_ranges), {}, std::move(regular_columns), opts,
nullptr, cql_serialization_format::internal(), per_partition_row_limit);
return make_lw_shared<query::read_command>(s.id(), s.version(), std::move(slice));
}
@@ -1565,33 +1559,26 @@ private:
abort();
}
void accept_new_row(const clustering_key_prefix& key, const query::result_row_view& static_row, const query::result_row_view& row) {
std::cout << "accept new row\n";
auto it = row.iterator();
auto cell = it.next_atomic_cell();
if (cell && _current_cell_limit > 0) {
std::cout << "has normal cell?!\n";
bytes column_name = composite::serialize_value(key.components(), _s.thrift().has_compound_comparator()).release_bytes();
Aggregator::on_column(_current_aggregation, column_name, *cell);
_current_cell_limit -= 1;
}
accept_partition_end(static_row);
}
void accept_new_row(const query::result_row_view& static_row, const query::result_row_view& row) {
std::cout << "accept new row static\n";
accept_partition_end(static_row);
}
void accept_partition_end(const query::result_row_view& static_row) {
std::cout << "accept row partition end\n";
auto it = static_row.iterator();
for (auto&& id : _slice.static_columns) {
std::cout << "going over the static cols\n";
auto it = row.iterator();
for (auto&& id : _slice.regular_columns) {
auto cell = it.next_atomic_cell();
if (cell && _current_cell_limit > 0) {
Aggregator::on_column(_current_aggregation, _s.static_column_at(id).name(), *cell);
Aggregator::on_column(_current_aggregation, _s.regular_column_at(id).name(), *cell);
_current_cell_limit -= 1;
}
}
}
void accept_partition_end(const query::result_row_view& static_row) {
}
};
struct column_or_supercolumn_builder {
using type = std::vector<ColumnOrSuperColumn>;
@@ -1740,7 +1727,7 @@ private:
static void delete_cell(const column_definition& def, api::timestamp_type timestamp, gc_clock::time_point deletion_time, mutation& m_to_apply) {
if (def.is_atomic()) {
auto dead_cell = atomic_cell::make_dead(timestamp, deletion_time);
m_to_apply.set_static_cell(def, std::move(dead_cell));
m_to_apply.set_clustered_cell(clustering_key_prefix::make_empty(), def, std::move(dead_cell));
}
}
static void delete_column(const schema& s, const sstring& column_name, api::timestamp_type timestamp, gc_clock::time_point deletion_time, mutation& m_to_apply) {
@@ -1777,21 +1764,11 @@ private:
throw make_exception<InvalidRequestException>("SlicePredicate column_names and slice_range may not both be null");
}
}
static void add_live_cell(const schema& s, const Column& col, const column_definition& def, mutation& m_to_apply) {
thrift_validation::validate_column(col, def);
auto cell = atomic_cell::make_live(col.timestamp, to_bytes_view(col.value), maybe_ttl(s, col));
m_to_apply.set_static_cell(def, std::move(cell));
}
static void add_live_cell(const schema& s, const Column& col, const column_definition& def, clustering_key_prefix ckey, mutation& m_to_apply) {
thrift_validation::validate_column(col, def);
auto cell = atomic_cell::make_live(col.timestamp, to_bytes_view(col.value), maybe_ttl(s, col));
m_to_apply.set_clustered_cell(std::move(ckey), def, std::move(cell));
}
static void add_live_cell(const schema& s, const CounterColumn& col, const column_definition& def, mutation& m_to_apply) {
//thrift_validation::validate_column(col, def);
auto cell = atomic_cell::make_live_counter_update(api::new_timestamp(), col.value);
m_to_apply.set_static_cell(def, std::move(cell));
}
static void add_live_cell(const schema& s, const CounterColumn& col, const column_definition& def, clustering_key_prefix ckey, mutation& m_to_apply) {
//thrift_validation::validate_column(col, def);
auto cell = atomic_cell::make_live_counter_update(api::new_timestamp(), col.value);
@@ -1805,10 +1782,10 @@ private:
} else {
auto def = s.get_column_definition(to_bytes(col.name));
if (def) {
if (def->kind != column_kind::static_column) {
if (def->kind != column_kind::regular_column) {
throw make_exception<InvalidRequestException>("Column %s is not settable", col.name);
}
add_live_cell(s, col, *def, m_to_apply);
add_live_cell(s, col, *def, clustering_key_prefix::make_empty(s), m_to_apply);
} else {
fail(unimplemented::cause::MIXED_CF);
}
@@ -1822,10 +1799,10 @@ private:
} else {
auto def = s.get_column_definition(to_bytes(col.name));
if (def) {
if (def->kind != column_kind::static_column) {
if (def->kind != column_kind::regular_column) {
throw make_exception<InvalidRequestException>("Column %s is not settable", col.name);
}
add_live_cell(s, col, *def, m_to_apply);
add_live_cell(s, col, *def, clustering_key_prefix::make_empty(s), m_to_apply);
} else {
fail(unimplemented::cause::MIXED_CF);
}

View File

@@ -68,7 +68,7 @@ void validate_keyspace_not_system(const std::string& keyspace) {
std::string name;
name.resize(keyspace.length());
std::transform(keyspace.begin(), keyspace.end(), name.begin(), ::tolower);
if (name == db::system_keyspace::NAME) {
if (is_system_keyspace(name)) {
throw make_exception<InvalidRequestException>("system keyspace is not user-modifiable");
}
}
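The hunk above lowercases the keyspace name before the system-keyspace check so the comparison is case-insensitive. A standalone sketch of the same pattern (the real `is_system_keyspace()` may accept several names; here it is assumed to match only `"system"`):

```cpp
#include <algorithm>
#include <cctype>
#include <string>

// Lowercase the name, then compare; rejects "System", "SYSTEM", etc.
bool is_system_keyspace_name(std::string name) {
    std::transform(name.begin(), name.end(), name.begin(),
                   [](unsigned char c) { return std::tolower(c); });
    return name == "system";
}
```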
