Compare commits

...

178 Commits

Author SHA1 Message Date
Pekka Enberg
cc3a4173f6 release: prepare for 1.3.3 2016-10-28 09:54:41 +03:00
Pekka Enberg
4dc196164d auth: Fix resource level handling
We use the `data_resource` class in the CQL parser, which lets users refer
to a table resource without specifying a keyspace. This asserts out in
get_level() for no good reason, as we already know the intended level
based on the constructor. Therefore, change `data_resource` to track the
level like upstream Cassandra does and use that.

Fixes #1790

Message-Id: <1477599169-2945-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit b54870764f)
2016-10-27 23:38:01 +03:00
Glauber Costa
31ba6325ef auth: always convert string to upper case before comparing
We store all auth perm strings in upper case, but the user might very
well pass them in lower case.

We could use a standard key comparator / hash here, but since the
strings tend to be small, the new sstring will likely be allocated on
the stack here, and this approach yields significantly less code.

Fixes #1791.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <51df92451e6e0a6325a005c19c95eaa55270da61.1477594199.git.glauber@scylladb.com>
(cherry picked from commit ef3c7ab38e)
2016-10-27 22:10:04 +03:00
Tomasz Grabiec
dcd8b87eb8 Merge seastar upstream
* seastar b62d7a5...0fd8792 (1):
  > rpc: Do not close client connection on error response for a timed out request

Refs #1778
2016-10-25 13:56:36 +02:00
Tomasz Grabiec
6b53aad9fb partition_version: Fix corruption of partition_version list
The move constructor of partition_version was not invoking move
constructor of anchorless_list_base_hook. As a result, when
partition_version objects were moved, e.g. during LSA compaction, they
were unlinked from their lists.

This can make readers return invalid data, because not all versions
will be reachable.

It also causes leaks of the versions which are not directly attached
to a memtable entry. This will trigger an assertion failure in the LSA region
destructor. This assertion triggers with the row cache disabled. With cache
enabled (default) all segments are merged into the cache region, which
currently is not destroyed on shutdown, so this problem would go
unnoticed. With cache disabled, memtable region is destroyed after
memtable is flushed and after all readers stop using that memtable.

Fixes #1753.
Message-Id: <1476778472-5711-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit fe387f8ba0)
2016-10-18 11:00:19 +02:00
Pekka Enberg
762b156809 database: Fix io_priority_class related compilation error
Commit e6ef49e ("db: Do not timeout streaming readers") breaks compilation of database.cc:

  database.cc: In lambda function:
  database.cc:282:62: error: ‘const class io_priority_class’ has no member named ‘id’
               if (service::get_local_streaming_read_priority().id() == pc.id()) {
                                                              ^~
  database.cc:282:73: error: ‘const class io_priority_class’ has no member named ‘id’
               if (service::get_local_streaming_read_priority().id() == pc.id()) {

...because we don't have Seastar commit 823a404 ("io_priority_class:
remove non-explicit operator unsigned") backported.

Fix the issue by using the non-explicit operator instead of explicit id().

Acked-by: Tomasz Grabiec <tgrabiec@scylladb.com>
Message-Id: <1476425276-17171-1-git-send-email-penberg@scylladb.com>
2016-10-14 13:28:32 +03:00
Pekka Enberg
f3ed1b4763 release: prepare for 1.3.2 2016-10-13 20:34:46 +03:00
Paweł Dziepak
6618236fff query_pagers: fix clustering key range calculation
Paging code assumes that the clustering row range [a, a] contains only one
row, which may not be true. Another problem is that it tries to use the
range<> interface for dealing with clustering key ranges, which doesn't
work because of the lack of a correct comparator.

Refs #1446.
Fixes #1684.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1475236805-16223-1-git-send-email-pdziepak@scylladb.com>
(cherry picked from commit eb1fcf3ecc)
2016-10-10 16:08:09 +03:00
Asias He
ecb0e44480 gossip: Switch to use system_clock
The expire time which is used to decide when to remove a node from
gossip membership is gossiped around the cluster. We switched to steady
clock in the past. In order to have a consistent time_point in all the
nodes in the cluster, we have to use wall clock. Switch to use
system_clock for gossip.

Fixes #1704

(cherry picked from commit f0d3084c8b)
2016-10-09 18:12:46 +03:00
Tomasz Grabiec
e6ef49e366 db: Do not timeout streaming readers
There is a limit to concurrency of sstable readers on each shard. When
this limit is exhausted (currently 100 readers) readers queue. There
is a timeout after which queued readers are failed, equal to
read_request_timeout_in_ms (5s by default). The reason we have the
timeout here is primarily because the readers created for the purpose
of serving a CQL request no longer need to execute after waiting
longer than read_request_timeout_in_ms. The coordinator no longer
waits for the result so there is no point in proceeding with the read.

This timeout should not apply for readers created for streaming. The
streaming client currently times out after 10 minutes, so we could
wait at least that long. Timing out sooner makes streaming unreliable,
which under high load may prevent streaming from completing.

The change sets no timeout for streaming readers at replica level,
similarly as we do for system tables readers.

Fixes #1741.

Message-Id: <1475840678-25606-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 2a5a90f391)
2016-10-09 10:33:39 +03:00
Tomasz Grabiec
7d24a3ed56 transport: Extend request memory footprint accounting to also cover execution
The CQL server is supposed to throttle requests so that they don't
overflow memory. The problem is that it currently accounts for a
request's memory only around the reading of its frame from the connection,
not during actual request execution. As a result, too many requests may be
allowed to execute and we may run out of memory.

Fixes #1708.
Message-Id: <1475149302-11517-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 7e25b958ac)
2016-10-03 14:15:21 +03:00
Avi Kivity
ae2a1158d7 Update seastar submodule
* seastar 9b541ef...b62d7a5 (1):
  > semaphore: Introduce get_units()
2016-10-03 14:11:26 +03:00
Asias He
f110c2456a gossip: Do not remove failure_detector history on remove_endpoint
Otherwise a node could wrongly think the decommissioned node is still
alive and not evict it from the gossip membership.

Backport: CASSANDRA-10371

7877d6f Don't remove FailureDetector history on removeEndpoint

Fixes #1714
Message-Id: <f7f6f1eec2aab1b97a2e568acfd756cca7fc463a.1475112303.git.asias@scylladb.com>

(cherry picked from commit 511f8aeb91)
2016-09-29 13:27:34 +03:00
Raphael S. Carvalho
37d27b1144 api: implement api to return sstable count per level
'nodetool cfstats' wasn't showing per-level sstable count because
the API wasn't implemented.

Fixes #1119.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <0dcdf9196eaec1692003fcc8ef18c77d0834b2c6.1474410770.git.raphaelsc@scylladb.com>
(cherry picked from commit 67343798cf)
2016-09-26 09:51:33 +03:00
Tomasz Grabiec
ff674391c2 Merge seastar upstream
Refs #1622
Refs #1690

* seastar b58a287...9b541ef (2):
  > input_stream: Fix possible infinite recursion in consume()
  > iostream: Fix stack overflow in output_stream::split_and_put()
2016-09-22 14:58:44 +02:00
Asias He
55c9279354 gossip: Fix std::out_of_range in setup_collectd
It is possible that endpoint_state_map does not contain the entry for
the node itself when collectd accesses it.

Fixes the issue:

Sep 18 11:33:16 XXX scylla[19483]: [shard 0] seastar - Exceptional
future ignored: std::out_of_range (_Map_base::at)

Fixes #1656

Message-Id: <8ffe22a542ff71e8c121b06ad62f94db54cc388f.1474377722.git.asias@scylladb.com>
(cherry picked from commit aa47265381)
2016-09-20 21:10:55 +03:00
Tomasz Grabiec
ef7b4c61ff tests: Add test for UUID type ordering
Message-Id: <1473956716-5209-2-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 2282599394)
2016-09-20 12:22:03 +02:00
Tomasz Grabiec
b9e169ead9 types: fix uuid_type_impl::less
timeuuid_type_impl::compare_bytes is a "trichotomic" comparator (-1,
0, 1) while less() is a "less" comparator (false, true). The code
incorrectly returns c1 instead of c1 < 0 which breaks the ordering.

Fixes #1196.
Message-Id: <1473956716-5209-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 804fe50b7f)
2016-09-20 12:22:00 +02:00
Shlomi Livne
dabbadcf39 release: prepare for 1.3.1
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2016-09-18 13:18:20 +03:00
Duarte Nunes
c9cb14e160 thrift: Correctly detect clustering range wrap around
This patch uses the clustering bounds comparator to correctly detect
wrap around of a clustering range in the thrift handler.

Refs #1446

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1473952155-14886-1-git-send-email-duarte@scylladb.com>
(cherry picked from commit bc3cbb7009)
2016-09-15 16:52:43 +01:00
Shlomi Livne
985352298d ami: Fix instructions how to run scylla_io_setup on non ephemeral instances
On instances different from i2/m3/c3 we provide instructions to run
scylla_io_setup. Running scylla_io_setup requires access to
/var/lib/scylla to create a temporary file. To gain access to that
directory the user should run 'sudo scylla_io_setup'.

refs: #1645

Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
Message-Id: <4ce90ca1ba4da8f07cf8aa15e755675463a22933.1473935778.git.shlomi@scylladb.com>
(cherry picked from commit acb83073e2)
2016-09-15 15:27:02 +01:00
Paweł Dziepak
c2d347efc7 Merge "Fix abort when querying with contradicting clustering restrictions" from Tomek
"This series backports fixes for #1670 on top of 1.3 branch.

Fixes abort when querying with contradicting clustering column
restrictions, for example:

   SELECT * FROM test WHERE k = 0 AND ck < 1 and ck > 2"
2016-09-15 14:55:28 +01:00
Tomasz Grabiec
78c7408927 Fix abort when querying with contradicting clustering restrictions
Example of affected query:

  SELECT * FROM test WHERE k = 0 AND ck < 1 and ck > 2

Refs #1670.

This commit brings back the backport of "Don't allow CK wrapping
ranges" by Duarte by reverting commit 11d7f83d52.

It also has the following fix, which is introduced by the
aforementioned commit, squashed to improve bisectability:

"cql3: Consider bound type when detecting wrap around

 This patch uses the clustering bounds comparator to correctly detect
 wrap around of a clustering range. This fixes a manifestation of #1446,
 introduced by b1f9688432, where a query
 such as select * from cf where k = 0x00 and c0 = 0x02 and c1 > 0x02
 would result in a range containing a clustering key and a prefix,
 incorrectly ordered by the prefix equality or lexicographical
 comparators.

 Refs #1446

 Signed-off-by: Duarte Nunes <duarte@scylladb.com>
 (cherry picked from commit ee2694e27d)"
2016-09-14 19:50:49 +02:00
Duarte Nunes
5716decf60 bounds_view: Create from nonwrapping_range
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
(cherry picked from commit 084b931457)
2016-09-14 18:28:55 +02:00
Duarte Nunes
f60eb3958a range_tombstone: Extract out bounds_view
This patch extracts bounds_view from range_tombstone so its comparator
can be reused elsewhere.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
(cherry picked from commit 878927d9d2)
2016-09-14 18:28:55 +02:00
Tomasz Grabiec
7848781e5f database: Ignore spaces in initial_token list
Currently we get a boost::lexical_cast error on startup if initial_token
has a list which contains spaces after commas, e.g.:

  initial_token: -1100081313741479381, -1104041856484663086, ...

Fixes #1664.
Message-Id: <1473840915-5682-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit a498da1987)
2016-09-14 12:03:06 +03:00
Pekka Enberg
195994ea4b Update scylla-ami submodule
* dist/ami/files/scylla-ami 14c1666...e1e3919 (1):
  > scylla_ami_setup: remove scylla_cpuset_setup
2016-09-07 21:05:33 +03:00
Takuya ASADA
183910b8b4 dist/common/scripts/scylla_sysconfig_setup: sync cpuset parameters with rps_cpus settings when posix_net_conf.sh is enabled and NIC is single queue
In posix_net_conf.sh's single-queue NIC mode (which means RPS-enabled mode), we exclude cpu0 and its sibling from the network stack processing cpus, and assign the NIC IRQ to cpu0.
So the network stack never runs on cpu0 and its sibling; to get better performance we need to exclude these cpus from scylla too.
To do this, we need to get the RPS cpu mask from posix_net_conf.sh and pass it to scylla_cpuset_setup to construct /etc/scylla.d/cpuset.conf when scylla_setup is executed.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1472544875-2033-2-git-send-email-syuu@scylladb.com>
(cherry picked from commit 533dc0485d)
2016-09-07 21:00:30 +03:00
Takuya ASADA
f4746d2a46 dist/common/scripts/scylla_prepare: drop unnecessary multiqueue NIC detection code on scylla_prepare
Right now scylla_prepare passes the -mq option to posix_net_conf.sh when the number of RX queues > 1, but posix_net_conf.sh sets the NIC mode to sq when queues < ncpus / 2.
So the logic differs; moreover, posix_net_conf.sh no longer needs -sq/-mq specified, since it autodetects the queue mode.
So drop the detection logic from scylla_prepare and let posix_net_conf.sh detect it.

Fixes #1406

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1472544875-2033-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 0c3bb2ee63)
2016-09-07 20:59:10 +03:00
Pekka Enberg
fa7f990407 Update seastar submodule
* seastar e6571c4...b58a287 (3):
  > scripts/posix_net_conf.sh: supress 'ls: cannot access
  > /sys/class/net/<NIC>/device/msi_irqs/' error message
  > scripts/posix_net_conf.sh: fix 'command not found' error when
  > specifies --cpu-mask
  > scripts/posix_net_conf.sh: add support --cpu-mask mode
2016-09-07 20:58:09 +03:00
Pekka Enberg
86dbcf093b systemd: Don't start Scylla service until network is up
Alexandr Porunov reports that Scylla fails to start up after reboot as follows:

  Aug 25 19:44:51 scylla1 scylla[637]: Exiting on unhandled exception of type 'std::system_error': Error system:99 (Cannot assign requested address)

The problem is that because there's no dependency to network service,
Scylla simply attempts to start up too soon in the boot sequence and
fails.

Fixes #1618.

Message-Id: <1472212447-21445-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit 2d3aee73a6)
2016-08-29 13:26:11 +03:00
Takuya ASADA
db89811fcc dist/common/scripts/scylla_setup: support enabling services on Ubuntu 15.10/16.04
Right now it ignores Ubuntu, but we share the .service files between Fedora/CentOS and Ubuntu >= 15.10, so support it.

Fixes #1556.

Message-Id: <1471932814-17347-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 74d994f6a1)
2016-08-29 13:26:08 +03:00
Duarte Nunes
9ec939f6a3 thrift: Avoid always recording size estimates
Size estimates for a particular column family are recorded every 5
minutes. However, when a user calls the describe_splits(_ex) verbs,
they may want to see estimates for a recently created and updated
column family; this is legitimate and common in testing. However, a
client may also call describe_splits(_ex) very frequently and
recording the estimates on every call is wasteful and, worse, can
cause clients to give up. This patch fixes this by only recording
estimates if the first attempt to query them produces no results.

Refs #1139

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1471900595-4715-1-git-send-email-duarte@scylladb.com>
(cherry picked from commit 440c1b2189)
2016-08-29 12:08:53 +03:00
Pekka Enberg
d40f586839 dist/docker: Clean up Scylla description for Docker image
Message-Id: <1472145307-3399-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit c5e5e7bb40)
2016-08-29 10:49:09 +03:00
Raphael S. Carvalho
d55c55efec api: use estimation of pending tasks in compaction manager too
We have APIs for getting pending compaction tasks in both the column
family and the compaction manager. The column family one is already
returning pending tasks properly.
The compaction manager's, used by 'nodetool compactionstats', was
returning a value which doesn't reflect pending compactions.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <a20b88938ad39e95f98bfd7f93e4d1666d1c6f95.1471641211.git.raphaelsc@scylladb.com>
(cherry picked from commit d8be32d93a)
2016-08-29 10:20:20 +03:00
Raphael S. Carvalho
dc79761c17 sstables: Fix estimation of pending tasks for leveled strategy
There were two underflow bugs.

1) in variable i, causing get_level() to see an invalid level and
throw an exception as a result.
2) when estimating number of pending tasks for a level.

Fixes #1603.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <cce993863d9de4d1f49b3aabe981c475700595fc.1471636164.git.raphaelsc@scylladb.com>
(cherry picked from commit 77d4cd21d7)
2016-08-29 10:19:30 +03:00
Paweł Dziepak
bbffb51811 mutation_partition: fix iterator invalidation in trim_rows
Reversed iterators are adaptors for 'normal' iterators. These underlying
iterators point to different objects than the reversed iterators
themselves.

The consequence of this is that removing an element pointed to by a
reversed iterator may invalidate reversed iterator which point to a
completely different object.

This is what happens in trim_rows for reversed queries. Erasing a row
can invalidate end iterator and the loop would fail to stop.

The solution is to introduce
reversal_traits::erase_dispose_and_update_end(), a function which erases
and disposes of the object pointed to by a given iterator but also takes
a reference to an end iterator and updates it if necessary to make sure
that it stays valid.

Fixes #1609.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1472080609-11642-1-git-send-email-pdziepak@scylladb.com>
(cherry picked from commit 6012a7e733)
2016-08-25 17:40:37 +03:00
Amnon Heiman
ec3ace5aa3 housekeeping: Silently ignore check version if Scylla is not available
Normally, the check version should start and stop with the scylla-server
service.

If it fails to find scylla server, there is no need to check the
version, nor to report it, so it can stop silently.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
(cherry picked from commit 2b98335da4)
2016-08-23 18:11:44 +03:00
Amnon Heiman
1f2d1012be housekeeping: Use curl instead of Python's libraries
There is a problem with Python SSL's in Ubuntu 14.04:

  ubuntu@ip-10-81-165-156:~$ /usr/lib/scylla/scylla-housekeeping -q version
  Traceback (most recent call last):
    File "/usr/lib/scylla/scylla-housekeeping", line 94, in <module>
      args.func(args)
    File "/usr/lib/scylla/scylla-housekeeping", line 71, in check_version
      latest_version = get_json_from_url(version_url + "?version=" + current_version)["version"]
    File "/usr/lib/scylla/scylla-housekeeping", line 50, in get_json_from_url
      response = urllib2.urlopen(req)
    File "/usr/lib/python2.7/urllib2.py", line 127, in urlopen
      return _opener.open(url, data, timeout)
    File "/usr/lib/python2.7/urllib2.py", line 404, in open
      response = self._open(req, data)
    File "/usr/lib/python2.7/urllib2.py", line 422, in _open
      '_open', req)
    File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
      result = func(*args)
    File "/usr/lib/python2.7/urllib2.py", line 1222, in https_open
      return self.do_open(httplib.HTTPSConnection, req)
    File "/usr/lib/python2.7/urllib2.py", line 1184, in do_open
      raise URLError(err)
  urllib2.URLError: <urlopen error [Errno 1] _ssl.c:510: error:14077410:SSL routines:SSL23_GET_SERVER_HELLO:sslv3 alert handshake failure>

Instead of using Python libraries to connect to the check version
server, we will use curl for that.

Fixes #1600

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
(cherry picked from commit 4598674673)
2016-08-23 18:11:38 +03:00
Amnon Heiman
9c8cfd3c0e housekeeping: Add curl as a dependency
To work around an SSL problem with Python on Ubuntu 14.04, we need to
use curl. Add it as a dependency so that it's available on the host.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
(cherry picked from commit 91944b736e)
2016-08-23 18:11:13 +03:00
Takuya ASADA
d9e4ab38f6 dist/ubuntu: support scylla-housekeeping service on all Ubuntu versions
Current scylla-housekeeping support on Ubuntu has a bug: it does not install the .service/.timer files on Ubuntu 16.04.
So fix it to make it work.

Fixes #1502

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Tested-by: Amos Kong <amos@scylladb.com>
Message-Id: <1471607903-14889-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 80f7449095)
2016-08-23 13:50:43 +03:00
Takuya ASADA
5d72e96ccc dist/common/systemd: don't use .in for scylla-housekeeping.*, since these are not template file
.in is the suffix for template files which need to be rewritten at build time, but these systemd unit files require no rewriting, so drop the .in suffix and reference them directly from the .spec.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1471607533-3821-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit aac60082ae)
2016-08-23 13:50:36 +03:00
Paweł Dziepak
a072df0e09 sstables: do not call consume_end_partition() after proceed::no
After state_processor().process_state() returns proceed::no the upper
layer should have a chance to act before more data is pushed to the
consumer. This means that in case of proceed::no verify_end_state()
should not be called immediately since it may invoke
consume_end_partition().

Fixes #1605.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1471943032-7290-1-git-send-email-pdziepak@scylladb.com>
(cherry picked from commit 5feed84e32)
2016-08-23 12:24:59 +03:00
Pekka Enberg
8291ec13fa dist/docker: Separate supervisord config files
Move scylla-server and scylla-jmx supervisord config files to separate
files and make the main supervisord.conf scan /etc/supervisord.conf.d/
directory. This makes it easier for people to extend the Docker image
and add their own services.

Message-Id: <1471588406-25444-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit 9d1d8baf37)
2016-08-23 11:56:24 +03:00
Vlad Zolotarov
c485551488 tracing::trace_state: fix a compilation error with gcc 4.9
See #1602.

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
Message-Id: <1471774784-26266-1-git-send-email-vladz@cloudius-systems.com>
2016-08-21 16:39:50 +03:00
Paweł Dziepak
b85164bc1d sstables: optimise clustering rows filtering
Clustering rows in the sstables are sorted in ascending order, so we
can use that to minimise the number of comparisons when checking whether
a row is in the requested range.

Refs #1544.

Paweł further explains the backport rationale for 1.3:

"Apart from making sense on its own, this patch has a very curious
property
of working around #1544 in a way that doesn't make #1446 hit us harder
than
usual.
So, in the branch-1.3 we can:
 - revert 85376ce555
   'Merge "Don't allow CK wrapping ranges" from Duarte' -- previous,
   insufficient workaround for #1544
 - apply this patch
 - rejoice as cql_query_test passes and #1544 is no longer a problem

The scenario above assumes that this patch doesn't introduce any
regressions."

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Reviewed-by: Piotr Jastrzebski <piotr@scylladb.com>
Message-Id: <1471608921-30818-1-git-send-email-pdziepak@scylladb.com>
(cherry picked from commit e60bb83688)
2016-08-19 18:11:49 +03:00
Pekka Enberg
11d7f83d52 Revert "Merge "Don't allow CK wrapping ranges" from Duarte"
This reverts commit 85376ce555, reversing
changes made to 3f54e0c28e.

The change breaks CQL range queries.
2016-08-19 18:11:31 +03:00
Pekka Enberg
5c4a24c1c0 dist/docker: Use Scylla mascot as the logo
Glauber "eagle eyes" Costa pointed out that the Scylla logo used in our
Docker image documentation looks broken because it's missing the Scylla
text.

Fix the problem by using the Scylla mascot instead.
Message-Id: <1471525154-2800-1-git-send-email-penberg@scylladb.com>

(cherry picked from commit 2bf5e8de6e)
2016-08-19 12:50:46 +03:00
Pekka Enberg
306eeedf3e dist/docker: Fix bug tracker URL in the documentation
The bug tracker URL in our Docker image documentation is not clickable
because the URL Markdown extracts automatically is broken.

Fix that and add some more links on how to get help and report issues.
Message-Id: <1471524880-2501-1-git-send-email-penberg@scylladb.com>

(cherry picked from commit 4d90e1b4d4)
2016-08-19 12:50:42 +03:00
Pekka Enberg
9eada540d9 release: prepare for 1.3.0 2016-08-18 16:20:21 +03:00
Yoav Kleinberger
a662765087 docker: extend supervisor capabilities
Allow the user to use the `supervisorctl' program to start and stop
services. `exec` needed to be added to the scylla and scylla-jmx starter
scripts; otherwise supervisord loses track of the actual process we
want to manage.

Signed-off-by: Yoav Kleinberger <yoav@scylladb.com>
Message-Id: <1471442960-110914-1-git-send-email-yoav@scylladb.com>
(cherry picked from commit 25fb5e831e)
2016-08-18 15:41:08 +03:00
Pekka Enberg
192f89bc6f dist/docker: Documentation cleanups
- Fix invisible characters to be space so that Markdown to PDF
  conversion works.

- Fix formatting of examples to be consistent.

- Spellcheck.

Message-Id: <1471514924-29361-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit 1553bec57a)
2016-08-18 13:14:21 +03:00
Pekka Enberg
b16bb0c299 dist/docker: Document image command line options
This patch documents all the command line options Scylla's Docker image supports.

Message-Id: <1471513755-27518-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit 4ca260a526)
2016-08-18 13:14:16 +03:00
Amos Kong
cd9d967c44 systemd: have the first housekeeping check right after start
Issue: https://github.com/scylladb/scylla/issues/1594

Currently systemd runs the first housekeeping check at the end of the
first timer period. We expect it to run right after start.

This patch makes systemd consistent with upstart.

Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <4cc880d509b0a7b283278122a70856e21e5f1649.1471433388.git.amos@scylladb.com>
(cherry picked from commit 9d53305475)
2016-08-17 16:02:25 +03:00
Avi Kivity
236b089b03 Merge "Fixes for streamed_mutation_from_mutation" from Paweł
"This series contains fixes for two memory leaks in
streamed_mutation_from_mutation.

Fixes #1557."

(cherry picked from commit 4871b19337)
2016-08-17 13:26:25 +03:00
Avi Kivity
9d54b33644 Merge 2016-08-17 13:25:49 +03:00
Benoit Canet
4ef6c3155e systemd: Remove WorkingDirectory directive
The WorkingDirectory directive does not support environment variables on
the systemd version shipped with Ubuntu 16.04. Fortunately, not
setting WorkingDirectory implicitly sets it to the user's home directory,
which is the same thing (i.e. /var/lib/scylla).

Fixes #1319

Signed-off-by: Benoit Canet <benoit@scylladb.com>
Message-Id: <1470053876-1019-1-git-send-email-benoit@scylladb.com>
(cherry picked from commit 90ef150ee9)
2016-08-17 12:34:44 +03:00
Takuya ASADA
fe529606ae dist/common/scripts: mkdir -p /var/lib/scylla/coredump before symlinking
We are creating this dir in scylla_raid_setup, but the user may create an XFS volume without using that command; scylla_coredump_setup should work in that condition too.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1470638615-17262-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 60ce16cd54)
2016-08-16 12:35:15 +03:00
Avi Kivity
85376ce555 Merge "Don't allow CK wrapping ranges" from Duarte
"This patchset ensures user-specified clustering key ranges are never
wrapping, as those types of ranges are not defined for CQL3.

Fixes #1544"
2016-08-16 10:09:31 +03:00
Duarte Nunes
5e8ac82614 cql3: Discard wrap around ranges.
Wrapping ranges are not supported in CQL3. If one is specified,
this patch converts it to an empty range.

Fixes #1544

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-08-15 15:22:44 +00:00
Duarte Nunes
22c8520d61 storage_proxy: Short circuit query without clustering ranges
This patch makes the storage_proxy return an empty result when the
query doesn't define any clustering ranges (default or specific).

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-08-15 15:05:23 +00:00
Duarte Nunes
e7355c9b60 thrift: Don't always validate clustering range
This patch makes make_clustering_range not enforce that the range be
non-wrapping, so that it can be validated differently if needed. A
make_clustering_range_and_validate function is introduced that keeps
the old behavior.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-08-15 15:05:18 +00:00
Paweł Dziepak
3f54e0c28e partition_version: handle errors during version merge
Currently, partition snapshot destructor can throw which is a big no-no.
The solution is to ignore the exception and leave versions unmerged and
hope that subsequent reads will succeed at merging.

However, another problem is that the merge doesn't use allocating
sections which means that memory won't be reclaimed to satisfy its
needs. If the cache is full this may result in partition versions not
being merged for a very long time.

This patch introduces partition_snapshot::merge_partition_versions()
which contains all the version merging logic that was previously present
in the snapshot destructor. This function may throw so that it can be
used with allocating sections.

The actual merging and handling of potential errors is done from the
partition_snapshot_reader destructor. It tries to merge versions under
the allocating section. Only if that fails does it give up and leave them
unmerged.

Fixes #1578
Fixes #1579.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1471265544-23579-1-git-send-email-pdziepak@scylladb.com>
(cherry picked from commit 5cae44114f)
2016-08-15 15:57:10 +03:00
Asias He
4c6f8f9d85 gossip: Add heart_beat_version to collectd
$ tools/scyllatop/scyllatop.py '*gossip*'

node-1/gossip-0/gauge-heart_beat_version 1.0
node-2/gossip-0/gauge-heart_beat_version 1.0
node-3/gossip-0/gauge-heart_beat_version 1.0

Gossip heart beat version changes every second. If everything is working
correctly, the gauge-heart_beat_version output should be 1.0. If not,
the gauge-heart_beat_version output will be less than 1.0.

Message-Id: <cbdaa1397cdbcd0dc6a67987f8af8038fd9b2d08.1470712861.git.asias@scylladb.com>
(cherry picked from commit ef782f0335)
2016-08-15 12:32:17 +03:00
Nadav Har'El
7a76157cb9 sstables: don't forget to read static row
[v2: fix check for static column (don't check if the schema is not compound)
 and move want-static-columns flag inside the filtering context to avoid
 changing all the callers.]

When a CQL request asks to read only a range of clustering keys inside
a partition, we actually need to read not just these clustering rows, but
also the static columns and add them to the response (as explained by Tomek
in issue #1568).

With the current code, that CQL request is translated into an
sstable::read_row() with a clustering-key filter. But this currently
only reads the requested clustering keys - NOT the static columns.

We don't want sstable::read_row() to unconditionally read the static
columns from disk because if, for example, they are already cached, we
might not want to read them from disk. We don't have such a partial-partition
cache yet, but we are likely to have one in the future.

This patch adds to the clustering key filter object a flag of whether we
need to read the static columns (actually, it's a function, returning this
flag per partition, to match the API for the clustering-key filtering).

When sstable::read_row() sees the flag for this partition is true, it also
requests reading the static columns.
Currently, the code always passes "true" for this flag, because we don't
have the logic to cache partially-read partitions.

The current find_disk_ranges() code does not yet support returning a non-
contiguous byte range, so this patch, if it notices that this partition
really has static columns in addition to the range it needs to read,
falls back to reading the entire partition. This is a correct solution
(and fixes #1568) but not the most efficient one. Because static
columns are relatively rare, let's start with this solution (correct
but less efficient when there are static columns), and providing the non-
contiguous reading support is left as a FIXME.

Fixes #1568

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <1471124536-19471-1-git-send-email-nyh@scylladb.com>
(cherry picked from commit 0d00da7f7f)
2016-08-15 12:30:36 +03:00
Amnon Heiman
b2e6a52461 scylla.spec: conditionally include the housekeeping.cfg in the conf package
When the housekeeping configuration name was changed from conf to cfg it
was no longer included as part of the conf rpm.

This change adds a macro that determines whether the file should be
included and uses that macro to conditionally add the configuration
file to the rpm.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <1471169042-19099-1-git-send-email-amnon@scylladb.com>
(cherry picked from commit 612f677283)
2016-08-14 13:26:25 +03:00
Tomasz Grabiec
b1376fef9b partition_version: Add missing linearization context
Snapshot removal merges partitions, and cell merging must be done
inside linearization context.

Fixes #1574

Reviewed-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1471010625-18019-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 1b2ea14d0e)
2016-08-12 17:56:21 +03:00
Piotr Jastrzebski
23f4813a48 Fix after free access bug in storage proxy
Due to speculative reads we can't guarantee that all
fibers started by storage_proxy::query will be finished
by the time the method returns a result.

We need to make sure that no parameter passed to this
method ever changes.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Message-Id: <31952e323e599905814b7f378aafdf779f7072b8.1471005642.git.piotr@scylladb.com>
(cherry picked from commit f212a6cfcb)
2016-08-12 16:35:45 +02:00
Duarte Nunes
c16c3127fe docker: If set, broadcast address is seed
This patch configures the broadcast address to be the seed if it is
configured, otherwise Scylla complains about it and aborts.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1470863058-1011-1-git-send-email-duarte@scylladb.com>
(cherry picked from commit 918a2939ff)
2016-08-12 11:47:08 +03:00
Tomasz Grabiec
48fdeb47e2 Merge branch 'raphael/fix_min_max_metadata_v2' from git@github.com:raphaelsc/scylla.git
Fix for generation of sstables min/max clustering metadata from Raphael.

(cherry picked from commit d7f8ce7722)
2016-08-11 17:53:01 +03:00
Avi Kivity
9ef4006d67 Update seastar submodule
* seastar 36a8ebe...e6571c4 (1):
  > reactor: Do not test for poll mode default
2016-08-11 14:45:52 +03:00
Amnon Heiman
75c53e4f24 scylla-housekeeping: rename configuration file from conf to cfg
Files with a conf extension are run by the scylla_prepare on the AMI.
The scylla-housekeeping configuration file is not a bash script and
should not be run.

This patch changes its extension to cfg which is more python like.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <1470896759-22651-2-git-send-email-amnon@scylladb.com>
(cherry picked from commit 5a4fc9c503)
2016-08-11 14:45:11 +03:00
Tomasz Grabiec
66292c0ef0 sstables: Fix bug in promoted index generation
maybe_flush_pi_block, which is called for each cell, assumes that
block_first_colname will be empty when the first cell is encountered
for each partition.

This didn't hold after writing a partition which generated no index
entry, because block_first_colname was cleared only when there was any
data written into the promoted index. Fix by always clearing the name.

The effect was that the promoted index entry for the next partition
would be flushed sooner than necessary (still counting since the start
of the previous partition) and with offset pointing to the start of
the current partition. This will cause parsing error when such sstable
is read through promoted index entry because the offset is assumed to
point to a cell not to partition start.

Fixes #1567

Message-Id: <1470909915-4400-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit f1c2481040)
2016-08-11 13:09:05 +03:00
Amnon Heiman
84f7d9a49c build_deb: Add dist flag
The dist flag marks the debian package as a distributed package.
As such the housekeeping configuration file will be included in the
package and will not need to be created by the scylla_setup.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <1470907208-502-2-git-send-email-amnon@scylladb.com>
(cherry picked from commit a24941cc5f)
2016-08-11 12:25:28 +03:00
Pekka Enberg
f0535eae9b dist/docker: Fix typo in "--overprovisioned" help text
Reported by Mathias Bogaert (@analytically).
Message-Id: <1470904395-4614-1-git-send-email-penberg@scylladb.com>

(cherry picked from commit d1a052237d)
2016-08-11 11:49:42 +03:00
Avi Kivity
4f096b60df Update seastar submodule
* seastar de789f1...36a8ebe (1):
  > reactor: fix I/O queue pending requests collectd metric

Fixes #1558.
2016-08-10 15:28:09 +03:00
Pekka Enberg
4ac160f2fe release: prepare for 1.3.rc3 2016-08-10 13:53:53 +03:00
Avi Kivity
395edc4361 Merge 2016-08-10 13:34:48 +03:00
Avi Kivity
e2c9feafa3 Merge "Add configuration file to scylla-housekeeping" from Amnon
"The series adds an optional configuration file to the scylla-housekeeping. The
file acts as a way to prevent scylla-housekeeping from running. A missing
configuration file will make scylla-housekeeping exit immediately.

The series adds a flag to the build_rpm that differentiates between public
distributions that would contain the configuration file and private
distributions that will not contain it, which will cause the setup script to
create it."

(cherry picked from commit da4d33802e)
2016-08-10 13:34:04 +03:00
Avi Kivity
f4dea17c19 Merge "housekeeping: Switch to python2 and handle version" from Amnon
This series handle two issues:
* Moving to python2: though python3 is supported, there are modules that we
  need that are not rpm-installable; python3 can wait until it is more
  mature.

* Check version should send the current version when it checks for a new one,
  and a simple string compare is wrong.

(cherry picked from commit ec62f0d321)
2016-08-10 13:31:50 +03:00
Amnon Heiman
a45b72b66f scylla-housekeeping: check version should use the current version
This patch handles two issues with check version:
* When checking for a version, the script sends the current version
* Instead of a string compare, it uses parse_version to compare the
versions.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
(cherry picked from commit 406fa11cc5)
2016-08-10 13:29:53 +03:00
Amnon Heiman
be1c2a875b scylla-housekeeping: Switching to python2
There is a problem with python module installation in python3,
especially on CentOS. Though python34 has a normal package, a lot of the
modules lack yum installation and can only be installed by pip.

This patch switches the scylla-housekeeping implementation to use
python2; we should switch back to python3 when CentOS python3 is
more mature.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
(cherry picked from commit 641e5dc57c)
2016-08-10 13:23:47 +03:00
Nadav Har'El
0b9f83c6b6 sstable: avoid copying non-existent value
The promoted-index reading code contained a bug where it copied the value
of a disengaged optional (this non-value was never used, but it was still
copied). Fix it by keeping the optional<> as such for longer.

This bug caused tests/sstable_test in the debug build to crash (the release
build somehow worked).

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <1470742418-8813-1-git-send-email-nyh@scylladb.com>
(cherry picked from commit e005762271)
2016-08-10 13:14:49 +03:00
Pekka Enberg
0d77615b80 cql3: Filter compaction strategy class from compaction options
Cassandra 2.x does not store the compaction strategy class in compaction
options so neither should we to avoid confusing the drivers.

Fixes #1538.
Message-Id: <1470722615-29106-1-git-send-email-penberg@scylladb.com>

(cherry picked from commit 9ff242d339)
2016-08-10 12:44:50 +03:00
Pekka Enberg
8771220745 dist/docker: Add '--smp', '--memory', and '--overprovisioned' options
Add '--smp', '--memory', and '--overprovisioned' options to the Docker
image. The options are written to /etc/scylla.d/docker.conf file, which
is picked up by the Scylla startup scripts.

You can now, for example, restrict your Docker container to 1 CPU and 1
GB of memory with:

   $ docker run --name some-scylla penberg/scylla --smp 1 --memory 1G --overprovisioned 1

Needed by folks who want to run Scylla on Docker in production.

Cc: Sasha Levin <alexander.levin@verizon.com>
Message-Id: <1470680445-25731-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit 6a5ab6bff4)
2016-08-10 11:54:01 +03:00
Avi Kivity
f552a62169 Update seastar submodule
* seastar ee1ecc5...de789f1 (1):
  > Merge "Fix the SMP queue poller" from Tomasz

Fixes #1553.
2016-08-10 09:54:15 +03:00
Avi Kivity
696a978611 Update seastar submodule
* seastar 0b53ab2...ee1ecc5 (1):
  > byteorder: add missing cpu_to_be(), be_to_cpu() functions

Fixes build failure.
2016-08-10 09:51:35 +03:00
Nadav Har'El
0475a98de1 Avoid some warnings in debug build
The sanitizer of the debug build warns when a "bool" variable is read while
containing a value that is not 0 or 1. In particular, if a class has an
uninitialized bool field, which the class logic only sets later,
then "move"ing such an object will read the uninitialized value and produce
this warning.

This patch fixes four of these warnings seen in sstable_test by initializing
some bool fields to false, even though the code doesn't strictly need this
initialization.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <1470744318-10230-1-git-send-email-nyh@scylladb.com>
(cherry picked from commit c2e4f5ba16)
2016-08-09 17:54:54 +03:00
Nadav Har'El
0b69e37065 Fix failing tests
Commit 0d8463aba5 broke some of the tests with an assertion
failure about local_is_initialized(). It turns out that there is more than
one level of local_is_initialized() we need to check... For some tests,
neither local was initialized, but for others, one was and the other
wasn't, and the wrong one was tested.

With this patch, all unit tests except "flush_queue_test.cc" pass on my
machine. I doubt this test is relevant to the promoted index patches,
but I'll continue to investigate it.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <1470695199-32649-1-git-send-email-nyh@scylladb.com>
(cherry picked from commit bce020efbd)
2016-08-09 17:54:49 +03:00
Avi Kivity
dc6be68852 Merge "promoted index for reading partial partitions" from Nadav
"The goal of this patch series is to support reading and writing of a
"promoted index" - the Cassandra 2.* SSTable feature which allows reading
only a part of the partition without needing to read an entire partition
when it is very long. To make a long story short, a "promoted index" is
a sample of each partition's column names, written to the SSTable Index
file with that partition's entry. See a longer explanation of the index
file format, and the promoted index, here:

     https://github.com/scylladb/scylla/wiki/SSTables-Index-File

There are two main features in this series - first enabling reading of
parts of partitions (using the promoted index stored in an sstable),
and then enable writing promoted indexes to new sstables. These two
features are broken up into smaller stand-alone pieces to facilitate the
review.

Three features are still missing from this series and are planned to be
developed later:

1. When we fail to parse a partition's promoted index, we silently fall back
   to reading the entire partition. We should log (with rate limiting) and
   count these errors, to help in debugging sstable problems.

2. The current code only uses the promoted index when looking for a single
   contiguous clustering-key range. If the ck range is non-contiguous, we
   fall back to reading the entire partition. We should use the promoted
   index in that case too.

3. The current code only uses the promoted index when reading a single
   partition, via sstable::read_row(). When scanning through all or a
   range of partitions (read_rows() or read_range_rows()), we do not yet
   use the promoted index; We read contiguously from data file (we do not
   even read from the index file, so unsurprisingly we can't use it)."

(cherry picked from commit 700feda0db)
2016-08-09 17:54:15 +03:00
Avi Kivity
8c20741150 Revert "sstables: promoted index write support"
This reverts commit c0e387e1ac.  The full
patchset needs to be backported instead.
2016-08-09 17:53:24 +03:00
Avi Kivity
3e3eaa693c Revert "Fix failing tests"
This reverts commit 8d542221eb.  It is needed,
but prevents another revert from taking place.  Will be reinstated later.
2016-08-09 17:52:57 +03:00
Avi Kivity
03ef0a9231 Revert "Avoid some warnings in debug build"
This reverts commit 47bf8181af.  It is needed,
but prevents another revert from taking place.  Will be reinstated later.
2016-08-09 17:52:09 +03:00
Nadav Har'El
47bf8181af Avoid some warnings in debug build
The sanitizer of the debug build warns when a "bool" variable is read while
containing a value that is not 0 or 1. In particular, if a class has an
uninitialized bool field, which the class logic only sets later,
then "move"ing such an object will read the uninitialized value and produce
this warning.

This patch fixes four of these warnings seen in sstable_test by initializing
some bool fields to false, even though the code doesn't strictly need this
initialization.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <1470744318-10230-1-git-send-email-nyh@scylladb.com>
(cherry picked from commit c2e4f5ba16)
2016-08-09 16:58:27 +03:00
Nadav Har'El
8d542221eb Fix failing tests
Commit 0d8463aba5 broke some of the tests with an assertion
failure about local_is_initialized(). It turns out that there is more than
one level of local_is_initialized() we need to check... For some tests,
neither local was initialized, but for others, one was and the other
wasn't, and the wrong one was tested.

With this patch, all unit tests except "flush_queue_test.cc" pass on my
machine. I doubt this test is relevant to the promoted index patches,
but I'll continue to investigate it.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <1470695199-32649-1-git-send-email-nyh@scylladb.com>
(cherry picked from commit bce020efbd)
2016-08-09 16:58:27 +03:00
Nadav Har'El
c0e387e1ac sstables: promoted index write support
This patch adds writing of promoted index to sstables.

The promoted index is basically a sample of columns and their positions
for large partitions: The promoted index appears in the sstable's index
file for partitions which are larger than 64 KB, and divides the partition
into 64 KB blocks (as in Cassandra, this interval is configurable through
the column_index_size_in_kb config parameter). Beyond modifying the index
file, having a promoted index may also modify the data file: since each
of these blocks may be read independently, we need to add at the beginning of
each block the list of range tombstones that are still open at that
position.

See also https://github.com/scylladb/scylla/wiki/SSTables-Index-File

Fixes #959

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
(cherry picked from commit 0d8463aba5)
2016-08-09 16:58:27 +03:00
Duarte Nunes
57d3dc5c66 thrift: Set default validator
This patch sets the default validator for dynamic column families.
Doing so has no consequences in terms of behavior, but it causes the
correct type to be shown when describing the column family through
cassandra-cli.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1470739773-30497-1-git-send-email-duarte@scylladb.com>
(cherry picked from commit 0ed19ec64d)
2016-08-09 13:56:43 +02:00
Duarte Nunes
2daee0b62d thrift: Send empty col metadata when describing ks
This patch ensures we always send the column metadata, even when the
column family is dynamic and the metadata is empty, as some clients
like cassandra-cli always assume its presence.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1470740971-31169-1-git-send-email-duarte@scylladb.com>
(cherry picked from commit f63886b32e)
2016-08-09 14:34:14 +03:00
Pekka Enberg
3eddf5ac54 dist/docker: Document data volume and cpuset configuration
Message-Id: <1470649675-5648-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit 3b31d500c8)
2016-08-09 11:15:57 +03:00
Pekka Enberg
42d6f389f9 dist/docker: Add '--broadcast-rpc-address' command line option
We already have a '--broadcast-address' command line option, so let's add
the same thing for RPC broadcast address configuration.

Message-Id: <1470656449-11038-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit 4372da426c)
2016-08-09 11:15:53 +03:00
Pekka Enberg
1a6f6f1605 Update scylla-ami submodule
* dist/ami/files/scylla-ami 2e599a3...14c1666 (1):
  > setup coredump on first startup
2016-08-09 11:10:20 +03:00
Avi Kivity
8d8e997f5a Update scylla-ami submodule
* dist/ami/files/scylla-ami 863cc45...2e599a3 (1):
  > Do not set developer-mode on unsupported instance types
2016-08-07 17:52:13 +03:00
Takuya ASADA
50ee889679 dist/ami/files: add a message for privacy policy agreement on login prompt
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1470212054-351-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 3d45d6579b)
2016-08-07 17:40:56 +03:00
Duarte Nunes
325f917d8a system_keyspace: Correctly deal with wrapped ranges
This patch ensures we correctly deal with ranges that wrap around when
querying the size_estimates system table.

Ref #693

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1470412433-7767-1-git-send-email-duarte@scylladb.com>
(cherry picked from commit e0a43a82c6)
2016-08-07 17:21:58 +03:00
Takuya ASADA
b088dd7d9e dist/ami/files: show warning message for unsupported instance types
Notify users to run scylla_io_setup before launching scylla on unsupported instance types.

Fixes #1511

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1470090415-8632-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit bd1ab3a0ad)
2016-08-05 09:51:27 +03:00
Takuya ASADA
a42b2bb0d6 dist/ami: Install scylla metapackage and debuginfo on AMI
Install scylla metapackage and debuginfo on the AMI to make it easier to report bugs from the AMI.
Fixes #1496

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1469635071-16821-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 9b59bb59f2)
2016-08-05 09:48:19 +03:00
Takuya ASADA
aecda01f8e dist/common/scripts: disable coredump compression by default, add an argument to enable compression on scylla_coredump_setup
On large memory machine compression takes too long, so disable it by default.
Also provide a way to enable it again.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1469706934-6280-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 89b790358e)
2016-08-05 09:47:17 +03:00
Takuya ASADA
f9b0a29def dist/ami: setup correct repository when --localrpm specified
There was no way to set up the correct repo when the AMI is built with the --localrpm option, since the AMI does not have access to the 'version' file and we did not pass the repo URL to the AMI.
So detect the optimal repo path when starting the AMI build, pass the repo URL to the AMI, and set it up correctly.

Note: this changes the behavior of build_ami.sh/scylla_install_pkg's --repo option.
It was a repository URL, but now becomes a .repo/.list file URL.
This is optimal for distributions which require 3rdparty packages to install scylla, like CentOS7.
Existing shell scripts which invoke build_ami.sh need to change accordingly, such as our Jenkins jobs.

Fixes #1414

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1469636377-17828-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit d3746298ae)
2016-08-05 09:45:22 +03:00
Pekka Enberg
192e935832 dist/docker: Use Scylla 1.3 RPM repository 2016-08-05 09:13:08 +03:00
Avi Kivity
436ff3488a Merge "Docker image fixes" from Pekka
"Kubernetes is unhappy with our Docker image because we start systemd
under the hood. Fix that by switching to use "supervisord" to manage the
two processes -- "scylla" and "scylla-jmx":

  http://blog.kunicki.org/blog/2016/02/12/multiple-entrypoints-in-docker/

While at it, fix up "docker logs" and "docker exec cqlsh" to work
out-of-the-box, and update our documentation to match what we have.

Further work is needed to ensure Scylla production configuration works
as expected and is documented accordingly."

(cherry picked from commit 28ee2bdbd2)
2016-08-05 09:10:51 +03:00
Benoît Canet
b91712fc36 docker: Add documentation page for Docker Hub
Signed-off-by: Benoît Canet <benoit@scylladb.com>
Message-Id: <1466438296-5593-1-git-send-email-benoit@scylladb.com>
(cherry picked from commit 4ce7bced27)
2016-08-05 09:10:48 +03:00
Yoav Kleinberger
be954ccaec docker: bring docker image closer to a more 'standard' scylla installation
Previously, the Docker image could only be run interactively, which is
not conducive to running clusters. This patch makes the docker image
run in the background (using systemd). This makes the docker workflow
similar to working with virtual machines, i.e. the user launches a
container, and once it is running they can connect to it with

       docker exec -it <container_name> bash

and immediately use `cqlsh` to control it.

In addition, the configuration of scylla is done using established
scripts, such as `scylla_dev_mode_setup`, `scylla_cpuset_setup` and
`scylla_io_setup`, whereas previously code from these scripts was
duplicated into the docker startup file.

To specify seeds for making a cluster, use the --seeds command line
argument, e.g.

    docker run -d --privileged scylladb/scylla
    docker run -d --privileged scylladb/scylla --seeds 172.17.0.2

other options include --developer-mode, --cpuset, --broadcast-address

The --developer-mode option is on by default - so that we don't fail users
who just want to play with this.

The Dockerfile entrypoint script was rewritten as a few Python modules.
The move to Python is merited because:

    * Using `sed` to manipulate YAML is fragile
    * Lack of proper command line parsing resulted in introducing ad-hoc environment variables
    * Shell scripts don't throw exceptions, and it's easy to forget to check exit codes for every single command

I've made an effort to make the entrypoint `go' script very simple and readable.
The gory details are hidden inside the other python modules.

Signed-off-by: Yoav Kleinberger <yoav@scylladb.com>
Message-Id: <1468938693-32168-1-git-send-email-yoav@scylladb.com>
(cherry picked from commit d1d1be4c1a)
2016-08-05 09:10:35 +03:00
Glauber Costa
2bffa8af74 logalloc: make sure allocations in release_requests don't recurse back into the allocator
Calls like later() and with_gate() may allocate memory, although that is not
very common. This can create a problem in the sense that it will potentially
recurse and bring us back to the allocator during free - which is the very thing
we are trying to avoid with the call to later().

This patch wraps the relevant calls in the reclaimer lock. This does mean that the
allocation may fail if we are under severe pressure - which includes having
exhausted all reserved space - but at least we won't recurse back to the
allocator.

To make sure we do this as early as possible, we just fold both release_requests
and do_release_requests into a single function.

Thanks Tomek for the suggestion.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <980245ccc17960cf4fcbbfedb29d1878a98d85d8.1470254846.git.glauber@scylladb.com>
(cherry picked from commit fe6a0d97d1)
2016-08-04 11:17:54 +02:00
Glauber Costa
4a6d0d503f logalloc: make sure blocked requests memory allocations are served from the standard allocator
Issue 1510 describes a scenario in which, under load, we allocate memory within
release_requests() leading to a reentry into an invalid state in our
blocked requests' shared_promise.

This is not easy to trigger since not all allocations will actually get to the
point in which they need a new segment, let alone have that happening during
another allocator call.

Having those kinds of reentry is something we have always sought to avoid with
release_requests(): this is the reason why most of the actual routine is
deferred after a call to later().

However, that is a trick we cannot use for updating the state of the blocked
requests' shared_promise: we can't guarantee when that is going to run, and we
always need a valid shared_promise, in a valid state, waiting for new requests
to hook into.

The solution employed by this patch is to make sure that no allocation
operations whatsoever happen during the initial part of release_requests on
behalf of the shared promise.  Allocation is now deferred to first use, which
relieves release_requests() from all allocation duties. All it needs to do is
free the old object and signal to its user that an allocation is needed (by
storing {} into the shared_promise).

Fixes #1510

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <49771e51426f972ddbd4f3eeea3cdeef9cc3b3c6.1470238168.git.glauber@scylladb.com>
(cherry picked from commit ad58691afb)
2016-08-04 11:17:49 +02:00
Avi Kivity
fa81385469 conf: synchronize internode_compression between scylla.yaml and code
Our default is "none", to give reasonable performance, so have scylla.yaml
reflect that.

(cherry picked from commit 9df4ac53e5)
2016-08-04 12:10:03 +03:00
Duarte Nunes
93981aaa93 schema_builder: Ensure dense tables have compact col
This patch ensures that when the schema is dense, regardless of
compact_storage being set, the single regular column is translated
into a compact column.

This fixes an issue where Thrift dynamic column families are
translated to a dense schema with a regular column, instead of a
compact one.

Since a compact column is also a regular column (e.g., for purposes of
querying), no further changes are required.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1470062410-1414-1-git-send-email-duarte@scylladb.com>
(cherry picked from commit 5995aebf39)

Fixes #1535.
2016-08-03 13:49:51 +02:00
Duarte Nunes
89b40f54db schema: Dense schemas are correctly upgraded
When upgrading a dense schema, we would drop the cells of the regular
(compact) column. This patch fixes this by making the regular and
compact column kinds compatible.

Fixes #1536

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1470172097-7719-1-git-send-email-duarte@scylladb.com>
2016-08-03 13:37:57 +02:00
Paweł Dziepak
99dfbedf36 sstables: extend sstable life until reader is fully closed
data_consume_rows_context needs to have close() called and the returned
future waited for before it can be destroyed. data_consume_context::impl
does that in the background upon its destruction.

However, it is possible that the sstable is removed before
data_consume_rows_context::close() completes in which case EBADF may
happen. The solution is to make data_consume_context::impl keep a
reference to the sstable and extend its life time until closing of
data_consume_rows_context (which is performed in the background)
completes.

Side effect of this change is also that data_consume_context no longer
requires its user to make sure that the sstable exists as long as it is
in use since it owns its own reference to it.

Fixes #1537.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1470222225-19948-1-git-send-email-pdziepak@scylladb.com>
(cherry picked from commit 02ffc28f0d)
2016-08-03 13:19:50 +02:00
Paweł Dziepak
e95f4eaee4 Merge "partition_limit: Don't count dead partitions" from Duarte
"This patch series ensures we don't count dead partitions (i.e.,
partitions with no live rows) towards the partition_limit. We also
enforce the partition limit at the storage_proxy level, so that
limits with smp > 1 work correctly."

(cherry picked from commit 5f11a727c9)
2016-08-03 12:44:32 +03:00
Avi Kivity
2570da2006 Update seastar submodule
* seastar f603f88...0b53ab2 (2):
  > reactor: limit task backlog
  > reactor: make sure a poll cycle always happens when later is called

Fix runaway task queue growth on cpu-bound loads.
2016-08-03 12:33:54 +03:00
Tomasz Grabiec
b224ff6ede Merge 'pdziepak/row-cache-wide-entries/v4' from seastar-dev.git
This series adds the ability for the partition cache to keep information on
whether its size makes a partition uncacheable. During reads, these
entries save us IO operations since we already know that the partition
is too big to be put in the cache.

First part of the patchset makes all mutation_readers allow the
streamed_mutations they produce to outlive them, which is a guarantee
used later by the code handling reading large partitions.

(cherry picked from commit d2ed75c9ff)
2016-08-02 20:24:29 +02:00
Piotr Jastrzebski
6960fce9b2 Use continuity flag correctly with concurrent invalidations
Between reading a cache entry and actually using it,
invalidations can happen, so we have to check that no flag was
cleared; if it was, we need to read the entry again.

Fixes #1464.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Message-Id: <7856b0ded45e42774ccd6f402b5ee42175bd73cf.1469701026.git.piotr@scylladb.com>
(cherry picked from commit fdfd1af694)
2016-08-02 20:24:22 +02:00
Avi Kivity
a556265ccd checked_file: preserve DMA alignment
Inherit the alignment parameters from the underlying file instead of
defaulting to 4096.  This gives better read performance on disks with 512-byte
sectors.

Fixes #1532.
Message-Id: <1470122188-25548-1-git-send-email-avi@scylladb.com>

(cherry picked from commit 9f35e4d328)
2016-08-02 12:22:37 +03:00
Duarte Nunes
8243d3d1e0 storage_service: Fix get_range_to_address_map_in_local_dc
This patch fixes a couple of bugs in
get_range_to_address_map_in_local_dc.

Fixes #1517

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1469782666-21320-1-git-send-email-duarte@scylladb.com>
(cherry picked from commit 7d1b7e8da3)
2016-07-29 11:24:12 +02:00
Pekka Enberg
2665bfdc93 Update seastar submodule
* seastar 103543a...f603f88 (1):
  > iotune: Fix SIGFPE with some executions
2016-07-29 11:11:56 +03:00
Tomasz Grabiec
3a1e8fffde Merge branch 'sstables/static-1.3/v1' from git@github.com:duarten/scylla.git into branch-1.3
The current code assumes cell names are always compound and may
wrongly report a non-static row as such. This patch addresses this
and adds a test case to catch regressions.

Backports the fix to #1495.
2016-07-28 15:07:41 +02:00
Gleb Natapov
23c340bed8 api: fix use after free in sum_sstable
get_sstables_including_compacted_undeleted() may return temporary shared
ptr which will be destroyed before the loop if not stored locally.

Fixes #1514

Message-Id: <20160728100504.GD2502@scylladb.com>
(cherry picked from commit 3531dd8d71)
2016-07-28 14:28:25 +03:00
Duarte Nunes
ff8a795021 sstables: Validate static cell is on static column
This patch enforces compatibility between a cell and the
corresponding column definition with regards to them being
static.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-07-28 12:11:46 +02:00
Duarte Nunes
d11b0cac3b sstable_mutation_test: Test non-compound cell name
This patch adds a test case for reading non-compound cell names,
validating that such a cell is not incorrectly marked as static.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1469616205-4550-5-git-send-email-duarte@scylladb.com>
2016-07-28 12:11:37 +02:00
Duarte Nunes
5ad0448cc9 sstables: Don't assume cell name is compound
The current code assumes cell names are always compound and may
wrongly report a non-static row as such, since it looks at the first
bytes of the name assuming they are the component's length.

Tables with compact storage (which cannot contain static rows) may not
have a compound comparator, so we check for the table's compoundness
before checking for the static marker. We do this by delegating to
composite_view::is_static.

Fixes #1495

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1469616205-4550-4-git-send-email-duarte@scylladb.com>
2016-07-28 12:11:27 +02:00
Duarte Nunes
35ab2cadc2 sstables: Remove duplication in extract_clustering_key
This patch removes some duplicated code in extract_clustering_key(),
which is already handled in composite_view.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1469397806-8067-1-git-send-email-duarte@scylladb.com>
2016-07-28 12:11:22 +02:00
Duarte Nunes
a1cee9f97c sstables: Remove superfluous call to check_static()
When building a column we're calling check_static() two times;
refactor things a bit so that this doesn't happen and we reuse the
previous calculation.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1469397748-7987-1-git-send-email-duarte@scylladb.com>
2016-07-28 12:11:15 +02:00
Duarte Nunes
0ae7347d8e composite: Use operator[] instead of at()
Since we already do bounds checking on is_static(), we can use
bytes_view::operator[] instead of bytes_view::at() to avoid repeating
the bounds checking.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1469616205-4550-3-git-send-email-duarte@scylladb.com>
2016-07-28 12:10:14 +02:00
Duarte Nunes
b04168c015 composite_view: Fix is_static
composite_view's is_static function is wrong because:

1) It doesn't guard against the composite being a compound;
2) It doesn't deal with widening due to integral promotions and
   consequent sign extension.

This patch fixes this by ensuring there's only one correct
implementation of is_static, to avoid code duplication and
enforce test coverage.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1469616205-4550-2-git-send-email-duarte@scylladb.com>
2016-07-28 12:10:06 +02:00
Duarte Nunes
4e13853cbc compound_compat: Only compound values can be static
If a composite is not a compound, then it doesn't carry a length
prefix where static information is encoded. In its absence, a
non-compound composite can never be static.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1469397561-7748-1-git-send-email-duarte@scylladb.com>
2016-07-28 12:09:59 +02:00
Pekka Enberg
503f6c6755 release: prepare for 1.3.rc2 2016-07-28 10:57:11 +03:00
Tomasz Grabiec
7d73599acd tests: lsa_async_eviction_test: Use chunked_fifo<>
To protect against large reallocations during push(), which are done
under the reclaim lock and may fail.
2016-07-28 09:43:51 +02:00
Piotr Jastrzebski
bf27379583 Add tests for wide partition handling in cache.
They shouldn't be cached.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
(cherry picked from commit 7d29cdf81f)
2016-07-27 14:09:45 +03:00
Piotr Jastrzebski
02cf5a517a Add collectd counter for uncached wide partitions.
Keep track of every read of a wide partition that's
not cached.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
(cherry picked from commit 37a7d49676)
2016-07-27 14:09:40 +03:00
Piotr Jastrzebski
ec3d59bf13 Add flag to configure
max size of a cached partition.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
(cherry picked from commit 636a4acfd0)
2016-07-27 14:09:34 +03:00
Piotr Jastrzebski
30c72ef3b4 Try to read whole streamed_mutation up to limit
If the limit is exceeded, return the streamed_mutation
and don't cache it.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
(cherry picked from commit 98c12dc2e2)
2016-07-27 14:09:29 +03:00
Piotr Jastrzebski
15e69a32ba Implement mutation_from_streamed_mutation_with_limit
If a mutation is bigger than this limit,
it won't be read and mutation_from_streamed_mutation
will return an empty optional.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
(cherry picked from commit 0d39bb1ad0)
2016-07-27 14:09:23 +03:00
Paweł Dziepak
4e43cb84ff tests/sstables: test reading sstable with duplicated range tombstones
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
(cherry picked from commit b405ff8ad2)
2016-07-27 14:09:02 +03:00
Paweł Dziepak
07d5e939be sstables: avoid recursion in sstable_streamed_mutation::read_next()
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
(cherry picked from commit 04f2c278c2)
2016-07-27 14:06:03 +03:00
Paweł Dziepak
a2a5a22504 sstables: protect against duplicated range tombstones
The promoted index may cause an sstable to have range tombstones duplicated
several times. These duplicates appear in the "wrong" place since they
are smaller than the entity preceding them.

This patch ignores such duplicates by skipping range tombstones that are
smaller than previously read ones.

Moreover, these duplicated range tombstones may appear in the middle of
a clustering row, so the sstable reader has also gained the ability to
merge parts of the row in such cases.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
(cherry picked from commit 08032db269)
2016-07-27 14:05:58 +03:00
Paweł Dziepak
a39bec0e24 tests: extract streamed_mutation assertions
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
(cherry picked from commit 50469e5ef3)
2016-07-27 14:05:43 +03:00
Duarte Nunes
f0af5719d5 thrift: Preserve partition order when accumulating
This patch changes the column_visitor so that it preserves the order
of the partitions it visits when building the accumulation result.

This is required by verbs such as get_range_slice, on top of which
users can implement paging. In such cases, the last key returned by
the query will be that start of the range for the next query. If
that key is not actually the last in the partitioner's order, then
the new request will likely result in duplicate values being sent.

Ref #693

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1469568135-19644-1-git-send-email-duarte@scylladb.com>
(cherry picked from commit 5aaf43d1bc)
2016-07-27 12:11:41 +03:00
Avi Kivity
0523000af5 size_estimates_recorder: unwrap ranges before searching for sstables
column_family::select_sstables() requires unwrapped ranges, so unwrap
them.  Fixes crash with Leveled Compaction Strategy.

Fixes #1507.

Reviewed-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1469563488-14869-1-git-send-email-avi@scylladb.com>
(cherry picked from commit 64d0cf58ea)
2016-07-27 10:07:13 +03:00
Paweł Dziepak
69a0e6e002 sstables: fix skipping partitions with no rows
If a partition contains no static or clustering rows and no range
tombstones, mp_row_consumer will return a disengaged
mutation_fragment_opt with the is_mutation_end flag set to mark the
end of this partition.

Currently, the mutation_reader::impl code incorrectly recognizes a
disengaged mutation fragment as the end of the stream of all
mutations. This patch fixes that by using the is_mutation_end flag to
determine whether the end of a partition or the end of the stream was
reached.

Fixes #1503.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1469525449-15525-1-git-send-email-pdziepak@scylladb.com>
(cherry picked from commit efa690ce8c)
2016-07-26 13:10:31 +03:00
Amos Kong
58d4de295c scylla-housekeeping: fix typo of script path
I tried to start the scylla-housekeeping service with:
 # sudo systemctl restart scylla-housekeeping.service

But it failed due to a wrong script path; error detail:
 systemd[5605]: Failed at step EXEC spawning
 /usr/lib/scylla/scylla-Housekeeping: No such file or directory

The right script name is 'scylla-housekeeping'

Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <c11319a3c7d3f22f613f5f6708699be0aa6bd740.1469506477.git.amos@scylladb.com>
(cherry picked from commit 64530e9686)
2016-07-26 09:19:15 +03:00
Vlad Zolotarov
026061733f tracing: set a default TTL for system_traces tables when they are created
Fixes #1482

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
Message-Id: <1469104164-4452-1-git-send-email-vladz@cloudius-systems.com>
(cherry picked from commit 4647ad9d8a)
2016-07-25 13:50:43 +03:00
Vlad Zolotarov
1d7ed190f8 SELECT tracing instrumentation: improve inter-nodes communication stages messages
Add/fix "sending to"/"received from" messages.

With this patch the single key select trace with a data on an external node
looks as follows:

Tracing session: 65dbfcc0-4f51-11e6-8dd2-000000000001

 activity                                                                                                                        | timestamp                  | source    | source_elapsed
---------------------------------------------------------------------------------------------------------------------------------+----------------------------+-----------+----------------
                                                                                                              Execute CQL3 query | 2016-07-21 17:42:50.124000 | 127.0.0.2 |              0
                                                                                                   Parsing a statement [shard 1] | 2016-07-21 17:42:50.124127 | 127.0.0.2 |             --
                                                                                                Processing a statement [shard 1] | 2016-07-21 17:42:50.124190 | 127.0.0.2 |             64
 Creating read executor for token 2309717968349690594 with all: {127.0.0.1} targets: {127.0.0.1} repair decision: NONE [shard 1] | 2016-07-21 17:42:50.124229 | 127.0.0.2 |            103
                                                                            read_data: sending a message to /127.0.0.1 [shard 1] | 2016-07-21 17:42:50.124234 | 127.0.0.2 |            108
                                                                           read_data: message received from /127.0.0.2 [shard 1] | 2016-07-21 17:42:50.124358 | 127.0.0.1 |             14
                                                          read_data handling is done, sending a response to /127.0.0.2 [shard 1] | 2016-07-21 17:42:50.124434 | 127.0.0.1 |             89
                                                                               read_data: got response from /127.0.0.1 [shard 1] | 2016-07-21 17:42:50.124662 | 127.0.0.2 |            536
                                                                                  Done processing - preparing a result [shard 1] | 2016-07-21 17:42:50.124695 | 127.0.0.2 |            569
                                                                                                                Request complete | 2016-07-21 17:42:50.124580 | 127.0.0.2 |            580

Fixes #1481

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
Message-Id: <1469112271-22818-1-git-send-email-vladz@cloudius-systems.com>
(cherry picked from commit 57b58cad8e)
2016-07-25 13:50:39 +03:00
Raphael S. Carvalho
2d66a4621a compaction: do not convert timestamp resolution to uppercase
C* only allows timestamp resolution in uppercase, so we shouldn't
be forgiving about it, otherwise migration to C* will not work.
Timestamp resolution is stored in compaction strategy options of
schema BTW.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <d64878fc9bbcf40fd8de3d0f08cce9f6c2fde717.1469133851.git.raphaelsc@scylladb.com>
(cherry picked from commit c4f34f5038)
2016-07-25 13:47:23 +03:00
Duarte Nunes
aaa9b5ace8 system_keyspace: Add query_size_estimates() function
The query_size_estimates() function queries the size_estimates system
table for a given keyspace and table, filtering out the token ranges
according to the specified tokens.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
(cherry picked from commit ecfa04da77)
2016-07-25 13:43:16 +03:00
Duarte Nunes
8d491e9879 size_estimates_recorder: Fix stop()
This patch fixes stop() by checking the current CPU instead of
whether the service is active (which it won't be at the time stop()
is called).

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
(cherry picked from commit d984cc30bf)
2016-07-25 13:43:08 +03:00
Duarte Nunes
b63c9fb84b system_keyspace: Avoid pointers in range_estimates
This patch makes range_estimates a proper struct, where tokens are
represented as dht::tokens rather than dht::ring_position*.

We also pass other arguments to update_ and clear_size_estimates by
copy, since one will already be required.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
(cherry picked from commit e16f3f2969)
2016-07-25 13:42:53 +03:00
Duarte Nunes
b229f03198 thrift: Fail when creating mixed CF
This patch ensures we fail when creating a mixed column family, either
when adding columns to a dynamic CF through updated_column_family() or
when adding a dynamic column upon insertion.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1469378658-19853-1-git-send-email-duarte@scylladb.com>
(cherry picked from commit 5c4a2044d5)
2016-07-25 13:42:05 +03:00
Duarte Nunes
6caa59560b thrift: Correctly translate no_such_column_family
The no_such_column_family exception was being translated to
InvalidRequestException instead of NotFoundException.

8991d35231 exposed this problem.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1469376674-14603-1-git-send-email-duarte@scylladb.com>
(cherry picked from commit 560cc12fd7)
2016-07-25 13:41:58 +03:00
Duarte Nunes
79196af9fb thrift: Implement describe_splits verb
This patch implements the describe_splits verb on top of
describe_splits_ex.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
(cherry picked from commit ab08561b89)
2016-07-25 13:41:54 +03:00
Duarte Nunes
afe09da858 thrift: Implement describe_splits_ex verb
This patch implements the describe_splits_ex verbs by querying the
size_estimates system table for all the estimates in the specified
token range.

If the keys_per_split argument is bigger than the
estimated partition count, then we merge ranges until keys_per_split
is met. Note that since the tokens can't be split any further,
keys_per_split might be less than the reported number of keys in one
or more ranges.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
(cherry picked from commit 472c23d7d2)
2016-07-25 13:41:46 +03:00
Duarte Nunes
d6cb41ff24 thrift: Handle and convert invalid_request_exception
This patch converts an exceptions::invalid_request_exception
into a Thrift InvalidRequestException instead of into a generic one.

This makes TitanDB work correctly, which expects an
InvalidRequestException when setting a non-existent keyspace.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1469362086-1013-1-git-send-email-duarte@scylladb.com>
(cherry picked from commit 2be45c4806)
2016-07-24 16:46:18 +03:00
Duarte Nunes
6bf77c7b49 thrift: Use database::find_schema directly
This patch changes lookup_schema() so it directly calls
database::find_schema() instead of going through
database::find_column_family(). It also drops conversion of the
no_such_column_family exception, as that is already handled at a higher
layer.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
(cherry picked from commit 8991d35231)
2016-07-24 16:46:05 +03:00
Duarte Nunes
6d34b4dab7 thrift: Remove hardcoded version constant
...and use the one in thrift_server.hh instead.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
(cherry picked from commit 038d42c589)
2016-07-24 16:45:46 +03:00
Duarte Nunes
d367f1e9ab thrift: Remove unused with_cob_dereference function
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
(cherry picked from commit 8bb43d09b1)
2016-07-24 16:45:22 +03:00
Avi Kivity
75a36ae453 bloom_filter: fix overflow for large filters
We use ::abs(), which has an int parameter, on long arguments, resulting
in incorrect results.

Switch to std::abs() instead, which has the correct overloads.

Fixes #1494.

Message-Id: <1469347802-28933-1-git-send-email-avi@scylladb.com>
(cherry picked from commit 900639915d)
2016-07-24 11:32:28 +03:00
Tomasz Grabiec
35c1781913 schema_tables: Fix hang during keyspace drop
Fixes #1484.

We drop tables as part of keyspace drop. Table drop starts with
creating a snapshot on all shards. All shards must use the same
snapshot timestamp which, among other things, is part of the snapshot
name. The timestamp is generated using the supplied timestamp-generating
function (a joinpoint object). The joinpoint object will wait for all
shards to arrive and then generate and return the timestamp.

However, we drop tables in parallel, using the same joinpoint
instance. So joinpoint may be contacted by snapshotting shards of
tables A and B concurrently, generating timestamp t1 for some shards
of table A and some shards of table B. Later the remaining shards of
table A will get a different timestamp. As a result, different shards
may use different snapshot names for the same table. The snapshot
creation will never complete because the sealing fiber waits for all
shards to signal it, on the same name.

The fix is to give each table a separate joinpoint instance.

Message-Id: <1469117228-17879-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 5e8f0efc85)
2016-07-22 15:36:45 +02:00
Vlad Zolotarov
1489b28ffd cql_server::connection::process_prepare(): don't std::move() a shared_ptr captured by reference in value_of() lambda
A seastar::value_of() lambda used in a trace point was doing the unthinkable:
it called std::move() on a value captured by reference. Not only did it
compile(!!!), it also actually std::move()d the shared_ptr before it was used
in make_result(), which naturally caused a SIGSEGV crash.

Fixes #1491

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
Message-Id: <1469193763-27631-1-git-send-email-vladz@cloudius-systems.com>
(cherry picked from commit 9423c13419)
2016-07-22 16:33:17 +03:00
Avi Kivity
f975653c94 Update seastar submodule to point at scylla-seastar 2016-07-21 12:31:09 +03:00
Duarte Nunes
96f5cbb604 thrift: Omit regular columns for dynamic CFs
This patch skips adding the auto-generated regular column when
describing a dynamic column family for the describe_keyspace(s) verbs.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1469091720-10113-1-git-send-email-duarte@scylladb.com>
(cherry picked from commit a436cf945c)
2016-07-21 12:06:29 +03:00
Raphael S. Carvalho
66ebef7d10 tests: add new test for date tiered strategy
This test sets the time window to 1 hour and checks that the strategy
works accordingly.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit cf54af9e58)
2016-07-21 12:00:26 +03:00
Raphael S. Carvalho
789fb0db97 compaction: implement date tiered compaction strategy options
Now date tiered compaction strategy will take into account the
strategy options which are defined in the schema.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit eaa6e281a2)
2016-07-21 12:00:18 +03:00
Pekka Enberg
af7c0f6433 Revert "Merge seastar upstream"
This reverts commit aaf6786997.

We should backport the iotune fixes for 1.3 and not pull everything.
2016-07-21 11:19:50 +03:00
Pekka Enberg
aaf6786997 Merge seastar upstream
* seastar 103543a...9d1db3f (8):
  > reactor: limit task backlog
  > iotune: Fix SIGFPE with some executions
  > Merge "Preparation for protobuf" from Amnon
  > byteorder: add missing cpu_to_be(), be_to_cpu() functions
  > rpc: fix gcc-7 compilation error
  > reactor: Register the smp metrics disabled
  > scollectd: Allow creating metric that is disabled
  > Merge "Propagate timeout to a server" from Gleb
2016-07-21 11:04:31 +03:00
Pekka Enberg
e8cb163cdf db/config: Start Thrift server by default
We have Thrift support now so start the server by default.
Message-Id: <1469002000-26767-1-git-send-email-penberg@scylladb.com>

(cherry picked from commit aff8cf319d)
2016-07-20 11:29:24 +03:00
Duarte Nunes
2d7c322805 thrift: Actually concatenate strings
This patch fixes concatenating a char[] with an int by using sprint
instead of just increasing the pointer.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1468971542-9600-1-git-send-email-duarte@scylladb.com>
(cherry picked from commit 64dff69077)
2016-07-20 11:09:15 +03:00
Tomasz Grabiec
13f18c6445 database: Add table name to log message about sealing
Message-Id: <1468917744-2539-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 0d26294fac)
2016-07-20 10:13:32 +03:00
Tomasz Grabiec
9c430c2cff schema_tables: Add more logging
Message-Id: <1468917771-2592-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit a0832f08d2)
2016-07-20 10:13:28 +03:00
Pekka Enberg
c84e030fe9 release: prepare for 1.3.rc1 2016-07-19 20:15:26 +03:00
136 changed files with 3554 additions and 938 deletions

.gitmodules
View File

@@ -1,6 +1,6 @@
[submodule "seastar"]
path = seastar
url = ../seastar
url = ../scylla-seastar
ignore = dirty
[submodule "swagger-ui"]
path = swagger-ui

View File

@@ -1,6 +1,6 @@
#!/bin/sh
VERSION=666.development
VERSION=1.3.3
if test -f version
then

View File

@@ -219,8 +219,9 @@ static future<json::json_return_type> sum_sstable(http_context& ctx, const sstr
auto uuid = get_uuid(name, ctx.db.local());
return ctx.db.map_reduce0([uuid, total](database& db) {
std::unordered_map<sstring, uint64_t> m;
for (auto t :*((total) ? db.find_column_family(uuid).get_sstables_including_compacted_undeleted() :
db.find_column_family(uuid).get_sstables()).get()) {
auto sstables = (total) ? db.find_column_family(uuid).get_sstables_including_compacted_undeleted() :
db.find_column_family(uuid).get_sstables();
for (auto t : *sstables) {
m[t->get_filename()] = t->bytes_on_disk();
}
return m;
@@ -234,8 +235,9 @@ static future<json::json_return_type> sum_sstable(http_context& ctx, const sstr
static future<json::json_return_type> sum_sstable(http_context& ctx, bool total) {
return map_reduce_cf_raw(ctx, std::unordered_map<sstring, uint64_t>(), [total](column_family& cf) {
std::unordered_map<sstring, uint64_t> m;
for (auto t :*((total) ? cf.get_sstables_including_compacted_undeleted() :
cf.get_sstables()).get()) {
auto sstables = (total) ? cf.get_sstables_including_compacted_undeleted() :
cf.get_sstables();
for (auto t : *sstables) {
m[t->get_filename()] = t->bytes_on_disk();
}
return m;
@@ -273,6 +275,14 @@ static double get_compression_ratio(column_family& cf) {
return std::move(result).get();
}
static std::vector<uint64_t> concat_sstable_count_per_level(std::vector<uint64_t> a, std::vector<uint64_t>&& b) {
a.resize(std::max(a.size(), b.size()), 0UL);
for (auto i = 0U; i < b.size(); i++) {
a[i] += b[i];
}
return a;
}
void set_column_family(http_context& ctx, routes& r) {
cf::get_column_family_name.set(r, [&ctx] (const_req req){
vector<sstring> res;
@@ -898,12 +908,11 @@ void set_column_family(http_context& ctx, routes& r) {
});
cf::get_sstable_count_per_level.set(r, [&ctx](std::unique_ptr<request> req) {
// TBD
// FIXME
// This is a workaround, until there will be an API to return the count
// per level, we return an empty array
vector<uint64_t> res;
return make_ready_future<json::json_return_type>(res);
return map_reduce_cf_raw(ctx, req->param["name"], std::vector<uint64_t>(), [](const column_family& cf) {
return cf.sstable_count_per_level();
}, concat_sstable_count_per_level).then([](const std::vector<uint64_t>& res) {
return make_ready_future<json::json_return_type>(res);
});
});
}
}

View File

@@ -22,6 +22,7 @@
#include "compaction_manager.hh"
#include "api/api-doc/compaction_manager.json.hh"
#include "db/system_keyspace.hh"
#include "column_family.hh"
namespace api {
@@ -78,7 +79,9 @@ void set_compaction_manager(http_context& ctx, routes& r) {
});
cm::get_pending_tasks.set(r, [&ctx] (std::unique_ptr<request> req) {
return get_cm_stats(ctx, &compaction_manager::stats::pending_tasks);
return map_reduce_cf(ctx, int64_t(0), [](column_family& cf) {
return cf.get_compaction_strategy().estimated_pending_compactions(cf);
}, std::plus<int64_t>());
});
cm::get_completed_tasks.set(r, [&ctx] (std::unique_ptr<request> req) {

View File

@@ -47,11 +47,8 @@
const sstring auth::data_resource::ROOT_NAME("data");
auth::data_resource::data_resource(level l, const sstring& ks, const sstring& cf)
: _ks(ks), _cf(cf)
: _level(l), _ks(ks), _cf(cf)
{
if (l != get_level()) {
throw std::invalid_argument("level/keyspace/column mismatch");
}
}
auth::data_resource::data_resource()
@@ -67,14 +64,7 @@ auth::data_resource::data_resource(const sstring& ks, const sstring& cf)
{}
auth::data_resource::level auth::data_resource::get_level() const {
if (!_cf.empty()) {
assert(!_ks.empty());
return level::COLUMN_FAMILY;
}
if (!_ks.empty()) {
return level::KEYSPACE;
}
return level::ROOT;
return _level;
}
auth::data_resource auth::data_resource::from_name(

View File

@@ -56,6 +56,7 @@ private:
static const sstring ROOT_NAME;
level _level;
sstring _ks;
sstring _cf;

View File

@@ -40,6 +40,7 @@
*/
#include <unordered_map>
#include <boost/algorithm/string.hpp>
#include "permission.hh"
const auth::permission_set auth::permissions::ALL_DATA =
@@ -75,7 +76,9 @@ const sstring& auth::permissions::to_string(permission p) {
}
auth::permission auth::permissions::from_string(const sstring& s) {
return permission_names.at(s);
sstring upper(s);
boost::to_upper(upper);
return permission_names.at(upper);
}
std::unordered_set<sstring> auth::permissions::to_strings(const permission_set& set) {

View File

@@ -28,7 +28,11 @@ class checked_file_impl : public file_impl {
public:
checked_file_impl(disk_error_signal_type& s, file f)
: _signal(s) , _file(f) {}
: _signal(s) , _file(f) {
_memory_dma_alignment = f.memory_dma_alignment();
_disk_read_dma_alignment = f.disk_read_dma_alignment();
_disk_write_dma_alignment = f.disk_write_dma_alignment();
}
virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override {
return do_io_check(_signal, [&] {

View File

@@ -0,0 +1,127 @@
/*
* Copyright (C) 2016 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "keys.hh"
#include "schema.hh"
#include "range.hh"
/**
* Represents the kind of bound in a range tombstone.
*/
enum class bound_kind : uint8_t {
excl_end = 0,
incl_start = 1,
// values 2 to 5 are reserved for forward Origin compatibility
incl_end = 6,
excl_start = 7,
};
std::ostream& operator<<(std::ostream& out, const bound_kind k);
bound_kind invert_kind(bound_kind k);
int32_t weight(bound_kind k);
static inline bound_kind flip_bound_kind(bound_kind bk)
{
switch (bk) {
case bound_kind::excl_end: return bound_kind::excl_start;
case bound_kind::incl_end: return bound_kind::incl_start;
case bound_kind::excl_start: return bound_kind::excl_end;
case bound_kind::incl_start: return bound_kind::incl_end;
}
abort();
}
class bound_view {
const static thread_local clustering_key empty_prefix;
public:
const clustering_key_prefix& prefix;
bound_kind kind;
bound_view(const clustering_key_prefix& prefix, bound_kind kind)
: prefix(prefix)
, kind(kind)
{ }
struct compare {
// To make it assignable and to avoid taking a schema_ptr, we
// wrap the schema reference.
std::reference_wrapper<const schema> _s;
compare(const schema& s) : _s(s)
{ }
bool operator()(const clustering_key_prefix& p1, int32_t w1, const clustering_key_prefix& p2, int32_t w2) const {
auto type = _s.get().clustering_key_prefix_type();
auto res = prefix_equality_tri_compare(type->types().begin(),
type->begin(p1), type->end(p1),
type->begin(p2), type->end(p2),
tri_compare);
if (res) {
return res < 0;
}
auto d1 = p1.size(_s);
auto d2 = p2.size(_s);
if (d1 == d2) {
return w1 < w2;
}
return d1 < d2 ? w1 <= 0 : w2 > 0;
}
bool operator()(const bound_view b, const clustering_key_prefix& p) const {
return operator()(b.prefix, weight(b.kind), p, 0);
}
bool operator()(const clustering_key_prefix& p, const bound_view b) const {
return operator()(p, 0, b.prefix, weight(b.kind));
}
bool operator()(const bound_view b1, const bound_view b2) const {
return operator()(b1.prefix, weight(b1.kind), b2.prefix, weight(b2.kind));
}
};
bool equal(const schema& s, const bound_view other) const {
return kind == other.kind && prefix.equal(s, other.prefix);
}
bool adjacent(const schema& s, const bound_view other) const {
return invert_kind(other.kind) == kind && prefix.equal(s, other.prefix);
}
static bound_view bottom() {
return {empty_prefix, bound_kind::incl_start};
}
static bound_view top() {
return {empty_prefix, bound_kind::incl_end};
}
/*
template<template<typename> typename T, typename U>
concept bool Range() {
return requires (T<U> range) {
{ range.start() } -> stdx::optional<U>;
{ range.end() } -> stdx::optional<U>;
};
};*/
template<template<typename> typename Range>
static std::pair<bound_view, bound_view> from_range(const Range<clustering_key_prefix>& range) {
return {
range.start() ? bound_view(range.start()->value(), range.start()->is_inclusive() ? bound_kind::incl_start : bound_kind::excl_start) : bottom(),
range.end() ? bound_view(range.end()->value(), range.end()->is_inclusive() ? bound_kind::incl_end : bound_kind::excl_end) : top(),
};
}
friend std::ostream& operator<<(std::ostream& out, const bound_view& b) {
return out << "{bound: prefix=" << b.prefix << ", kind=" << b.kind << "}";
}
};

View File

@@ -28,9 +28,9 @@
namespace query {
const std::vector<range<clustering_key_prefix>>&
const clustering_row_ranges&
clustering_key_filtering_context::get_ranges(const partition_key& key) const {
static thread_local std::vector<range<clustering_key_prefix>> full_range = {{}};
static thread_local clustering_row_ranges full_range = {{}};
return _factory ? _factory->get_ranges(key) : full_range;
}
@@ -43,9 +43,9 @@ const clustering_key_filtering_context no_clustering_key_filtering =
class stateless_clustering_key_filter_factory : public clustering_key_filter_factory {
clustering_key_filter _filter;
std::vector<range<clustering_key_prefix>> _ranges;
clustering_row_ranges _ranges;
public:
stateless_clustering_key_filter_factory(std::vector<range<clustering_key_prefix>>&& ranges,
stateless_clustering_key_filter_factory(clustering_row_ranges&& ranges,
clustering_key_filter&& filter)
: _filter(std::move(filter)), _ranges(std::move(ranges)) {}
@@ -57,16 +57,20 @@ public:
return _filter;
}
virtual const std::vector<range<clustering_key_prefix>>& get_ranges(const partition_key& key) override {
virtual const clustering_row_ranges& get_ranges(const partition_key& key) override {
return _ranges;
}
virtual bool want_static_columns(const partition_key& key) override {
return true;
}
};
class partition_slice_clustering_key_filter_factory : public clustering_key_filter_factory {
schema_ptr _schema;
const partition_slice& _slice;
clustering_key_prefix::prefix_equal_tri_compare _cmp;
query::clustering_row_ranges _ck_ranges;
clustering_row_ranges _ck_ranges;
public:
partition_slice_clustering_key_filter_factory(schema_ptr s, const partition_slice& slice)
: _schema(std::move(s)), _slice(slice), _cmp(*_schema) {}
@@ -75,7 +79,7 @@ public:
const clustering_row_ranges& ranges = _slice.row_ranges(*_schema, key);
return [this, &ranges] (const clustering_key& key) {
return std::any_of(std::begin(ranges), std::end(ranges),
[this, &key] (const range<clustering_key_prefix>& r) { return r.contains(key, _cmp); });
[this, &key] (const clustering_range& r) { return r.contains(key, _cmp); });
};
}
@@ -83,11 +87,11 @@ public:
const clustering_row_ranges& ranges = _slice.row_ranges(*_schema, key);
return [this, &ranges] (const clustering_key& key) {
return std::any_of(std::begin(ranges), std::end(ranges),
[this, &key] (const range<clustering_key_prefix>& r) { return r.contains(key, _cmp); });
[this, &key] (const clustering_range& r) { return r.contains(key, _cmp); });
};
}
virtual const std::vector<range<clustering_key_prefix>>& get_ranges(const partition_key& key) override {
virtual const clustering_row_ranges& get_ranges(const partition_key& key) override {
if (_slice.options.contains(query::partition_slice::option::reversed)) {
_ck_ranges = _slice.row_ranges(*_schema, key);
std::reverse(_ck_ranges.begin(), _ck_ranges.end());
@@ -95,6 +99,10 @@ public:
}
return _slice.row_ranges(*_schema, key);
}
virtual bool want_static_columns(const partition_key& key) override {
return true;
}
};
static const shared_ptr<clustering_key_filter_factory>
@@ -105,10 +113,10 @@ create_partition_slice_filter(schema_ptr s, const partition_slice& slice) {
const clustering_key_filtering_context
clustering_key_filtering_context::create(schema_ptr schema, const partition_slice& slice) {
static thread_local clustering_key_filtering_context accept_all = clustering_key_filtering_context(
::make_shared<stateless_clustering_key_filter_factory>(std::vector<range<clustering_key_prefix>>{{}},
::make_shared<stateless_clustering_key_filter_factory>(clustering_row_ranges{{}},
[](const clustering_key&) { return true; }));
static thread_local clustering_key_filtering_context reject_all = clustering_key_filtering_context(
::make_shared<stateless_clustering_key_filter_factory>(std::vector<range<clustering_key_prefix>>{},
::make_shared<stateless_clustering_key_filter_factory>(clustering_row_ranges{},
[](const clustering_key&) { return false; }));
if (slice.get_specific_ranges()) {


@@ -46,6 +46,9 @@ public:
// Creates a clustering key filter that can be applied to multiple clustering keys, provided they are sorted.
virtual clustering_key_filter get_filter_for_sorted(const partition_key&) = 0;
virtual const std::vector<range<clustering_key_prefix>>& get_ranges(const partition_key&) = 0;
// Whether we want to get the static row, in addition to the desired clustering rows
virtual bool want_static_columns(const partition_key&) = 0;
virtual ~clustering_key_filter_factory() = default;
};
@@ -65,6 +68,10 @@ public:
}
const std::vector<range<clustering_key_prefix>>& get_ranges(const partition_key& key) const;
bool want_static_columns(const partition_key& key) const {
return _factory ? _factory->want_static_columns(key) : true;
}
static const clustering_key_filtering_context create(schema_ptr, const partition_slice&);
static clustering_key_filtering_context create_no_filtering();


@@ -414,8 +414,12 @@ public:
return _bytes.empty();
}
static bool is_static(bytes_view bytes, bool is_compound) {
return is_compound && bytes.size() > 2 && (bytes[0] & bytes[1] & 0xff) == 0xff;
}
bool is_static() const {
return size() > 2 && (_bytes.at(0) & _bytes.at(1) & 0xff) == 0xff;
return is_static(_bytes, _is_compound);
}
bool is_compound() const {
@@ -514,7 +518,7 @@ public:
}
bool is_static() const {
return size() > 2 && (_bytes.at(0) & _bytes.at(1)) == 0xff;
return composite::is_static(_bytes, _is_compound);
}
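The hunk above deduplicates the static-marker test into one helper: in the compound composite encoding, a cell name whose first two bytes are both `0xFF` denotes the static row. A minimal standalone sketch of that check, using plain std types in place of Scylla's `bytes_view` (the helper name here is hypothetical):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Sketch of the shared static-marker check: only a compound composite
// can be static, it must be longer than the two marker bytes, and both
// marker bytes must be 0xFF (so their bitwise AND is 0xFF).
static bool is_static_prefix(const std::vector<uint8_t>& bytes, bool is_compound) {
    return is_compound
        && bytes.size() > 2
        && (bytes[0] & bytes[1] & 0xff) == 0xff;
}
```

Note that `bytes[0] & bytes[1] & 0xff == 0xff` holds only when both bytes are exactly `0xFF`, which is why a single AND suffices instead of two comparisons.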
explicit operator bytes_view() const {

conf/housekeeping.cfg (new file)

@@ -0,0 +1,2 @@
[housekeeping]
check-version: True


@@ -784,7 +784,7 @@ commitlog_total_space_in_mb: -1
# can be: all - all traffic is compressed
# dc - traffic between different datacenters is compressed
# none - nothing is compressed.
# internode_compression: all
# internode_compression: none
# Enable or disable tcp_nodelay for inter-dc communication.
# Disabling it will result in larger (but fewer) network packets being sent,


@@ -35,7 +35,7 @@ class converting_mutation_partition_applier : public mutation_partition_visitor
deletable_row* _current_row;
private:
static bool is_compatible(const column_definition& new_def, const data_type& old_type, column_kind kind) {
return new_def.kind == kind && new_def.type->is_value_compatible_with(*old_type);
return ::is_compatible(new_def.kind, kind) && new_def.type->is_value_compatible_with(*old_type);
}
void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, atomic_cell_view cell) {
if (is_compatible(new_def, old_type, kind) && cell.timestamp() > new_def.dropped_at()) {


@@ -394,7 +394,11 @@ public:
return bounds_range_type::bound(prefix, is_inclusive(b));
};
auto range = bounds_range_type(read_bound(statements::bound::START), read_bound(statements::bound::END));
return { range };
auto bounds = bound_view::from_range(range);
if (bound_view::compare(*_schema)(bounds.second, bounds.first)) {
return { };
}
return { std::move(range) };
}
#if 0
@Override


@@ -46,6 +46,8 @@
#include "cartesian_product.hh"
#include "cql3/restrictions/primary_key_restrictions.hh"
#include "cql3/restrictions/single_column_restrictions.hh"
#include <boost/range/adaptor/transformed.hpp>
#include <boost/range/adaptor/filtered.hpp>
namespace cql3 {
@@ -352,7 +354,14 @@ single_column_primary_key_restrictions<partition_key>::bounds_ranges(const query
template<>
std::vector<query::clustering_range>
single_column_primary_key_restrictions<clustering_key_prefix>::bounds_ranges(const query_options& options) const {
auto bounds = compute_bounds(options);
auto wrapping_bounds = compute_bounds(options);
auto bounds = boost::copy_range<query::clustering_row_ranges>(wrapping_bounds
| boost::adaptors::filtered([&](auto&& r) {
auto bounds = bound_view::from_range(r);
return !bound_view::compare(*_schema)(bounds.second, bounds.first);
})
| boost::adaptors::transformed([&](auto&& r) { return query::clustering_range(std::move(r));
}));
auto less_cmp = clustering_key_prefix::less_compare(*_schema);
std::sort(bounds.begin(), bounds.end(), [&] (query::clustering_range& x, query::clustering_range& y) {
if (!x.start() && !y.start()) {
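The hunk above filters out inverted bounds (end before start) before sorting the clustering ranges. A minimal sketch of that filtering step, with plain int pairs standing in for `bound_view` pairs and integer ordering standing in for the schema's comparator:

```cpp
#include <cassert>
#include <utility>
#include <vector>

// Sketch of dropping inverted ranges: a range whose end bound precedes
// its start bound matches nothing and is discarded; equal bounds are kept.
static std::vector<std::pair<int,int>>
drop_inverted(const std::vector<std::pair<int,int>>& ranges) {
    std::vector<std::pair<int,int>> out;
    for (const auto& r : ranges) {
        if (!(r.second < r.first)) {  // keep unless end precedes start
            out.push_back(r);
        }
    }
    return out;
}
```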


@@ -92,9 +92,9 @@ void cf_prop_defs::validate() {
throw exceptions::configuration_exception(sstring("Missing sub-option '") + COMPACTION_STRATEGY_CLASS_KEY + "' for the '" + KW_COMPACTION + "' option.");
}
_compaction_strategy_class = sstables::compaction_strategy::type(strategy->second);
#if 0
compactionOptions.remove(COMPACTION_STRATEGY_CLASS_KEY);
remove_from_map_if_exists(KW_COMPACTION, COMPACTION_STRATEGY_CLASS_KEY);
#if 0
CFMetaData.validateCompactionOptions(compactionStrategyClass, compactionOptions);
#endif
}


@@ -181,6 +181,21 @@ long property_definitions::to_long(sstring key, std::experimental::optional<sstr
}
}
void property_definitions::remove_from_map_if_exists(const sstring& name, const sstring& key)
{
auto it = _properties.find(name);
if (it == _properties.end()) {
return;
}
try {
auto map = boost::any_cast<std::map<sstring, sstring>>(it->second);
map.erase(key);
_properties[name] = map;
} catch (const boost::bad_any_cast& e) {
throw exceptions::syntax_exception(sprint("Invalid value for property '%s'. It should be a map.", name));
}
}
}
}
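The new `remove_from_map_if_exists` helper above copies the map-typed property out of its `boost::any`, erases the sub-key, and stores the modified copy back. A self-contained sketch of the same pattern using `std::any` in place of `boost::any` (the type names here are simplifications, not Scylla's):

```cpp
#include <any>
#include <cassert>
#include <map>
#include <stdexcept>
#include <string>

using property_map = std::map<std::string, std::any>;

// Sketch: look up the named property; if present and map-typed, copy the
// map out, erase the sub-key, and write the modified copy back. A
// non-map value is reported as an error, mirroring the diff's catch block.
static void remove_from_map_if_exists(property_map& props,
                                      const std::string& name,
                                      const std::string& key) {
    auto it = props.find(name);
    if (it == props.end()) {
        return;  // property absent: nothing to do
    }
    try {
        auto m = std::any_cast<std::map<std::string, std::string>>(it->second);
        m.erase(key);
        props[name] = m;
    } catch (const std::bad_any_cast&) {
        throw std::runtime_error("Invalid value for property '" + name +
                                 "'. It should be a map.");
    }
}
```

The copy-erase-store round trip is needed because `any_cast` by value yields a copy; mutating it in place would not affect the stored property.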


@@ -79,6 +79,7 @@ protected:
std::experimental::optional<std::map<sstring, sstring>> get_map(const sstring& name) const;
void remove_from_map_if_exists(const sstring& name, const sstring& key);
public:
bool has_property(const sstring& name) const;


@@ -127,7 +127,7 @@ column_family::column_family(schema_ptr schema, config config, db::commitlog* cl
, _streaming_memtables(_config.enable_disk_writes ? make_streaming_memtable_list() : make_memory_only_memtable_list())
, _compaction_strategy(make_compaction_strategy(_schema->compaction_strategy(), _schema->compaction_strategy_options()))
, _sstables(make_lw_shared(_compaction_strategy.make_sstable_set(_schema)))
, _cache(_schema, sstables_as_mutation_source(), sstables_as_key_source(), global_cache_tracker())
, _cache(_schema, sstables_as_mutation_source(), sstables_as_key_source(), global_cache_tracker(), _config.max_cached_partition_size_in_bytes)
, _commitlog(cl)
, _compaction_manager(compaction_manager)
, _flush_queue(std::make_unique<memtable_flush_queue>())
@@ -278,8 +278,14 @@ column_family::make_sstable_reader(schema_ptr s,
const io_priority_class& pc) const {
// restricts a reader's concurrency if the configuration specifies it
auto restrict_reader = [&] (mutation_reader&& in) {
if (_config.read_concurrency_config.sem) {
return make_restricted_reader(_config.read_concurrency_config, 1, std::move(in));
auto&& config = [this, &pc] () -> const restricted_mutation_reader_config& {
if (service::get_local_streaming_read_priority() == pc) {
return _config.streaming_read_concurrency_config;
}
return _config.read_concurrency_config;
}();
if (config.sem) {
return make_restricted_reader(config, 1, std::move(in));
} else {
return std::move(in);
}
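The hunk above replaces the single read-concurrency config with a selection between two configs, chosen by whether the request carries the streaming I/O priority class. A simplified sketch of that selection, with a bool standing in for the `io_priority_class` comparison and an int pointer standing in for the semaphore:

```cpp
#include <cassert>

struct reader_concurrency_config {
    int* sem = nullptr;  // stand-in for the restricting semaphore
};

struct cf_config {
    reader_concurrency_config read_concurrency_config;
    reader_concurrency_config streaming_read_concurrency_config;
};

// Sketch of the config selection in make_sstable_reader: streaming reads
// get their own (separately tunable) concurrency limits, so a flood of
// streaming reads cannot exhaust the semaphore used by regular reads.
static const reader_concurrency_config&
pick_config(const cf_config& cfg, bool is_streaming_read) {
    return is_streaming_read ? cfg.streaming_read_concurrency_config
                             : cfg.read_concurrency_config;
}
```

In the diff this choice is made by an immediately-invoked lambda returning a const reference, which keeps the subsequent `config.sem` check uniform for both cases.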
@@ -785,7 +791,7 @@ future<> column_family::seal_active_streaming_memtable_big(streaming_memtable_bi
future<>
column_family::seal_active_memtable(memtable_list::flush_behavior ignored) {
auto old = _memtables->back();
dblog.debug("Sealing active memtable, partitions: {}, occupancy: {}", old->partition_count(), old->occupancy());
dblog.debug("Sealing active memtable of {}.{}, partitions: {}, occupancy: {}", _schema->cf_name(), _schema->ks_name(), old->partition_count(), old->occupancy());
if (old->empty()) {
dblog.debug("Memtable is empty");
@@ -1206,6 +1212,19 @@ size_t column_family::sstables_count() const {
return _sstables->all()->size();
}
std::vector<uint64_t> column_family::sstable_count_per_level() const {
std::vector<uint64_t> count_per_level;
for (auto&& sst : *_sstables->all()) {
auto level = sst->get_sstable_level();
if (level + 1 > count_per_level.size()) {
count_per_level.resize(level + 1, 0UL);
}
count_per_level[level]++;
}
return count_per_level;
}
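The new `sstable_count_per_level` above builds a histogram indexed by compaction level, growing the vector lazily so levels never seen simply read as zero. The counting logic in isolation:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Sketch of per-level counting: the result vector is resized on demand
// to cover the highest level observed, zero-filling any gaps.
static std::vector<uint64_t> count_per_level(const std::vector<unsigned>& levels) {
    std::vector<uint64_t> counts;
    for (auto level : levels) {
        if (level + 1 > counts.size()) {
            counts.resize(level + 1, 0);
        }
        counts[level]++;
    }
    return counts;
}
```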
int64_t column_family::get_unleveled_sstables() const {
// TODO: when we support leveled compaction, we should return the number of
// SSTables in L0. If leveled compaction is enabled in this column family,
@@ -1581,7 +1600,7 @@ future<> database::parse_system_tables(distributed<service::storage_proxy>& prox
return parallel_for_each(tables.begin(), tables.end(), [this] (auto& t) {
auto s = t.second;
auto& ks = this->find_keyspace(s->ks_name());
auto cfg = ks.make_column_family_config(*s);
auto cfg = ks.make_column_family_config(*s, this->get_config());
this->add_column_family(s, std::move(cfg));
return ks.make_directory_for_column_family(s->cf_name(), s->id()).then([s] {});
});
@@ -1838,7 +1857,7 @@ void keyspace::update_from(::lw_shared_ptr<keyspace_metadata> ksm) {
}
column_family::config
keyspace::make_column_family_config(const schema& s) const {
keyspace::make_column_family_config(const schema& s, const db::config& db_config) const {
column_family::config cfg;
cfg.datadir = column_family_directory(s.cf_name(), s.id());
cfg.enable_disk_reads = _config.enable_disk_reads;
@@ -1850,8 +1869,10 @@ keyspace::make_column_family_config(const schema& s) const {
cfg.dirty_memory_manager = _config.dirty_memory_manager;
cfg.streaming_dirty_memory_manager = _config.streaming_dirty_memory_manager;
cfg.read_concurrency_config = _config.read_concurrency_config;
cfg.streaming_read_concurrency_config = _config.streaming_read_concurrency_config;
cfg.cf_stats = _config.cf_stats;
cfg.enable_incremental_backups = _config.enable_incremental_backups;
cfg.max_cached_partition_size_in_bytes = db_config.max_cached_partition_size_in_kb() * 1024;
return cfg;
}
@@ -2081,7 +2102,7 @@ std::unordered_set<sstring> database::get_initial_tokens() {
std::unordered_set<sstring> tokens;
sstring tokens_string = get_config().initial_token();
try {
boost::split(tokens, tokens_string, boost::is_any_of(sstring(",")));
boost::split(tokens, tokens_string, boost::is_any_of(sstring(", ")));
} catch (...) {
throw std::runtime_error(sprint("Unable to parse initial_token=%s", tokens_string));
}
@@ -2338,6 +2359,8 @@ database::make_keyspace_config(const keyspace_metadata& ksm) {
++_stats->sstable_read_queue_overloaded;
throw std::runtime_error("sstable inactive read queue overloaded");
};
cfg.streaming_read_concurrency_config = cfg.read_concurrency_config;
cfg.streaming_read_concurrency_config.timeout = {};
cfg.cf_stats = &_cf_stats;
cfg.enable_incremental_backups = _enable_incremental_backups;
return cfg;
@@ -2502,6 +2525,7 @@ future<> update_schema_version_and_announce(distributed<service::storage_proxy>&
return make_ready_future<>();
}).then([uuid] {
return db::system_keyspace::update_schema_version(uuid).then([uuid] {
dblog.info("Schema version changed to {}", uuid);
return service::get_local_migration_manager().passive_announce(uuid);
});
});
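Among the database.cc changes above, `get_initial_tokens` now splits on `", "` rather than `","`, so an `initial_token` list written with spaces after the commas still parses. A sketch of the tolerant parse (a hand-rolled loop in place of `boost::split`; empty fields, which the Boost call would also emit, are dropped here for brevity):

```cpp
#include <cassert>
#include <set>
#include <string>

// Sketch: treat both ',' and ' ' as separators, collecting non-empty
// tokens into a set, so "t1,t2" and "t1, t2" yield the same result.
static std::set<std::string> parse_tokens(const std::string& s) {
    std::set<std::string> tokens;
    std::string cur;
    for (char c : s) {
        if (c == ',' || c == ' ') {
            if (!cur.empty()) { tokens.insert(cur); cur.clear(); }
        } else {
            cur += c;
        }
    }
    if (!cur.empty()) tokens.insert(cur);
    return tokens;
}
```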


@@ -315,7 +315,9 @@ public:
::dirty_memory_manager* dirty_memory_manager = &default_dirty_memory_manager;
::dirty_memory_manager* streaming_dirty_memory_manager = &default_dirty_memory_manager;
restricted_mutation_reader_config read_concurrency_config;
restricted_mutation_reader_config streaming_read_concurrency_config;
::cf_stats* cf_stats = nullptr;
uint64_t max_cached_partition_size_in_bytes;
};
struct no_commitlog {};
struct stats {
@@ -652,6 +654,7 @@ public:
lw_shared_ptr<sstable_list> get_sstables_including_compacted_undeleted() const;
std::vector<sstables::shared_sstable> select_sstables(const query::partition_range& range) const;
size_t sstables_count() const;
std::vector<uint64_t> sstable_count_per_level() const;
int64_t get_unleveled_sstables() const;
void start_compaction();
@@ -853,6 +856,7 @@ public:
::dirty_memory_manager* dirty_memory_manager = &default_dirty_memory_manager;
::dirty_memory_manager* streaming_dirty_memory_manager = &default_dirty_memory_manager;
restricted_mutation_reader_config read_concurrency_config;
restricted_mutation_reader_config streaming_read_concurrency_config;
::cf_stats* cf_stats = nullptr;
};
private:
@@ -884,7 +888,7 @@ public:
*/
locator::abstract_replication_strategy& get_replication_strategy();
const locator::abstract_replication_strategy& get_replication_strategy() const;
column_family::config make_column_family_config(const schema& s) const;
column_family::config make_column_family_config(const schema& s, const db::config& db_config) const;
future<> make_directory_for_column_family(const sstring& name, utils::UUID uuid);
void add_or_update_column_family(const schema_ptr& s) {
_metadata->add_or_update_column_family(s);


@@ -369,6 +369,9 @@ public:
val(reduce_cache_sizes_at, double, .85, Invalid, \
"When Java heap usage (after a full concurrent mark sweep (CMS) garbage collection) exceeds this percentage, Cassandra reduces the cache capacity to the fraction of the current size as specified by reduce_cache_capacity_to. To disable, set the value to 1.0." \
) \
val(max_cached_partition_size_in_kb, uint64_t, 10240uLL, Used, \
"Partitions with size greater than this value won't be cached." \
) \
/* Disks settings */ \
val(stream_throughput_outbound_megabits_per_sec, uint32_t, 400, Unused, \
"Throttles all outbound streaming file transfers on a node to the specified throughput. Cassandra does mostly sequential I/O when streaming data during bootstrap or repair, which can lead to saturating the network connection and degrading client (RPC) performance." \
@@ -556,7 +559,7 @@ public:
val(rpc_port, uint16_t, 9160, Used, \
"Thrift port for client connections." \
) \
val(start_rpc, bool, false, Used, \
val(start_rpc, bool, true, Used, \
"Starts the Thrift RPC server" \
) \
val(rpc_keepalive, bool, true, Used, \


@@ -665,13 +665,16 @@ future<std::set<sstring>> merge_keyspaces(distributed<service::storage_proxy>& p
auto diff = difference(before, after, indirect_equal_to<lw_shared_ptr<query::result_set>>());
for (auto&& key : diff.entries_only_on_left) {
logger.info("Dropping keyspace {}", key);
dropped.emplace(key);
}
for (auto&& key : diff.entries_only_on_right) {
auto&& value = after[key];
logger.info("Creating keyspace {}", key);
created.emplace_back(schema_result_value_type{key, std::move(value)});
}
for (auto&& key : diff.entries_differing) {
logger.info("Altering keyspace {}", key);
altered.emplace_back(key);
}
return do_with(std::move(created), [&proxy, altered = std::move(altered)] (auto& created) mutable {
@@ -713,15 +716,21 @@ static void merge_tables(distributed<service::storage_proxy>& proxy,
std::map<qualified_name, schema_mutations>&& before,
std::map<qualified_name, schema_mutations>&& after)
{
struct dropped_table {
global_schema_ptr schema;
utils::joinpoint<db_clock::time_point> jp{[] {
return make_ready_future<db_clock::time_point>(db_clock::now());
}};
};
std::vector<global_schema_ptr> created;
std::vector<global_schema_ptr> altered;
std::vector<global_schema_ptr> dropped;
std::vector<dropped_table> dropped;
auto diff = difference(before, after);
for (auto&& key : diff.entries_only_on_left) {
auto&& s = proxy.local().get_db().local().find_schema(key.keyspace_name, key.table_name);
logger.info("Dropping {}.{} id={} version={}", s->ks_name(), s->cf_name(), s->id(), s->version());
dropped.emplace_back(s);
dropped.emplace_back(dropped_table{s});
}
for (auto&& key : diff.entries_only_on_right) {
auto s = create_table_from_mutations(after.at(key));
@@ -734,14 +743,12 @@ static void merge_tables(distributed<service::storage_proxy>& proxy,
altered.emplace_back(s);
}
do_with(utils::make_joinpoint([] { return db_clock::now();})
, [&created, &dropped, &altered, &proxy](auto& tsf) {
return proxy.local().get_db().invoke_on_all([&created, &dropped, &altered, &tsf] (database& db) {
proxy.local().get_db().invoke_on_all([&created, &dropped, &altered] (database& db) {
return seastar::async([&] {
for (auto&& gs : created) {
schema_ptr s = gs.get();
auto& ks = db.find_keyspace(s->ks_name());
auto cfg = ks.make_column_family_config(*s);
auto cfg = ks.make_column_family_config(*s, db.get_config());
db.add_column_family(s, cfg);
auto& cf = db.find_column_family(s);
cf.mark_ready_for_writes();
@@ -751,14 +758,13 @@ static void merge_tables(distributed<service::storage_proxy>& proxy,
for (auto&& gs : altered) {
update_column_family(db, gs.get()).get();
}
parallel_for_each(dropped.begin(), dropped.end(), [&db, &tsf](auto&& gs) {
schema_ptr s = gs.get();
return db.drop_column_family(s->ks_name(), s->cf_name(), [&tsf] { return tsf.value(); }).then([s] {
parallel_for_each(dropped.begin(), dropped.end(), [&db](dropped_table& dt) {
schema_ptr s = dt.schema.get();
return db.drop_column_family(s->ks_name(), s->cf_name(), [&dt] { return dt.jp.value(); }).then([s] {
return service::get_local_migration_manager().notify_drop_column_family(s);
});
}).get();
});
});
}).get();
}

View File

@@ -71,14 +71,30 @@ static std::vector<db::system_keyspace::range_estimates> estimates_for(const col
std::vector<db::system_keyspace::range_estimates> estimates;
estimates.reserve(local_ranges.size());
std::vector<query::partition_range> unwrapped;
// Each range defines both bounds.
for (auto& range : local_ranges) {
int64_t count{0};
sstables::estimated_histogram hist{0};
for (auto&& sstable : cf.select_sstables(range)) {
unwrapped.clear();
if (range.is_wrap_around(dht::ring_position_comparator(*cf.schema()))) {
auto uw = range.unwrap();
unwrapped.push_back(std::move(uw.first));
unwrapped.push_back(std::move(uw.second));
} else {
unwrapped.push_back(range);
}
for (auto&& uwr : unwrapped) {
for (auto&& sstable : cf.select_sstables(uwr)) {
count += sstable->get_estimated_key_count();
hist.merge(sstable->get_stats_metadata().estimated_row_size);
}
}
estimates.emplace_back(&range, db::system_keyspace::partition_estimates{count, count > 0 ? hist.mean() : 0});
estimates.emplace_back(db::system_keyspace::range_estimates{
range.start()->value().token(),
range.end()->value().token(),
count,
count > 0 ? hist.mean() : 0});
}
return estimates;
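The hunk above handles wrap-around token ranges by unwrapping them into two non-wrapping ranges before selecting sstables, so `select_sstables` only ever sees non-wrapping ranges. A toy sketch of the unwrap, with ints modeling tokens and 0/100 as hypothetical ring minimum/maximum:

```cpp
#include <cassert>
#include <utility>
#include <vector>

// Sketch: a range whose start exceeds its end wraps past the ring's
// maximum token; it splits into (start, max] and (min, end]. A
// non-wrapping range passes through unchanged.
static std::vector<std::pair<int,int>> unwrap(std::pair<int,int> r) {
    constexpr int ring_min = 0, ring_max = 100;
    if (r.first > r.second) {  // wraps around the ring
        return {{r.first, ring_max}, {ring_min, r.second}};
    }
    return {r};
}
```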
@@ -130,7 +146,7 @@ future<> size_estimates_recorder::record_size_estimates() {
}
future<> size_estimates_recorder::stop() {
if (get_size_estimates_recorder().local_is_initialized()) {
if (engine().cpu_id() == 0) {
service::get_local_migration_manager().unregister_listener(this);
_timer.cancel();
return _gate.close();

View File

@@ -1043,7 +1043,7 @@ void make(database& db, bool durable, bool volatile_testing_only) {
db.add_keyspace(NAME, std::move(_ks));
auto& ks = db.find_keyspace(NAME);
for (auto&& table : all_tables()) {
db.add_column_family(table, ks.make_column_family_config(*table));
db.add_column_family(table, ks.make_column_family_config(*table, db.get_config()));
}
}
@@ -1195,10 +1195,10 @@ future<int> increment_and_get_generation() {
});
}
future<> update_size_estimates(const sstring& ks_name, const sstring& cf_name, std::vector<range_estimates> estimates) {
future<> update_size_estimates(sstring ks_name, sstring cf_name, std::vector<range_estimates> estimates) {
auto&& schema = size_estimates();
auto timestamp = api::new_timestamp();
mutation m_to_apply{partition_key::from_singular(*schema, ks_name), schema};
mutation m_to_apply{partition_key::from_single_value(*schema, to_bytes(ks_name)), schema};
// delete all previous values with a single range tombstone.
auto ck = clustering_key_prefix::from_single_value(*schema, utf8_type->decompose(cf_name));
@@ -1206,28 +1206,48 @@ future<> update_size_estimates(const sstring& ks_name, const sstring& cf_name, s
// add a CQL row for each primary token range.
for (auto&& e : estimates) {
// This range has both start and end bounds. We're only interested in the tokens.
const range<dht::ring_position>* range = e.first;
auto ck = clustering_key_prefix(std::vector<bytes>{
utf8_type->decompose(cf_name),
utf8_type->decompose(dht::global_partitioner().to_sstring(range->start()->value().token())),
utf8_type->decompose(dht::global_partitioner().to_sstring(range->end()->value().token()))});
utf8_type->decompose(dht::global_partitioner().to_sstring(e.range_start_token)),
utf8_type->decompose(dht::global_partitioner().to_sstring(e.range_end_token))});
auto mean_partition_size_col = schema->get_column_definition("mean_partition_size");
auto cell = atomic_cell::make_live(timestamp, long_type->decompose(e.second.mean_partition_size), { });
auto cell = atomic_cell::make_live(timestamp, long_type->decompose(e.mean_partition_size), { });
m_to_apply.set_clustered_cell(ck, *mean_partition_size_col, std::move(cell));
auto partitions_count_col = schema->get_column_definition("partitions_count");
cell = atomic_cell::make_live(timestamp, long_type->decompose(e.second.partitions_count), { });
cell = atomic_cell::make_live(timestamp, long_type->decompose(e.partitions_count), { });
m_to_apply.set_clustered_cell(std::move(ck), *partitions_count_col, std::move(cell));
}
return service::get_local_storage_proxy().mutate_locally(std::move(m_to_apply));
}
future<> clear_size_estimates(const sstring& ks_name, const sstring& cf_name) {
future<> clear_size_estimates(sstring ks_name, sstring cf_name) {
sstring req = "DELETE FROM system.%s WHERE keyspace_name = ? AND table_name = ?";
return execute_cql(req, SIZE_ESTIMATES, ks_name, cf_name).discard_result();
return execute_cql(std::move(req), SIZE_ESTIMATES, std::move(ks_name), std::move(cf_name)).discard_result();
}
future<std::vector<range_estimates>> query_size_estimates(sstring ks_name, sstring cf_name, dht::token start_token, dht::token end_token) {
sstring req = "SELECT range_start, range_end, partitions_count, mean_partition_size FROM system.%s WHERE keyspace_name = ? AND table_name = ?";
auto query_range = range<dht::token>::make({std::move(start_token)}, {std::move(end_token)});
return execute_cql(req, SIZE_ESTIMATES, std::move(ks_name), std::move(cf_name))
.then([query_range = std::move(query_range)](::shared_ptr<cql3::untyped_result_set> result) {
std::vector<range_estimates> estimates;
for (auto&& row : *result) {
auto range_start = dht::global_partitioner().from_sstring(row.get_as<sstring>("range_start"));
auto range_end = dht::global_partitioner().from_sstring(row.get_as<sstring>("range_end"));
auto estimate_range = range<dht::token>::make({std::move(range_start)}, {std::move(range_end)});
if (query_range.contains(estimate_range, &dht::tri_compare)) {
estimates.emplace_back(range_estimates{
std::move(*estimate_range.start()).value(),
std::move(*estimate_range.end()).value(),
row.get_as<int64_t>("partitions_count"),
row.get_as<int64_t>("mean_partition_size")});
}
}
return estimates;
});
}
} // namespace system_keyspace


@@ -80,13 +80,13 @@ static constexpr auto SSTABLE_ACTIVITY = "sstable_activity";
static constexpr auto SIZE_ESTIMATES = "size_estimates";
// Partition estimates for a given range of tokens.
struct partition_estimates {
struct range_estimates {
dht::token range_start_token;
dht::token range_end_token;
int64_t partitions_count;
int64_t mean_partition_size;
};
using range_estimates = std::pair<const range<dht::ring_position>*, partition_estimates>;
extern schema_ptr hints();
extern schema_ptr batchlog();
extern schema_ptr built_indexes(); // TODO (from Cassandra): make private
@@ -572,12 +572,17 @@ future<> set_bootstrap_state(bootstrap_state state);
/**
* Writes the current partition count and size estimates into SIZE_ESTIMATES_CF
*/
future<> update_size_estimates(const sstring& ks_name, const sstring& cf_name, std::vector<range_estimates> estimates);
future<> update_size_estimates(sstring ks_name, sstring cf_name, std::vector<range_estimates> estimates);
/**
* Clears size estimates for a table (on table drop)
*/
future<> clear_size_estimates(const sstring& ks_name, const sstring& cf_name);
future<> clear_size_estimates(sstring ks_name, sstring cf_name);
/**
* Queries the size estimates within the specified range
*/
future<std::vector<range_estimates>> query_size_estimates(sstring ks_name, sstring cf_name, dht::token start_token, dht::token end_token);
} // namespace system_keyspace
} // namespace db

dist/ami/build_ami.sh

@@ -8,7 +8,7 @@ fi
print_usage() {
echo "build_ami.sh --localrpm --repo [URL]"
echo " --localrpm deploy locally built rpms"
echo " --repo specify repository URL"
echo " --repo specify .repo/.list file URL"
exit 1
}
LOCALRPM=0
@@ -16,7 +16,8 @@ while [ $# -gt 0 ]; do
case "$1" in
"--localrpm")
LOCALRPM=1
INSTALL_ARGS="$INSTALL_ARGS --localrpm"
REPO=`./scripts/scylla_current_repo`
INSTALL_ARGS="$INSTALL_ARGS --localrpm --repo $REPO"
shift 1
;;
"--repo")
@@ -52,10 +53,13 @@ if [ $LOCALRPM -eq 1 ]; then
if [ "$ID" = "centos" ]; then
rm -rf build/*
sudo yum -y install git
if [ ! -f dist/ami/files/scylla-conf.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-server.x86_64.rpm ]; then
if [ ! -f dist/ami/files/scylla.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-kernel-conf.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-conf.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-server.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-debuginfo.x86_64.rpm ]; then
dist/redhat/build_rpm.sh
cp build/rpmbuild/RPMS/x86_64/scylla-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla.x86_64.rpm
cp build/rpmbuild/RPMS/x86_64/scylla-kernel-conf-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-kernel-conf.x86_64.rpm
cp build/rpmbuild/RPMS/x86_64/scylla-conf-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-conf.x86_64.rpm
cp build/rpmbuild/RPMS/x86_64/scylla-server-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-server.x86_64.rpm
cp build/rpmbuild/RPMS/x86_64/scylla-debuginfo-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-debuginfo.x86_64.rpm
fi
if [ ! -f dist/ami/files/scylla-jmx.noarch.rpm ]; then
cd build
@@ -80,8 +84,11 @@ if [ $LOCALRPM -eq 1 ]; then
echo "Build .deb before running build_ami.sh"
exit 1
fi
cp ../scylla_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-ubuntu1_amd64.deb dist/ami/files/scylla_amd64.deb
cp ../scylla-kernel-conf_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-ubuntu1_amd64.deb dist/ami/files/scylla-kernel-conf_amd64.deb
cp ../scylla-conf_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-ubuntu1_amd64.deb dist/ami/files/scylla-conf_amd64.deb
cp ../scylla-server_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-ubuntu1_amd64.deb dist/ami/files/scylla-server_amd64.deb
cp ../scylla-server-dbg_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-ubuntu1_amd64.deb dist/ami/files/scylla-server-dbg_amd64.deb
fi
if [ ! -f dist/ami/files/scylla-jmx_all.deb ]; then
cd build


@@ -11,6 +11,14 @@ PATH=$PATH:$HOME/.local/bin:$HOME/bin
export PATH
is_supported_instance_type() {
TYPE=`curl -s http://169.254.169.254/latest/meta-data/instance-type|cut -d . -f 1`
case $TYPE in
"m3"|"c3"|"i2") echo 1;;
*) echo 0;;
esac
}
echo
echo ' _____ _ _ _____ ____ '
echo ' / ____| | | | | __ \| _ \ '
@@ -28,9 +36,11 @@ echo 'CQL Shell:'
echo ' cqlsh'
echo 'More documentation available at: '
echo ' http://www.scylladb.com/doc/'
echo 'By default, Scylla sends certain information about this node to a data collection server. For information, see http://www.scylladb.com/privacy/'
echo
. /etc/os-release
SETUP=0
if [ "$ID" != "ubuntu" ]; then
if [ "`systemctl status scylla-ami-setup|grep Active|grep exited`" = "" ]; then
@@ -71,19 +81,34 @@ else
tput sgr0
echo
else
tput setaf 1
tput bold
echo " ScyllaDB is not started!"
tput sgr0
echo "Please wait for startup. To see status of ScyllaDB, run "
if [ "$ID" = "ubuntu" ]; then
echo " 'initctl status scylla-server'"
echo "and"
echo " 'sudo cat /var/log/upstart/scylla-server.log'"
echo
if [ `is_supported_instance_type` -eq 0 ]; then
TYPE=`curl -s http://169.254.169.254/latest/meta-data/instance-type`
tput setaf 1
tput bold
echo " $TYPE is not a supported instance type!"
tput sgr0
echo -n "To continue startup ScyllaDB on this instance, run 'sudo scylla_io_setup' "
if [ "$ID" = "ubuntu" ]; then
echo "then 'initctl start scylla-server'."
else
echo "then 'systemctl start scylla-server'."
fi
echo "To run ScyllaDB on a supported instance type, launch the AMI on an m3/c3/i2 instance."
else
echo " 'systemctl status scylla-server'"
echo
tput setaf 1
tput bold
echo " ScyllaDB is not started!"
tput sgr0
echo "Please wait for startup. To see status of ScyllaDB, run "
if [ "$ID" = "ubuntu" ]; then
echo " 'initctl status scylla-server'"
echo "and"
echo " 'sudo cat /var/log/upstart/scylla-server.log'"
echo
else
echo " 'systemctl status scylla-server'"
echo
fi
fi
fi
fi


@@ -5,16 +5,22 @@
print_usage() {
echo "scylla_coredump_setup --dump-to-raiddir"
echo " --dump-to-raiddir store coredump to /var/lib/scylla"
echo " --compress enable compress on systemd-coredump"
exit 1
}
SYMLINK=0
COMPRESS=no
while [ $# -gt 0 ]; do
case "$1" in
"--dump-to-raiddir")
SYMLINK=1
shift 1
;;
"--compress")
COMPRESS=yes
shift 1
;;
*)
print_usage
;;
@@ -33,12 +39,13 @@ else
cat << EOS > /etc/systemd/coredump.conf
[Coredump]
Storage=external
Compress=yes
Compress=$COMPRESS
ProcessSizeMax=1024G
ExternalSizeMax=1024G
EOS
if [ $SYMLINK = 1 ]; then
rm -rf /var/lib/systemd/coredump
mkdir -p /var/lib/scylla/coredump
ln -sf /var/lib/scylla/coredump /var/lib/systemd/coredump
fi
systemctl daemon-reload


@@ -33,13 +33,7 @@ elif [ "$NETWORK_MODE" = "dpdk" ]; then
done
else # NETWORK_MODE = posix
if [ "$SET_NIC" = "yes" ]; then
NRXQ=`find /sys/class/net/$IFNAME/queues -name "rx-*"|wc -l`
if [ $NRXQ -gt 1 ]; then
CONF_ARGS=-mq
else
CONF_ARGS=-sq
fi
/usr/lib/scylla/posix_net_conf.sh $IFNAME $CONF_ARGS
/usr/lib/scylla/posix_net_conf.sh $IFNAME
fi
fi
if [ "$ID" = "ubuntu" ]; then


@@ -205,20 +205,30 @@ fi
if [ $INTERACTIVE -eq 1 ]; then
interactive_ask_service "Do you want to enable ScyllaDB services?" "Answer yes to automatically start Scylla when the node boots; answer no to skip this step." "yes" &&:
ENABLE_SERVICE=$?
if [ $ENABLE_SERVICE -eq 1 ]; then
if [ $ENABLE_SERVICE -eq 1 ] && [ ! -f /etc/scylla.d/housekeeping.cfg ]; then
interactive_ask_service "Do you want to enable ScyllaDB version check?" "Answer yes to automatically start scylla-housekeeping, which checks for a newer version, when the node boots; answer no to skip this step." "yes" &&:
ENABLE_CHECK_VERSION=$?
fi
fi
if [ $ENABLE_SERVICE -eq 1 ]; then
if [ "$ID" = "fedora" ] || [ "$ID" = "centos" ]; then
if [ "$ID" = "fedora" ] || [ "$ID" = "centos" ] || [ "$ID" = "ubuntu" -a "$VERSION_ID" != "14.04" ]; then
systemctl enable scylla-server.service
systemctl enable collectd.service
if [ $ENABLE_CHECK_VERSION -eq 1 ]; then
systemctl unmask scylla-housekeeping.timer
else
systemctl mask scylla-housekeeping.timer
systemctl stop scylla-housekeeping.timer || true
fi
fi
if [ $ENABLE_CHECK_VERSION -eq 1 ]; then
if [ ! -f /etc/scylla.d/housekeeping.cfg ]; then
printf "[housekeeping]\ncheck-version: True\n" > /etc/scylla.d/housekeeping.cfg
fi
else
if [ ! -f /etc/scylla.d/housekeeping.cfg ]; then
printf "[housekeeping]\ncheck-version: False\n" > /etc/scylla.d/housekeeping.cfg
fi
fi
fi


@@ -73,6 +73,24 @@ done
echo Setting parameters on $SYSCONFIG/scylla-server
if [ $SET_NIC = "yes" ]; then
RPS_CPUS=$(/usr/lib/scylla/posix_net_conf.sh --cpu-mask $NIC)
RPS_CPUS=${RPS_CPUS//0x}
RPS_CPUS=${RPS_CPUS//,}
if [ "$RPS_CPUS" != "" ]; then
BITS=$(echo "obase=2;ibase=16;${RPS_CPUS~~}"|bc|rev)
for ((i=0; i < ${#BITS}; i++)); do
if [ ${BITS:$i:1} -eq 1 ]; then
CPUSET="$CPUSET$i"
if [ $i -lt $((${#BITS} - 1)) ]; then
CPUSET="$CPUSET,"
fi
fi
done
/usr/lib/scylla/scylla_cpuset_setup --cpuset $CPUSET
fi
fi
ETHDRV=`/usr/lib/scylla/dpdk_nic_bind.py --status | grep if=$NIC | sed -e "s/^.*drv=//" -e "s/ .*$//"`
ETHPCIID=`/usr/lib/scylla/dpdk_nic_bind.py --status | grep if=$NIC | awk '{print $1}'`
sed -e s#^NETWORK_MODE=.*#NETWORK_MODE=$NETWORK_MODE# \
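The loop above converts a hex RPS CPU mask into a comma-separated cpuset list using `bc` and `rev`. The same transformation can be sketched in Python; the `mask_to_cpuset` name is illustrative and not part of the Scylla scripts:

```python
def mask_to_cpuset(mask_hex: str) -> str:
    """Convert a hex CPU mask (e.g. "0x5" or "f") into a comma-separated
    CPU list (e.g. "0,2"), mirroring the bc/rev bit-scanning loop."""
    # Strip the 0x prefix and any comma word separators, as the shell code does
    mask_hex = mask_hex.lower().replace("0x", "").replace(",", "")
    value = int(mask_hex, 16)
    # Bit i set in the mask means CPU i is selected
    cpus = [str(i) for i in range(value.bit_length()) if value >> i & 1]
    return ",".join(cpus)
```

For example, `mask_to_cpuset("0x5")` yields `"0,2"`, the list then passed to `scylla_cpuset_setup --cpuset`.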


@@ -1,11 +1,12 @@
[Unit]
Description=Scylla Housekeeping
After=network.target
[Service]
Type=simple
User=scylla
Group=scylla
ExecStart=/usr/lib/scylla/scylla-Housekeeping -q version
ExecStart=/usr/lib/scylla/scylla-housekeeping -q -c /etc/scylla.d/housekeeping.cfg version
[Install]
WantedBy=multi-user.target


@@ -4,6 +4,7 @@ After=scylla-server.service
BindsTo=scylla-server.service
[Timer]
OnBootSec=0
OnUnitActiveSec=1d
[Install]


@@ -1,5 +1,6 @@
[Unit]
Description=Scylla Server
After=network.target
Wants=scylla-jmx.service
Wants=scylla-housekeeping.timer
@@ -12,7 +13,6 @@ LimitAS=infinity
LimitNPROC=8096
EnvironmentFile=@@SYSCONFDIR@@/scylla-server
EnvironmentFile=/etc/scylla.d/*.conf
WorkingDirectory=$SCYLLA_HOME
ExecStartPre=/usr/lib/scylla/scylla_prepare
ExecStart=/usr/bin/scylla $SCYLLA_ARGS $SEASTAR_IO $DEV_MODE $CPUSET
ExecStopPost=/usr/lib/scylla/scylla_stop


@@ -2,24 +2,42 @@ FROM centos:7
MAINTAINER Avi Kivity <avi@cloudius-systems.com>
RUN curl http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo -o /etc/yum.repos.d/scylla.repo
#enable systemd
ENV container docker
VOLUME [ "/sys/fs/cgroup" ]
#install scylla
RUN curl http://downloads.scylladb.com/rpm/centos/scylla-1.3.repo -o /etc/yum.repos.d/scylla.repo
RUN yum -y install epel-release
RUN yum -y clean expire-cache
RUN yum -y update
RUN yum -y remove boost-thread boost-system
RUN yum -y install scylla hostname
RUN yum -y install scylla hostname supervisor
RUN yum clean all
ADD start-scylla /start-scylla
RUN chown scylla /start-scylla
#install python3 for our main script
RUN yum -y install python34 python34-PyYAML
ADD bashrc /var/lib/scylla/.bashrc
RUN chown scylla /var/lib/scylla/.bashrc
RUN chown -R scylla:scylla /etc/scylla
RUN chown -R scylla:scylla /etc/scylla.d
ADD scylla_bashrc /scylla_bashrc
RUN cat /scylla_bashrc >> /etc/bashrc
# Scylla configuration:
ADD etc/sysconfig/scylla-server /etc/sysconfig/scylla-server
# Supervisord configuration:
ADD etc/supervisord.conf /etc/supervisord.conf
RUN mkdir -p /etc/supervisor.conf.d
ADD etc/supervisord.conf.d/scylla-server.conf /etc/supervisord.conf.d/scylla-server.conf
ADD etc/supervisord.conf.d/scylla-jmx.conf /etc/supervisord.conf.d/scylla-jmx.conf
RUN mkdir -p /var/log/scylla
ADD scylla-service.sh /scylla-service.sh
ADD scylla-jmx-service.sh /scylla-jmx-service.sh
ADD scyllasetup.py /scyllasetup.py
ADD commandlineparser.py /commandlineparser.py
ADD docker-entrypoint.py /docker-entrypoint.py
ENTRYPOINT ["/docker-entrypoint.py"]
USER scylla
EXPOSE 10000 9042 9160 7000 7001
VOLUME /var/lib/scylla
CMD /start-scylla && /bin/bash
VOLUME [ "/var/lib/scylla" ]
RUN chown -R scylla.scylla /var/lib/scylla


@@ -1,20 +0,0 @@
echo
echo ' _____ _ _ _____ ____ '
echo ' / ____| | | | | __ \| _ \ '
echo ' | (___ ___ _ _| | | __ _| | | | |_) |'
echo ' \___ \ / __| | | | | |/ _` | | | | _ < '
echo ' ____) | (__| |_| | | | (_| | |__| | |_) |'
echo ' |_____/ \___|\__, |_|_|\__,_|_____/|____/ '
echo ' __/ | '
echo ' |___/ '
echo ''
echo ''
echo 'Nodetool:'
echo ' nodetool help'
echo 'CQL Shell:'
echo ' cqlsh'
echo 'More documentation available at: '
echo ' http://www.scylladb.com/doc/'
echo
export CQLSH_HOST=$(hostname -i)

dist/docker/redhat/commandlineparser.py vendored Normal file

@@ -0,0 +1,14 @@
import argparse
def parse():
parser = argparse.ArgumentParser()
parser.add_argument('--developer-mode', default='1', choices=['0', '1'], dest='developerMode')
parser.add_argument('--seeds', default=None, help="specify seeds - if left empty will use container's own IP")
parser.add_argument('--cpuset', default=None, help="e.g. --cpuset 0-3 for the first four CPUs")
parser.add_argument('--smp', default=None, help="e.g --smp 2 to use two CPUs")
parser.add_argument('--memory', default=None, help="e.g. --memory 1G to use 1 GB of RAM")
parser.add_argument('--overprovisioned', default='0', choices=['0', '1'], help="run in overprovisioned environment")
parser.add_argument('--broadcast-address', default=None, dest='broadcastAddress')
parser.add_argument('--broadcast-rpc-address', default=None, dest='broadcastRpcAddress')
return parser.parse_args()
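The parser above maps dash-separated flags onto camelCase attributes through argparse's `dest=`. A minimal standalone reproduction using only stdlib argparse (the two-flag subset is for illustration):

```python
import argparse

# Reproduce a subset of the container's parser: dash-separated flags are
# stored under camelCase attribute names via dest=.
parser = argparse.ArgumentParser()
parser.add_argument('--developer-mode', default='1', choices=['0', '1'], dest='developerMode')
parser.add_argument('--smp', default=None)

args = parser.parse_args(['--developer-mode', '0', '--smp', '2'])
print(args.developerMode)  # -> 0
print(args.smp)            # -> 2
```

Note that all values stay strings; the setup code passes them through to the Scylla helper scripts unchanged.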

dist/docker/redhat/docker-entrypoint.py vendored Executable file

@@ -0,0 +1,20 @@
#!/usr/bin/env python3
import os
import scyllasetup
import logging
import commandlineparser
logging.basicConfig(filename="/var/log/scylla/docker-entrypoint.log", level=logging.DEBUG, format="%(message)s")
try:
arguments = commandlineparser.parse()
setup = scyllasetup.ScyllaSetup(arguments)
setup.developerMode()
setup.cpuSet()
setup.io()
setup.scyllaYAML()
setup.cqlshrc()
setup.arguments()
os.system("/usr/bin/supervisord -c /etc/supervisord.conf")
except:
logging.exception('failed!')

dist/docker/redhat/etc/supervisord.conf vendored Normal file

@@ -0,0 +1,13 @@
[supervisord]
nodaemon=true
[inet_http_server]
port = 127.0.0.1:9001
[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
[supervisorctl]
[include]
files = /etc/supervisord.conf.d/*.conf


@@ -0,0 +1,6 @@
[program:scylla-jmx]
command=/scylla-jmx-service.sh
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0


@@ -0,0 +1,6 @@
[program:scylla]
command=/scylla-service.sh
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0


@@ -0,0 +1,41 @@
# choose following mode: virtio, dpdk, posix
NETWORK_MODE=posix
# tap device name(virtio)
TAP=tap0
# bridge device name (virtio)
BRIDGE=virbr0
# ethernet device name
IFNAME=eth0
# setup NIC's interrupts, RPS, XPS (posix)
SET_NIC=no
# ethernet device driver (dpdk)
ETHDRV=
# ethernet device PCI ID (dpdk)
ETHPCIID=
# number of hugepages
NR_HUGEPAGES=64
# user for process (must be root for dpdk)
USER=scylla
# group for process
GROUP=scylla
# scylla home dir
SCYLLA_HOME=/var/lib/scylla
# scylla config dir
SCYLLA_CONF=/etc/scylla
# scylla arguments
SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --collectd-address=127.0.0.1:25826 --collectd=1 --collectd-poll-period 3000 --network-stack posix"
# setup as AMI instance
AMI=no

dist/docker/redhat/scylla-jmx-service.sh vendored Executable file

@@ -0,0 +1,5 @@
#!/bin/bash
source /etc/sysconfig/scylla-jmx
exec /usr/lib/scylla/jmx/scylla-jmx -l /usr/lib/scylla/jmx

dist/docker/redhat/scylla-service.sh vendored Executable file

@@ -0,0 +1,7 @@
#!/bin/bash
. /usr/lib/scylla/scylla_prepare
export SCYLLA_HOME SCYLLA_CONF
exec /usr/bin/scylla $SCYLLA_ARGS $SEASTAR_IO $DEV_MODE $CPUSET $SCYLLA_DOCKER_ARGS

dist/docker/redhat/scylla_bashrc vendored Normal file

@@ -0,0 +1,18 @@
echo > /dev/stderr
echo ' _____ _ _ _____ ____ ' > /dev/stderr
echo ' / ____| | | | | __ \| _ \ ' > /dev/stderr
echo ' | (___ ___ _ _| | | __ _| | | | |_) |' > /dev/stderr
echo ' \___ \ / __| | | | | |/ _` | | | | _ < ' > /dev/stderr
echo ' ____) | (__| |_| | | | (_| | |__| | |_) |' > /dev/stderr
echo ' |_____/ \___|\__, |_|_|\__,_|_____/|____/ ' > /dev/stderr
echo ' __/ | ' > /dev/stderr
echo ' |___/ ' > /dev/stderr
echo '' > /dev/stderr
echo '' > /dev/stderr
echo 'Nodetool:' > /dev/stderr
echo ' nodetool help' > /dev/stderr
echo 'CQL Shell:' > /dev/stderr
echo ' cqlsh' > /dev/stderr
echo 'More documentation available at: ' > /dev/stderr
echo ' http://www.scylladb.com/doc/' > /dev/stderr
echo > /dev/stderr

dist/docker/redhat/scyllasetup.py vendored Normal file

@@ -0,0 +1,68 @@
import subprocess
import logging
import yaml
import os
class ScyllaSetup:
def __init__(self, arguments):
self._developerMode = arguments.developerMode
self._seeds = arguments.seeds
self._cpuset = arguments.cpuset
self._broadcastAddress = arguments.broadcastAddress
self._broadcastRpcAddress = arguments.broadcastRpcAddress
self._smp = arguments.smp
self._memory = arguments.memory
self._overprovisioned = arguments.overprovisioned
def _run(self, *args, **kwargs):
logging.info('running: {}'.format(args))
subprocess.check_call(*args, **kwargs)
def developerMode(self):
self._run(['/usr/lib/scylla/scylla_dev_mode_setup', '--developer-mode', self._developerMode])
def cpuSet(self):
if self._cpuset is None:
return
self._run(['/usr/lib/scylla/scylla_cpuset_setup', '--cpuset', self._cpuset])
def io(self):
self._run(['/usr/lib/scylla/scylla_io_setup'])
def scyllaYAML(self):
configuration = yaml.load(open('/etc/scylla/scylla.yaml'))
IP = subprocess.check_output(['hostname', '-i']).decode('ascii').strip()
configuration['listen_address'] = IP
configuration['rpc_address'] = IP
if self._seeds is None:
if self._broadcastAddress is not None:
self._seeds = self._broadcastAddress
else:
self._seeds = IP
configuration['seed_provider'] = [
{'class_name': 'org.apache.cassandra.locator.SimpleSeedProvider',
'parameters': [{'seeds': self._seeds}]}
]
if self._broadcastAddress is not None:
configuration['broadcast_address'] = self._broadcastAddress
if self._broadcastRpcAddress is not None:
configuration['broadcast_rpc_address'] = self._broadcastRpcAddress
with open('/etc/scylla/scylla.yaml', 'w') as file:
yaml.dump(configuration, file)
def cqlshrc(self):
home = os.environ['HOME']
hostname = subprocess.check_output(['hostname', '-i']).decode('ascii').strip()
with open("%s/.cqlshrc" % home, "w") as cqlshrc:
cqlshrc.write("[connection]\nhostname = %s\n" % hostname)
def arguments(self):
args = ""
if self._memory is not None:
args += "--memory %s" % self._memory
if self._smp is not None:
args += " --smp %s" % self._smp
if self._overprovisioned == "1":
args += " --overprovisioned"
with open("/etc/scylla.d/docker.conf", "w") as cqlshrc:
cqlshrc.write("SCYLLA_DOCKER_ARGS=\"%s\"\n" % args)
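The `arguments()` method concatenates the optional flags into the single `SCYLLA_DOCKER_ARGS` line written to `/etc/scylla.d/docker.conf`. A standalone sketch of that assembly (the `build_args` name is mine, not in the source):

```python
def build_args(memory=None, smp=None, overprovisioned="0"):
    """Mirror ScyllaSetup.arguments(): build the SCYLLA_DOCKER_ARGS line
    written to /etc/scylla.d/docker.conf."""
    args = ""
    if memory is not None:
        args += "--memory %s" % memory
    if smp is not None:
        args += " --smp %s" % smp
    if overprovisioned == "1":
        args += " --overprovisioned"
    return 'SCYLLA_DOCKER_ARGS="%s"\n' % args
```

This faithfully reproduces a small quirk of the original: when `--memory` is unset but other flags are given, the string carries a leading space (harmless to the shell that sources it).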


@@ -1,54 +0,0 @@
#!/bin/sh
. /etc/sysconfig/scylla-server
CPUSET=""
if [ x"$SCYLLA_CPU_SET" != "x" ]; then
CPUSET="--cpuset $SCYLLA_CPU_SET"
fi
if [ "$SCYLLA_PRODUCTION" == "true" ]; then
DEV_MODE=""
if [ ! -f /var/lib/scylla/.io_setup_done ]; then
DATA_DIR=`/usr/lib/scylla/scylla_config_get.py --config $SCYLLA_CONF/scylla.yaml --get data_file_directories|head -n1`
iotune --evaluation-directory $DATA_DIR --format envfile --options-file /var/lib/scylla/io.conf $CPUSET --timeout 600
if [ $? -ne 0 ]; then
echo "/var/lib/scylla did not pass validation tests, it may not be on XFS and/or has limited disk space."
echo "This is a non-supported setup, please bind mount an XFS volume."
exit 1
fi
touch /var/lib/scylla/.io_setup_done
fi
source /var/lib/scylla/io.conf
else
DEV_MODE="--developer-mode true"
fi
IP=$(hostname -i)
if [ x"$SCYLLA_SEEDS" != "x" ];then
SEEDS="$SCYLLA_SEEDS"
else
SEEDS="$IP"
fi
sed -e "s/seeds:.*/seeds: $SEEDS/g" /var/lib/scylla/conf/scylla.yaml > $HOME/scylla.yaml
if [ x"$SCYLLA_BROADCAST_ADDRESS" != "x" ];then
sed -i "s/.*broadcast_address:.*/broadcast_address: $SCYLLA_BROADCAST_ADDRESS/g" $HOME/scylla.yaml
fi
/usr/bin/scylla --log-to-syslog 1 \
--log-to-stdout 0 \
$DEV_MODE \
$SEASTAR_IO \
$CPUSET \
--default-log-level info \
--options-file $HOME/scylla.yaml \
--listen-address $IP \
--rpc-address $IP \
--network-stack posix &> /dev/null &
source /etc/sysconfig/scylla-jmx
export SCYLLA_HOME SCYLLA_CONF
exec /usr/lib/scylla/jmx/scylla-jmx -l /usr/lib/scylla/jmx &> /dev/null &


@@ -5,10 +5,12 @@ print_usage() {
echo "build_rpm.sh --rebuild-dep --jobs 2"
echo " --rebuild-dep rebuild dependency packages (CentOS)"
echo " --jobs specify number of jobs"
echo " --dist create a public distribution rpm"
exit 1
}
REBUILD=0
JOBS=0
DIST=0
while [ $# -gt 0 ]; do
case "$1" in
"--rebuild-dep")
@@ -19,6 +21,10 @@ while [ $# -gt 0 ]; do
JOBS=$2
shift 2
;;
"--dist")
DIST=1
shift 1
;;
*)
print_usage
;;
@@ -62,6 +68,13 @@ rm -f version
cp dist/redhat/scylla.spec.in $RPMBUILD/SPECS/scylla.spec
sed -i -e "s/@@VERSION@@/$SCYLLA_VERSION/g" $RPMBUILD/SPECS/scylla.spec
sed -i -e "s/@@RELEASE@@/$SCYLLA_RELEASE/g" $RPMBUILD/SPECS/scylla.spec
if [ $DIST -gt 0 ]; then
sed -i -e "s/@@HOUSEKEEPING_CONF@@/true/g" $RPMBUILD/SPECS/scylla.spec
else
sed -i -e "s/@@HOUSEKEEPING_CONF@@/false/g" $RPMBUILD/SPECS/scylla.spec
fi
if [ "$ID" = "fedora" ]; then
if [ $JOBS -gt 0 ]; then
rpmbuild -bs --define "_topdir $RPMBUILD" --define "_smp_mflags -j$JOBS" $RPMBUILD/SPECS/scylla.spec


@@ -30,7 +30,7 @@ URL: http://www.scylladb.com/
BuildRequires: libaio-devel libstdc++-devel cryptopp-devel hwloc-devel numactl-devel libpciaccess-devel libxml2-devel zlib-devel thrift-devel yaml-cpp-devel lz4-devel snappy-devel jsoncpp-devel systemd-devel xz-devel openssl-devel libcap-devel libselinux-devel libgcrypt-devel libgpg-error-devel elfutils-devel krb5-devel libcom_err-devel libattr-devel pcre-devel elfutils-libelf-devel bzip2-devel keyutils-libs-devel xfsprogs-devel make gnutls-devel systemd-devel lksctp-tools-devel
%{?fedora:BuildRequires: boost-devel ninja-build ragel antlr3-tool antlr3-C++-devel python3 gcc-c++ libasan libubsan python3-pyparsing dnf-yum}
%{?rhel:BuildRequires: scylla-libstdc++-static scylla-boost-devel scylla-ninja-build scylla-ragel scylla-antlr3-tool scylla-antlr3-C++-devel python34 scylla-gcc-c++ >= 5.1.1, python34-pyparsing}
Requires: scylla-conf systemd-libs hwloc collectd PyYAML python-urwid
Requires: scylla-conf systemd-libs hwloc collectd PyYAML python-urwid pyparsing python-requests curl bc util-linux
Conflicts: abrt
%description server
@@ -42,6 +42,7 @@ This package contains ScyllaDB server.
%{nil}
%build
%define is_housekeeping_conf %( if @@HOUSEKEEPING_CONF@@; then echo "1" ; else echo "0"; fi )
%if 0%{?fedora}
./configure.py --disable-xen --enable-dpdk --mode=release
%endif
@@ -50,8 +51,6 @@ python3.4 ./configure.py --disable-xen --enable-dpdk --mode=release --static-std
%endif
ninja-build %{?_smp_mflags} build/release/scylla build/release/iotune
cp dist/common/systemd/scylla-server.service.in build/scylla-server.service
cp dist/common/systemd/scylla-housekeeping.service.in build/scylla-housekeeping.service
cp dist/common/systemd/scylla-housekeeping.timer.in build/scylla-housekeeping.timer
sed -i -e "s#@@SYSCONFDIR@@#/etc/sysconfig#g" build/scylla-server.service
%install
@@ -77,7 +76,8 @@ install -d -m755 $RPM_BUILD_ROOT%{_sysconfdir}/scylla
install -m644 conf/scylla.yaml $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
install -m644 conf/cassandra-rackdc.properties $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
install -m644 build/*.service $RPM_BUILD_ROOT%{_unitdir}/
install -m644 build/*.timer $RPM_BUILD_ROOT%{_unitdir}/
install -m644 dist/common/systemd/*.service $RPM_BUILD_ROOT%{_unitdir}/
install -m644 dist/common/systemd/*.timer $RPM_BUILD_ROOT%{_unitdir}/
install -m755 dist/common/scripts/* $RPM_BUILD_ROOT%{_prefix}/lib/scylla/
install -m755 seastar/scripts/posix_net_conf.sh $RPM_BUILD_ROOT%{_prefix}/lib/scylla/
install -m755 seastar/dpdk/tools/dpdk_nic_bind.py $RPM_BUILD_ROOT%{_prefix}/lib/scylla/
@@ -85,6 +85,9 @@ install -m755 build/release/scylla $RPM_BUILD_ROOT%{_bindir}
install -m755 build/release/iotune $RPM_BUILD_ROOT%{_bindir}
install -m755 dist/common/bin/scyllatop $RPM_BUILD_ROOT%{_bindir}
install -m755 scylla-housekeeping $RPM_BUILD_ROOT%{_prefix}/lib/scylla/
if @@HOUSEKEEPING_CONF@@; then
install -m644 conf/housekeeping.cfg $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
fi
install -d -m755 $RPM_BUILD_ROOT%{_docdir}/scylla
install -m644 README.md $RPM_BUILD_ROOT%{_docdir}/scylla/
install -m644 README-DPDK.md $RPM_BUILD_ROOT%{_docdir}/scylla/
@@ -202,6 +205,10 @@ mv /tmp/scylla.yaml /etc/scylla/scylla.yaml
%attr(0755,root,root) %dir %{_sysconfdir}/scylla
%config(noreplace) %{_sysconfdir}/scylla/scylla.yaml
%config(noreplace) %{_sysconfdir}/scylla/cassandra-rackdc.properties
%if %is_housekeeping_conf
%config(noreplace) %{_sysconfdir}/scylla/housekeeping.cfg
%endif
%package kernel-conf
Group: Applications/Databases


@@ -2,16 +2,22 @@
print_usage() {
echo "build_deb.sh --rebuild-dep"
echo " --dist create a public distribution package"
echo " --rebuild-dep rebuild dependency packages"
exit 1
}
REBUILD=0
DIST=0
while [ $# -gt 0 ]; do
case "$1" in
"--rebuild-dep")
REBUILD=1
shift 1
;;
"--dist")
DIST=1
shift 1
;;
*)
print_usage
;;
@@ -74,13 +80,19 @@ if [ "$RELEASE" = "14.04" ]; then
sed -i -e "s/@@COMPILER@@/g++-5/g" debian/rules
sed -i -e "s/@@BUILD_DEPENDS@@/g++-5/g" debian/control
sed -i -e "s#@@INSTALL@@#dist/ubuntu/sudoers.d/scylla etc/sudoers.d#g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER@@##g" debian/scylla-server.install
else
sed -i -e "s/@@DH_INSTALLINIT@@//g" debian/rules
sed -i -e "s/@@COMPILER@@/g++/g" debian/rules
sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++/g" debian/control
sed -i -e "s#@@INSTALL@@##g" debian/scylla-server.install
sed -i -e "s#@@HKDOTTIMER@@#dist/common/systemd/scylla-housekeeping.timer /lib/systemd/system#g" debian/scylla-server.install
fi
if [ $DIST -gt 0 ]; then
sed -i -e "s#@@ADDHKCFG@@#conf/housekeeping.cfg etc/scylla.d/#g" debian/scylla-server.install
else
sed -i -e "s#@@ADDHKCFG@@##g" debian/scylla-server.install
fi
if [ "$DISTRIBUTION" = "Ubuntu" ]; then
sed -i -e "s/@@DEPENDS@@/hugepages, /g" debian/control
else
@@ -89,6 +101,7 @@ fi
cp dist/common/systemd/scylla-server.service.in debian/scylla-server.service
sed -i -e "s#@@SYSCONFDIR@@#/etc/default#g" debian/scylla-server.service
cp dist/common/systemd/scylla-housekeeping.service debian/scylla-server.scylla-housekeeping.service
if [ "$RELEASE" = "14.04" ] && [ $REBUILD -eq 0 ]; then
if [ ! -f /etc/apt/sources.list.d/scylla-3rdparty-trusty.list ]; then


@@ -16,7 +16,7 @@ Conflicts: scylla-server (<< 1.1)
Package: scylla-server
Architecture: amd64
Depends: ${shlibs:Depends}, ${misc:Depends}, adduser, hwloc-nox, collectd, scylla-conf, python-yaml, python-urwid, python3-requests, @@DEPENDS@@
Depends: ${shlibs:Depends}, ${misc:Depends}, adduser, hwloc-nox, collectd, scylla-conf, python-yaml, python-urwid, python-requests, curl, bc, util-linux, @@DEPENDS@@
Description: Scylla database server binaries
Scylla is a highly scalable, eventually consistent, distributed,
partitioned row DB.


@@ -11,7 +11,7 @@ override_dh_auto_clean:
override_dh_installinit:
dh_installinit --no-start @@DH_INSTALLINIT@@
dh_installinit --no-start --name scylla-timer @@DH_INSTALLINIT@@
dh_installinit --no-start --name scylla-housekeeping @@DH_INSTALLINIT@@
override_dh_strip:
dh_strip --dbg-package=scylla-server-dbg


@@ -14,4 +14,6 @@ build/release/scylla usr/bin
build/release/iotune usr/bin
dist/common/bin/scyllatop usr/bin
dist/common/sbin/* usr/sbin
@@ADDHKCFG@@
@@HKDOTTIMER@@
@@INSTALL@@

docs/docker-hub.md Normal file

@@ -0,0 +1,191 @@
# What is ScyllaDB?
ScyllaDB is a high-performance NoSQL database system, fully compatible with Apache Cassandra.
ScyllaDB is released under the GNU Affero General Public License version 3 and the Apache License; ScyllaDB is free and open-source software.
> [ScyllaDB](http://www.scylladb.com/)
![logo](http://www.scylladb.com/img/mascot_1.png)
# How to use this image
## Start a `scylla` server instance
```console
$ docker run --name some-scylla -d scylladb/scylla
```
## Run `nodetool` utility
```console
$ docker exec -it some-scylla nodetool status
Datacenter: datacenter1
=======================
Status=Up/Down
|/ State=Normal/Leaving/Joining/Moving
-- Address Load Tokens Owns (effective) Host ID Rack
UN 172.17.0.2 125.51 KB 256 100.0% c9155121-786d-44f8-8667-a8b915b95665 rack1
```
## Run `cqlsh` utility
```console
$ docker exec -it some-scylla cqlsh
Connected to Test Cluster at 172.17.0.2:9042.
[cqlsh 5.0.1 | Cassandra 2.1.8 | CQL spec 3.2.1 | Native protocol v3]
Use HELP for help.
cqlsh>
```
## Make a cluster
```console
$ docker run --name some-scylla2 -d scylladb/scylla --seeds="$(docker inspect --format='{{ .NetworkSettings.IPAddress }}' some-scylla)"
```
## Check `scylla` logs
```console
$ docker logs some-scylla | tail
INFO 2016-08-04 06:57:40,836 [shard 5] database - Setting compaction strategy of system_traces.events to SizeTieredCompactionStrategy
INFO 2016-08-04 06:57:40,836 [shard 3] database - Setting compaction strategy of system_traces.events to SizeTieredCompactionStrategy
INFO 2016-08-04 06:57:40,836 [shard 1] database - Setting compaction strategy of system_traces.events to SizeTieredCompactionStrategy
INFO 2016-08-04 06:57:40,836 [shard 2] database - Setting compaction strategy of system_traces.events to SizeTieredCompactionStrategy
INFO 2016-08-04 06:57:40,836 [shard 4] database - Setting compaction strategy of system_traces.events to SizeTieredCompactionStrategy
INFO 2016-08-04 06:57:40,836 [shard 7] database - Setting compaction strategy of system_traces.events to SizeTieredCompactionStrategy
INFO 2016-08-04 06:57:40,837 [shard 6] database - Setting compaction strategy of system_traces.events to SizeTieredCompactionStrategy
INFO 2016-08-04 06:57:40,839 [shard 0] database - Schema version changed to fea14d93-9c5a-34f5-9d0e-2e49dcfa747e
INFO 2016-08-04 06:57:40,839 [shard 0] storage_service - Starting listening for CQL clients on 172.17.0.2:9042...
INFO 2016-08-04 06:57:40,840 [shard 0] storage_service - Thrift server listening on 172.17.0.2:9160 ...
```
## Configuring data volume for storage
You can use Docker volumes to improve the performance of Scylla.
Create a Scylla data directory ``/var/lib/scylla`` on the host, which is used by the Scylla container to store all data:
```console
$ sudo mkdir -p /var/lib/scylla/data /var/lib/scylla/commitlog
```
Launch Scylla using Docker's ``--volume`` command line option to mount the created host directory as a data volume in the container, and disable Scylla's developer mode so that I/O tuning runs before the Scylla node starts up.
```console
$ docker run --name some-scylla --volume /var/lib/scylla:/var/lib/scylla -d scylladb/scylla --developer-mode=0
```
## Configuring resource limits
Scylla utilizes all CPUs and all memory by default.
To configure resource limits for your Docker container, you can use the `--smp`, `--memory`, and `--cpuset` command line options documented in the section "Command-line options".
If you run multiple Scylla instances on the same machine, it is highly recommended that you enable the `--overprovisioned` command line option, which enables certain optimizations for Scylla to run efficiently in an overprovisioned environment.
## Command-line options
The Scylla image supports many command line options that are passed to the `docker run` command.
### `--seeds SEEDS`
The `--seeds` command line option configures Scylla's seed nodes.
If no `--seeds` option is specified, Scylla uses its own IP address as the seed.
For example, to configure Scylla to run with two seed nodes `192.168.0.100` and `192.168.0.200`:
```console
$ docker run --name some-scylla -d scylladb/scylla --seeds 192.168.0.100,192.168.0.200
```
### `--broadcast-address ADDR`
The `--broadcast-address` command line option configures the IP address the Scylla instance tells other Scylla nodes in the cluster to connect to.
For example, to configure Scylla to use broadcast address `10.0.0.5`:
```console
$ docker run --name some-scylla -d scylladb/scylla --broadcast-address 10.0.0.5
```
### `--broadcast-rpc-address ADDR`
The `--broadcast-rpc-address` command line option configures the IP address the Scylla instance tells clients to connect to.
For example, to configure Scylla to use broadcast RPC address `10.0.0.5`:
```console
$ docker run --name some-scylla -d scylladb/scylla --broadcast-rpc-address 10.0.0.5
```
### `--smp COUNT`
The `--smp` command line option restricts Scylla to `COUNT` number of CPUs.
The option does not, however, mandate a specific placement of CPUs.
See the `--cpuset` command line option if you need Scylla to run on specific CPUs.
For example, to restrict Scylla to 2 CPUs:
```console
$ docker run --name some-scylla -d scylladb/scylla --smp 2
```
### `--memory AMOUNT`
The `--memory` command line option restricts Scylla to use up to `AMOUNT` of memory.
The `AMOUNT` value supports both `M` unit for megabytes and `G` unit for gigabytes.
For example, to restrict Scylla to 4 GB of memory:
```console
$ docker run --name some-scylla -d scylladb/scylla --memory 4G
```
### `--overprovisioned ENABLE`
The `--overprovisioned` command line option enables or disables optimizations for running Scylla in an overprovisioned environment.
If no `--overprovisioned` option is specified, Scylla defaults to running with optimizations *disabled*.
For example, to enable optimizations for running in an overprovisioned environment:
```console
$ docker run --name some-scylla -d scylladb/scylla --overprovisioned 1
```
### `--cpuset CPUSET`
The `--cpuset` command line option restricts Scylla to run only on the CPUs specified by `CPUSET`.
The `CPUSET` value is a single CPU (e.g. `--cpuset 1`), a range (e.g. `--cpuset 2-3`), a list (e.g. `--cpuset 1,2,5`), or a combination of the last two (e.g. `--cpuset 1-2,5`).
For example, to restrict Scylla to run on physical CPUs 0 to 2 and 4:
```console
$ docker run --name some-scylla -d scylladb/scylla --cpuset 0-2,4
```
### `--developer-mode ENABLE`
The `--developer-mode` command line option enables Scylla's developer mode, which relaxes checks for things like XFS and enables Scylla to run on unsupported configurations (which usually results in suboptimal performance).
If no `--developer-mode` command line option is specified, Scylla defaults to running with developer mode *enabled*.
It is highly recommended to disable developer mode for production deployments to ensure Scylla is able to run with maximum performance.
For example, to disable developer mode:
```console
$ docker run --name some-scylla -d scylladb/scylla --developer-mode 0
```
# User Feedback
## Issues
For bug reports, please use Scylla's [issue tracker](https://github.com/scylladb/scylla/issues) on GitHub.
Please read the [How to report a Scylla problem](https://github.com/scylladb/scylla/wiki/How-to-report-a-Scylla-problem) page before you report bugs.
For general help, see Scylla's [documentation](http://www.scylladb.com/doc/).
For questions and comments, use Scylla's [mailing lists](http://www.scylladb.com/community/).
## Contributing
Want to scratch your own itch and contribute a patch?
We are eager to review and merge your code.
Please consult the [Contributing on Scylla page](http://www.scylladb.com/kb/contributing/).


@@ -54,7 +54,7 @@ namespace gms {
*/
class endpoint_state {
public:
using clk = std::chrono::steady_clock;
using clk = std::chrono::system_clock;
private:
heart_beat_state _heart_beat_state;
std::map<application_state, versioned_value> _application_state;


@@ -58,7 +58,7 @@ class endpoint_state;
class arrival_window {
public:
using clk = std::chrono::steady_clock;
using clk = std::chrono::system_clock;
private:
clk::time_point _tlast{clk::time_point::min()};
utils::bounded_stats_deque _arrival_intervals;


@@ -55,6 +55,7 @@
#include "log.hh"
#include <seastar/core/sleep.hh>
#include <seastar/core/thread.hh>
#include <seastar/core/scollectd.hh>
#include <chrono>
#include "dht/i_partitioner.hh"
#include <boost/range/algorithm/set_algorithm.hpp>
@@ -112,6 +113,24 @@ gossiper::gossiper() {
/* register with the Failure Detector for receiving Failure detector events */
get_local_failure_detector().register_failure_detection_event_listener(this);
// Register this instance with JMX
_collectd_registrations = std::make_unique<scollectd::registrations>(setup_collectd());
}
scollectd::registrations
gossiper::setup_collectd() {
auto ep = get_broadcast_address();
return {
scollectd::add_polled_metric(
scollectd::type_instance_id("gossip", scollectd::per_cpu_plugin_instance,
"derive", "heart_beat_version"),
scollectd::make_typed(scollectd::data_type::DERIVE, [ep, this] {
if (this->endpoint_state_map.count(ep)) {
return this->endpoint_state_map.at(ep).get_heart_beat_state().get_heart_beat_version();
} else {
return 0;
}
})),
};
}
void gossiper::set_last_processed_message_at() {
@@ -463,8 +482,6 @@ void gossiper::remove_endpoint(inet_address endpoint) {
_live_endpoints.erase(endpoint);
_live_endpoints_just_added.remove(endpoint);
_unreachable_endpoints.erase(endpoint);
// do not remove endpointState until the quarantine expires
get_local_failure_detector().remove(endpoint);
quarantine_endpoint(endpoint);
logger.debug("removing endpoint {}", endpoint);
}
@@ -762,6 +779,7 @@ void gossiper::evict_from_membership(inet_address endpoint) {
_unreachable_endpoints.erase(endpoint);
endpoint_state_map.erase(endpoint);
_expire_time_endpoint_map.erase(endpoint);
get_local_failure_detector().remove(endpoint);
quarantine_endpoint(endpoint);
logger.debug("evicting {} from gossip", endpoint);
}
@@ -1582,7 +1600,7 @@ void gossiper::dump_endpoint_state_map() {
}
void gossiper::debug_show() {
auto reporter = std::make_shared<timer<clk>>();
auto reporter = std::make_shared<timer<std::chrono::steady_clock>>();
reporter->set_callback ([reporter] {
auto& gossiper = gms::get_local_gossiper();
gossiper.dump_endpoint_state_map();


@@ -56,6 +56,7 @@
#include <chrono>
#include <set>
#include <seastar/core/condition-variable.hh>
#include <seastar/core/scollectd.hh>
namespace gms {
@@ -81,7 +82,7 @@ class i_failure_detector;
*/
class gossiper : public i_failure_detection_event_listener, public seastar::async_sharded_service<gossiper> {
public:
using clk = std::chrono::steady_clock;
using clk = std::chrono::system_clock;
private:
using messaging_verb = net::messaging_verb;
using messaging_service = net::messaging_service;
@@ -101,7 +102,7 @@ private:
return msg_addr{to, _default_cpuid};
}
void do_sort(std::vector<gossip_digest>& g_digest_list);
timer<clk> _scheduled_gossip_task;
timer<std::chrono::steady_clock> _scheduled_gossip_task;
bool _enabled = false;
std::set<inet_address> _seeds_from_config;
sstring _cluster_name;
@@ -537,6 +538,9 @@ private:
void register_feature(feature* f);
void unregister_feature(feature* f);
void maybe_enable_features();
private:
std::unique_ptr<scollectd::registrations> _collectd_registrations;
scollectd::registrations setup_collectd();
};
extern distributed<gossiper> _the_gossiper;

keys.cc

@@ -23,6 +23,7 @@
#include "keys.hh"
#include "dht/i_partitioner.hh"
#include "clustering_bounds_comparator.hh"
std::ostream& operator<<(std::ostream& out, const partition_key& pk) {
return out << "pk{" << to_hex(pk) << "}";
@@ -52,3 +53,43 @@ partition_key_view::ring_order_tri_compare(const schema& s, partition_key_view k
}
return legacy_tri_compare(s, k2);
}
std::ostream& operator<<(std::ostream& out, const bound_kind k) {
switch(k) {
case bound_kind::excl_end:
return out << "excl end";
case bound_kind::incl_start:
return out << "incl start";
case bound_kind::incl_end:
return out << "incl end";
case bound_kind::excl_start:
return out << "excl start";
}
abort();
}
bound_kind invert_kind(bound_kind k) {
switch(k) {
case bound_kind::excl_start: return bound_kind::incl_end;
case bound_kind::incl_start: return bound_kind::excl_end;
case bound_kind::excl_end: return bound_kind::incl_start;
case bound_kind::incl_end: return bound_kind::excl_start;
}
abort();
}
int32_t weight(bound_kind k) {
switch(k) {
case bound_kind::excl_end:
return -2;
case bound_kind::incl_start:
return -1;
case bound_kind::incl_end:
return 1;
case bound_kind::excl_start:
return 2;
}
abort();
}
const thread_local clustering_key_prefix bound_view::empty_prefix = clustering_key::make_empty();
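The `weight` and `invert_kind` helpers moved into keys.cc above establish a total order among bounds that share a clustering prefix: an exclusive end (-2) sorts before an inclusive start (-1), the row itself (weight 0) sits between start and end bounds, and an exclusive start (2) sorts last. Extracted stand-alone, the ordering can be sanity-checked:

```cpp
#include <cstdint>
#include <cstdlib>

// Copied from the hunk above; values 2 to 5 are reserved for forward
// Origin (Cassandra) compatibility.
enum class bound_kind : uint8_t {
    excl_end = 0,
    incl_start = 1,
    incl_end = 6,
    excl_start = 7,
};

int32_t weight(bound_kind k) {
    switch (k) {
    case bound_kind::excl_end:   return -2;
    case bound_kind::incl_start: return -1;
    case bound_kind::incl_end:   return 1;
    case bound_kind::excl_start: return 2;
    }
    abort();
}

// Flips a start bound into the complementary end bound and vice versa,
// e.g. when two ranges meet at a point.
bound_kind invert_kind(bound_kind k) {
    switch (k) {
    case bound_kind::excl_start: return bound_kind::incl_end;
    case bound_kind::incl_start: return bound_kind::excl_end;
    case bound_kind::excl_end:   return bound_kind::incl_start;
    case bound_kind::incl_end:   return bound_kind::excl_start;
    }
    abort();
}
```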


@@ -213,51 +213,90 @@ mutation& mutation::operator=(const mutation& m) {
return *this = mutation(m);
}
future<mutation_opt> mutation_from_streamed_mutation(streamed_mutation_opt sm)
{
class rebuilder {
mutation& _m;
public:
rebuilder(mutation& m) : _m(m) { }
enum class limit_mutation_size { yes, no };
stop_iteration consume(tombstone t) {
_m.partition().apply(t);
return stop_iteration::no;
template <limit_mutation_size with_limit>
class mutation_rebuilder {
mutation _m;
streamed_mutation& _sm;
size_t _remaining_limit;
template <typename T> bool check_remaining_limit(const T& e) {
if (with_limit == limit_mutation_size::no) {
return true;
}
stop_iteration consume(static_row&& sr) {
_m.partition().static_row().apply(*_m.schema(), column_kind::static_column, std::move(sr.cells()));
return stop_iteration::no;
size_t size = e.memory_usage();
if (_remaining_limit <= size) {
_remaining_limit = 0;
} else {
_remaining_limit -= size;
}
return _remaining_limit > 0;
}
public:
mutation_rebuilder(streamed_mutation& sm)
: _m(sm.decorated_key(), sm.schema()), _sm(sm), _remaining_limit(0) {
static_assert(with_limit == limit_mutation_size::no,
"This constructor should be used only for mutation_rebuilder with no limit");
}
mutation_rebuilder(streamed_mutation& sm, size_t limit)
: _m(sm.decorated_key(), sm.schema()), _sm(sm), _remaining_limit(limit) {
static_assert(with_limit == limit_mutation_size::yes,
"This constructor should be used only for mutation_rebuilder with limit");
check_remaining_limit(_m.key());
}
stop_iteration consume(range_tombstone&& rt) {
_m.partition().apply_row_tombstone(*_m.schema(), std::move(rt));
return stop_iteration::no;
stop_iteration consume(tombstone t) {
_m.partition().apply(t);
return stop_iteration::no;
}
stop_iteration consume(range_tombstone&& rt) {
if (!check_remaining_limit(rt)) {
return stop_iteration::yes;
}
_m.partition().apply_row_tombstone(*_m.schema(), std::move(rt));
return stop_iteration::no;
}
stop_iteration consume(clustering_row&& cr) {
auto& dr = _m.partition().clustered_row(std::move(cr.key()));
dr.apply(cr.tomb());
dr.apply(cr.marker());
dr.cells().apply(*_m.schema(), column_kind::regular_column, std::move(cr.cells()));
return stop_iteration::no;
stop_iteration consume(static_row&& sr) {
if (!check_remaining_limit(sr)) {
return stop_iteration::yes;
}
_m.partition().static_row().apply(*_m.schema(), column_kind::static_column, std::move(sr.cells()));
return stop_iteration::no;
}
void consume_end_of_stream() { }
};
stop_iteration consume(clustering_row&& cr) {
if (!check_remaining_limit(cr)) {
return stop_iteration::yes;
}
auto& dr = _m.partition().clustered_row(std::move(cr.key()));
dr.apply(cr.tomb());
dr.apply(cr.marker());
dr.cells().apply(*_m.schema(), column_kind::regular_column, std::move(cr.cells()));
return stop_iteration::no;
}
struct data {
mutation m;
streamed_mutation sm;
};
mutation_opt consume_end_of_stream() {
return with_limit == limit_mutation_size::yes && _remaining_limit == 0 ? mutation_opt()
: mutation_opt(std::move(_m));
}
};
future<mutation_opt>
mutation_from_streamed_mutation_with_limit(streamed_mutation sm, size_t limit) {
return do_with(std::move(sm), [limit] (auto& sm) {
return consume(sm, mutation_rebuilder<limit_mutation_size::yes>(sm, limit));
});
}
future<mutation_opt> mutation_from_streamed_mutation(streamed_mutation_opt sm) {
if (!sm) {
return make_ready_future<mutation_opt>();
}
mutation m(sm->decorated_key(), sm->schema());
return do_with(data { std::move(m), std::move(*sm) }, [] (auto& d) {
return consume(d.sm, rebuilder(d.m)).then([&d] {
return mutation_opt(std::move(d.m));
});
return do_with(std::move(*sm), [] (auto& sm) {
return consume(sm, mutation_rebuilder<limit_mutation_size::no>(sm));
});
}
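The budgeted-consume pattern introduced by `mutation_rebuilder<limit_mutation_size::yes>` above can be sketched stand-alone: charge each consumed fragment against a byte budget, stop consuming once it is exhausted, and report the whole rebuild as failed (a disengaged optional) so the caller can treat the partition as too wide. The `item` type and its `memory_usage()` are illustrative stand-ins, not Scylla's mutation-fragment API.

```cpp
#include <cstddef>
#include <optional>
#include <string>
#include <utility>
#include <vector>

struct item {
    std::string payload;
    size_t memory_usage() const { return payload.size(); }
};

class limited_rebuilder {
    std::vector<item> _items;
    size_t _remaining;
    bool _overflowed = false;

    // Mirrors check_remaining_limit: clamp at zero rather than underflow,
    // and report whether any budget is left.
    bool check_remaining_limit(const item& e) {
        size_t size = e.memory_usage();
        if (_remaining <= size) {
            _remaining = 0;
        } else {
            _remaining -= size;
        }
        return _remaining > 0;
    }
public:
    explicit limited_rebuilder(size_t limit) : _remaining(limit) {}

    // Returns false to stop consumption, mirroring stop_iteration::yes.
    bool consume(item&& e) {
        if (!check_remaining_limit(e)) {
            _overflowed = true;
            return false;
        }
        _items.push_back(std::move(e));
        return true;
    }

    // Over budget: the partial result is discarded, not returned.
    std::optional<std::vector<item>> consume_end_of_stream() {
        if (_overflowed || _remaining == 0) {
            return std::nullopt;
        }
        return std::move(_items);
    }
};
```

Discarding the partial mutation rather than returning a truncated one is the point: a half-rebuilt partition in the cache would silently lose rows.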


@@ -182,3 +182,5 @@ boost::iterator_range<std::vector<mutation>::const_iterator> slice(
const query::partition_range&);
future<mutation_opt> mutation_from_streamed_mutation(streamed_mutation_opt sm);
future<mutation_opt>
mutation_from_streamed_mutation_with_limit(streamed_mutation sm, size_t limit);


@@ -231,7 +231,7 @@ public:
}
_row_limit -= _rows_in_current_partition;
-    _partition_limit -= 1;
+    _partition_limit -= _rows_in_current_partition > 0;
_consumer.consume_end_of_partition();
if (!sstable_compaction()) {
return _row_limit && _partition_limit ? stop_iteration::no : stop_iteration::yes;
@@ -253,4 +253,4 @@ struct compact_for_query : compact_mutation<only_live, compact_for_sstables::no,
template<typename CompactedMutationsConsumer>
struct compact_for_compaction : compact_mutation<emit_only_live_rows::no, compact_for_sstables::yes, CompactedMutationsConsumer> {
using compact_mutation<emit_only_live_rows::no, compact_for_sstables::yes, CompactedMutationsConsumer>::compact_mutation;
};
};


@@ -57,6 +57,14 @@ struct reversal_traits<false> {
return c.erase_and_dispose(begin, end, std::move(disposer));
}
template<typename Container, typename Disposer>
static typename Container::iterator erase_dispose_and_update_end(Container& c,
typename Container::iterator it, Disposer&& disposer,
typename Container::iterator&)
{
return c.erase_and_dispose(it, std::forward<Disposer>(disposer));
}
template <typename Container>
static boost::iterator_range<typename Container::iterator> maybe_reverse(
Container& c, boost::iterator_range<typename Container::iterator> r)
@@ -93,6 +101,24 @@ struct reversal_traits<true> {
);
}
// Erases the element pointed to by it and makes sure that the iterator end is not
// invalidated.
template<typename Container, typename Disposer>
static typename Container::reverse_iterator erase_dispose_and_update_end(Container& c,
typename Container::reverse_iterator it, Disposer&& disposer,
typename Container::reverse_iterator& end)
{
auto to_erase = std::next(it).base();
bool update_end = end.base() == to_erase;
auto ret = typename Container::reverse_iterator(
c.erase_and_dispose(to_erase, std::forward<Disposer>(disposer))
);
if (update_end) {
end = ret;
}
return ret;
}
template <typename Container>
static boost::iterator_range<typename Container::reverse_iterator> maybe_reverse(
Container& c, boost::iterator_range<typename Container::iterator> r)
@@ -484,7 +510,7 @@ mutation_partition::clustered_row(const schema& s, const clustering_key_view& ke
}
mutation_partition::rows_type::const_iterator
- mutation_partition::lower_bound(const schema& schema, const query::range<clustering_key_prefix>& r) const {
+ mutation_partition::lower_bound(const schema& schema, const query::clustering_range& r) const {
auto cmp = rows_entry::key_comparator(clustering_key_prefix::prefix_equality_less_compare(schema));
return r.start() ? (r.start()->is_inclusive()
? _rows.lower_bound(r.start()->value(), cmp)
@@ -492,7 +518,7 @@ mutation_partition::lower_bound(const schema& schema, const query::range<cluster
}
mutation_partition::rows_type::const_iterator
- mutation_partition::upper_bound(const schema& schema, const query::range<clustering_key_prefix>& r) const {
+ mutation_partition::upper_bound(const schema& schema, const query::clustering_range& r) const {
auto cmp = rows_entry::key_comparator(clustering_key_prefix::prefix_equality_less_compare(schema));
return r.end() ? (r.end()->is_inclusive()
? _rows.upper_bound(r.end()->value(), cmp)
@@ -500,7 +526,7 @@ mutation_partition::upper_bound(const schema& schema, const query::range<cluster
}
boost::iterator_range<mutation_partition::rows_type::const_iterator>
- mutation_partition::range(const schema& schema, const query::range<clustering_key_prefix>& r) const {
+ mutation_partition::range(const schema& schema, const query::clustering_range& r) const {
return boost::make_iterator_range(lower_bound(schema, r), upper_bound(schema, r));
}
@@ -520,22 +546,22 @@ unconst(Container& c, typename Container::const_iterator i) {
}
boost::iterator_range<mutation_partition::rows_type::iterator>
- mutation_partition::range(const schema& schema, const query::range<clustering_key_prefix>& r) {
+ mutation_partition::range(const schema& schema, const query::clustering_range& r) {
return unconst(_rows, static_cast<const mutation_partition*>(this)->range(schema, r));
}
mutation_partition::rows_type::iterator
- mutation_partition::lower_bound(const schema& schema, const query::range<clustering_key_prefix>& r) {
+ mutation_partition::lower_bound(const schema& schema, const query::clustering_range& r) {
return unconst(_rows, static_cast<const mutation_partition*>(this)->lower_bound(schema, r));
}
mutation_partition::rows_type::iterator
- mutation_partition::upper_bound(const schema& schema, const query::range<clustering_key_prefix>& r) {
+ mutation_partition::upper_bound(const schema& schema, const query::clustering_range& r) {
return unconst(_rows, static_cast<const mutation_partition*>(this)->upper_bound(schema, r));
}
template<typename Func>
- void mutation_partition::for_each_row(const schema& schema, const query::range<clustering_key_prefix>& row_range, bool reversed, Func&& func) const
+ void mutation_partition::for_each_row(const schema& schema, const query::clustering_range& row_range, bool reversed, Func&& func) const
{
auto r = range(schema, row_range);
if (!reversed) {
@@ -1136,7 +1162,7 @@ void mutation_partition::trim_rows(const schema& s,
}
if (e.empty()) {
-    last = reversal_traits<reversed>::erase_and_dispose(_rows, last, std::next(last, 1), deleter);
+    last = reversal_traits<reversed>::erase_dispose_and_update_end(_rows, last, deleter, end);
} else {
++last;
}
@@ -1780,8 +1806,9 @@ public:
}
void consume_end_of_partition() {
-    _live_rows += _mutation_consumer->consume_end_of_stream();
-    _partitions += 1;
+    auto live_rows_in_partition = _mutation_consumer->consume_end_of_stream();
+    _live_rows += live_rows_in_partition;
+    _partitions += live_rows_in_partition > 0;
}
data_query_result consume_end_of_stream() {
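The `erase_dispose_and_update_end` helper added above for reversed iteration relies on the offset between a `reverse_iterator` and its `base()`: `rit` designates the element at `std::prev(rit.base())`, so the forward iterator to `*rit` is `std::next(rit).base()`, and erasing that element invalidates any reverse end marker whose `base()` pointed at it. A minimal stand-alone sketch with `std::list` (the disposer is omitted; names are illustrative):

```cpp
#include <iterator>
#include <list>

// Erase the element designated by reverse_iterator it, re-anchoring the
// caller's end marker if its base() pointed at the erased node.
template <typename Container>
typename Container::reverse_iterator
erase_and_update_end(Container& c,
                     typename Container::reverse_iterator it,
                     typename Container::reverse_iterator& end) {
    auto to_erase = std::next(it).base();      // forward iterator to *it
    bool update_end = end.base() == to_erase;  // end anchored on the victim?
    auto ret = typename Container::reverse_iterator(c.erase(to_erase));
    if (update_end) {
        end = ret;                             // rebuild from erase()'s result
    }
    return ret;
}
```

Without the re-anchoring step, the caller's `end` would silently become a dangling iterator, which is exactly the hazard `trim_rows` hits when it erases while walking a reversed range.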


@@ -688,12 +688,12 @@ public:
tombstone range_tombstone_for_row(const schema& schema, const clustering_key& key) const;
tombstone tombstone_for_row(const schema& schema, const clustering_key& key) const;
tombstone tombstone_for_row(const schema& schema, const rows_entry& e) const;
- boost::iterator_range<rows_type::const_iterator> range(const schema& schema, const query::range<clustering_key_prefix>& r) const;
- rows_type::const_iterator lower_bound(const schema& schema, const query::range<clustering_key_prefix>& r) const;
- rows_type::const_iterator upper_bound(const schema& schema, const query::range<clustering_key_prefix>& r) const;
- rows_type::iterator lower_bound(const schema& schema, const query::range<clustering_key_prefix>& r);
- rows_type::iterator upper_bound(const schema& schema, const query::range<clustering_key_prefix>& r);
- boost::iterator_range<rows_type::iterator> range(const schema& schema, const query::range<clustering_key_prefix>& r);
+ boost::iterator_range<rows_type::const_iterator> range(const schema& schema, const query::clustering_range& r) const;
+ rows_type::const_iterator lower_bound(const schema& schema, const query::clustering_range& r) const;
+ rows_type::const_iterator upper_bound(const schema& schema, const query::clustering_range& r) const;
+ rows_type::iterator lower_bound(const schema& schema, const query::clustering_range& r);
+ rows_type::iterator upper_bound(const schema& schema, const query::clustering_range& r);
+ boost::iterator_range<rows_type::iterator> range(const schema& schema, const query::clustering_range& r);
// Writes this partition using supplied query result writer.
// The partition should be first compacted with compact_for_query(), otherwise
// results may include data which is deleted/expired.
@@ -714,5 +714,5 @@ public:
gc_clock::time_point query_time = gc_clock::time_point::min()) const;
private:
template<typename Func>
- void for_each_row(const schema& schema, const query::range<clustering_key_prefix>& row_range, bool reversed, Func&& func) const;
+ void for_each_row(const schema& schema, const query::clustering_range& row_range, bool reversed, Func&& func) const;
};


@@ -33,6 +33,13 @@
// marking the end of iteration. After calling mutation_reader's operator(),
// caller must keep the object alive until the returned future is fulfilled.
//
// streamed_mutation object emitted by mutation_reader remains valid after the
// destruction of the mutation_reader.
//
// Asking mutation_reader for another streamed_mutation (i.e. invoking
// mutation_reader::operator()) invalidates all streamed_mutation objects
// previously produced by that reader.
//
// The mutations returned have strictly monotonically increasing keys. Two
// consecutive mutations never have equal keys.
//


@@ -36,7 +36,8 @@ static void remove_or_mark_as_unique_owner(partition_version* current)
}
partition_version::partition_version(partition_version&& pv) noexcept
-    : _backref(pv._backref)
+    : anchorless_list_base_hook(std::move(pv))
+    , _backref(pv._backref)
, _partition(std::move(pv._partition))
{
if (_backref) {
@@ -62,29 +63,37 @@ partition_version::~partition_version()
}
partition_snapshot::~partition_snapshot() {
if (_version) {
if (_version && _version.is_unique_owner()) {
auto v = &*_version;
if (_version.is_unique_owner()) {
_version = { };
remove_or_mark_as_unique_owner(v);
} else {
_version = { };
auto first_used = v;
while (first_used->prev() && !first_used->is_referenced()) {
first_used = first_used->prev();
}
_version = {};
remove_or_mark_as_unique_owner(v);
} else if (_entry) {
_entry->_snapshot = nullptr;
}
}
auto current = first_used->next();
while (current && !current->is_referenced()) {
auto next = current->next();
void partition_snapshot::merge_partition_versions() {
if (_version && !_version.is_unique_owner()) {
auto v = &*_version;
_version = { };
auto first_used = v;
while (first_used->prev() && !first_used->is_referenced()) {
first_used = first_used->prev();
}
auto current = first_used->next();
while (current && !current->is_referenced()) {
auto next = current->next();
try {
first_used->partition().apply(*_schema, std::move(current->partition()));
current_allocator().destroy(current);
current = next;
} catch (...) {
// Set _version so that the merge can be retried.
_version = partition_version_ref(*current);
throw;
}
current = next;
}
} else {
assert(_entry);
_entry->_snapshot = nullptr;
}
}
@@ -308,9 +317,20 @@ partition_snapshot_reader::partition_snapshot_reader(schema_ptr s, dht::decorate
partition_snapshot_reader::~partition_snapshot_reader()
{
if (!_snapshot.owned()) {
return;
}
// If no one else is using this particular snapshot, try to merge partition
// versions.
with_allocator(_lsa_region.allocator(), [this] {
logalloc::reclaim_lock _(_lsa_region);
_snapshot = { };
return with_linearized_managed_bytes([this] {
try {
_read_section(_lsa_region, [this] {
_snapshot->merge_partition_versions();
_snapshot = {};
});
} catch (...) { }
});
});
}
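The root cause fixed in this hunk (see the commit message: the move constructor of `partition_version` was not invoking the move constructor of `anchorless_list_base_hook`) generalizes to any type that participates in an intrusive-style list via a base-class hook: a derived move constructor that omits the base from its initializer list default-constructs the hook, silently unlinking the moved-to object. A minimal reproduction with an illustrative hook type (not Scylla's `anchorless_list_base_hook`):

```cpp
#include <utility>

// A doubly-linked hook whose move constructor splices the new object
// into the old one's place in the list.
struct list_hook {
    list_hook* prev = nullptr;
    list_hook* next = nullptr;
    list_hook() = default;
    list_hook(list_hook&& o) noexcept : prev(o.prev), next(o.next) {
        if (prev) { prev->next = this; }
        if (next) { next->prev = this; }
        o.prev = o.next = nullptr;
    }
    bool linked() const { return prev || next; }
    void link_after(list_hook& p) {
        prev = &p;
        next = p.next;
        if (next) { next->prev = this; }
        p.next = this;
    }
};

struct version_buggy : list_hook {
    int data = 0;
    version_buggy() = default;
    // BUG: the base is default-constructed, so list membership is lost.
    version_buggy(version_buggy&& o) noexcept : data(o.data) {}
};

struct version_fixed : list_hook {
    int data = 0;
    version_fixed() = default;
    // FIX: forward the move to the base-class hook.
    version_fixed(version_fixed&& o) noexcept
        : list_hook(std::move(o)), data(o.data) {}
};
```

In Scylla's case the objects are moved during LSA compaction, so the bug only surfaced under memory pressure — unreachable versions, stale reads, and the leak assertions described in the commit message.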


@@ -192,6 +192,11 @@ public:
partition_snapshot& operator=(const partition_snapshot&) = delete;
partition_snapshot& operator=(partition_snapshot&&) = delete;
// If possible, merges the version pointed to by this snapshot with
// adjacent partition versions. Leaves the snapshot in an unspecified state.
// Can be retried if previous merge attempt has failed.
void merge_partition_versions();
~partition_snapshot();
partition_version_ref& version();


@@ -170,7 +170,7 @@ public:
uint32_t row_limit;
gc_clock::time_point timestamp;
std::experimental::optional<tracing::trace_info> trace_info;
-    uint32_t partition_limit;
+    uint32_t partition_limit; // The maximum number of live partitions to return.
api::timestamp_type read_timestamp; // not serialized
public:
read_command(utils::UUID cf_id,


@@ -21,46 +21,6 @@
#include "range_tombstone.hh"
std::ostream& operator<<(std::ostream& out, const bound_kind k) {
switch(k) {
case bound_kind::excl_end:
return out << "excl end";
case bound_kind::incl_start:
return out << "incl start";
case bound_kind::incl_end:
return out << "incl end";
case bound_kind::excl_start:
return out << "excl start";
}
abort();
}
bound_kind invert_kind(bound_kind k) {
switch(k) {
case bound_kind::excl_start: return bound_kind::incl_end;
case bound_kind::incl_start: return bound_kind::excl_end;
case bound_kind::excl_end: return bound_kind::incl_start;
case bound_kind::incl_end: return bound_kind::excl_start;
}
abort();
}
int32_t weight(bound_kind k) {
switch(k) {
case bound_kind::excl_end:
return -2;
case bound_kind::incl_start:
return -1;
case bound_kind::incl_end:
return 1;
case bound_kind::excl_start:
return 2;
}
abort();
}
const thread_local clustering_key_prefix bound_view::empty_prefix = clustering_key::make_empty();
std::ostream& operator<<(std::ostream& out, const range_tombstone& rt) {
if (rt) {
return out << "{range_tombstone: start=" << rt.start_bound() << ", end=" << rt.end_bound() << ", " << rt.tomb << "}";
@@ -119,4 +79,4 @@ void range_tombstone_accumulator::clear() {
_range_tombstones.clear();
_partition_tombstone = { };
_current_tombstone = { };
}
}


@@ -27,95 +27,11 @@
#include "hashing.hh"
#include "keys.hh"
#include "tombstone.hh"
#include "clustering_bounds_comparator.hh"
namespace bi = boost::intrusive;
namespace stdx = std::experimental;
/**
* Represents the kind of bound in a range tombstone.
*/
enum class bound_kind : uint8_t {
excl_end = 0,
incl_start = 1,
// values 2 to 5 are reserved for forward Origin compatibility
incl_end = 6,
excl_start = 7,
};
std::ostream& operator<<(std::ostream& out, const bound_kind k);
bound_kind invert_kind(bound_kind k);
int32_t weight(bound_kind k);
static inline bound_kind flip_bound_kind(bound_kind bk)
{
switch (bk) {
case bound_kind::excl_end: return bound_kind::excl_start;
case bound_kind::incl_end: return bound_kind::incl_start;
case bound_kind::excl_start: return bound_kind::excl_end;
case bound_kind::incl_start: return bound_kind::incl_end;
}
abort();
}
class bound_view {
const static thread_local clustering_key empty_prefix;
public:
const clustering_key_prefix& prefix;
bound_kind kind;
bound_view(const clustering_key_prefix& prefix, bound_kind kind)
: prefix(prefix)
, kind(kind)
{ }
struct compare {
// To make it assignable and to avoid taking a schema_ptr, we
// wrap the schema reference.
std::reference_wrapper<const schema> _s;
compare(const schema& s) : _s(s)
{ }
bool operator()(const clustering_key_prefix& p1, int32_t w1, const clustering_key_prefix& p2, int32_t w2) const {
auto type = _s.get().clustering_key_prefix_type();
auto res = prefix_equality_tri_compare(type->types().begin(),
type->begin(p1), type->end(p1),
type->begin(p2), type->end(p2),
tri_compare);
if (res) {
return res < 0;
}
auto d1 = p1.size(_s);
auto d2 = p2.size(_s);
if (d1 == d2) {
return w1 < w2;
}
return d1 < d2 ? w1 <= 0 : w2 > 0;
}
bool operator()(const bound_view b, const clustering_key_prefix& p) const {
return operator()(b.prefix, weight(b.kind), p, 0);
}
bool operator()(const clustering_key_prefix& p, const bound_view b) const {
return operator()(p, 0, b.prefix, weight(b.kind));
}
bool operator()(const bound_view b1, const bound_view b2) const {
return operator()(b1.prefix, weight(b1.kind), b2.prefix, weight(b2.kind));
}
};
bool equal(const schema& s, const bound_view other) const {
return kind == other.kind && prefix.equal(s, other.prefix);
}
bool adjacent(const schema& s, const bound_view other) const {
return invert_kind(other.kind) == kind && prefix.equal(s, other.prefix);
}
static bound_view bottom() {
return {empty_prefix, bound_kind::incl_start};
}
static bound_view top() {
return {empty_prefix, bound_kind::incl_end};
}
friend std::ostream& operator<<(std::ostream& out, const bound_view& b) {
return out << "{bound: prefix=" << b.prefix << ", kind=" << b.kind << "}";
}
};
/**
* Represents a ranged deletion operation. Can be empty.
*/
@@ -294,7 +210,12 @@ public:
return _current_tombstone;
}
const std::deque<range_tombstone>& range_tombstones_for_row(const clustering_key_prefix& ck) {
drop_unneeded_tombstones(ck);
return _range_tombstones;
}
void apply(const range_tombstone& rt);
void clear();
};
};


@@ -38,6 +38,22 @@ static logging::logger logger("cache");
thread_local seastar::thread_scheduling_group row_cache::_update_thread_scheduling_group(1ms, 0.2);
enum class is_wide_partition { yes, no };
future<is_wide_partition, mutation_opt>
try_to_read(uint64_t max_cached_partition_size_in_bytes, streamed_mutation_opt&& sm) {
if (!sm) {
return make_ready_future<is_wide_partition, mutation_opt>(is_wide_partition::no, mutation_opt());
}
return mutation_from_streamed_mutation_with_limit(std::move(*sm), max_cached_partition_size_in_bytes).then(
[] (mutation_opt&& omo) mutable {
if (omo) {
return make_ready_future<is_wide_partition, mutation_opt>(is_wide_partition::no, std::move(omo));
} else {
return make_ready_future<is_wide_partition, mutation_opt>(is_wide_partition::yes, mutation_opt());
}
});
}
cache_tracker& global_cache_tracker() {
static thread_local cache_tracker instance;
@@ -59,7 +75,7 @@ cache_tracker::cache_tracker() {
cache_entry& ce = _lru.back();
auto it = row_cache::partitions_type::s_iterator_to(ce);
--it;
-    it->set_continuous(false);
+    clear_continuity(*it);
_lru.pop_back_and_dispose(current_deleter<cache_entry>());
--_partitions;
++_evictions;
@@ -103,6 +119,11 @@ cache_tracker::setup_collectd() {
, "total_operations", "misses")
, scollectd::make_typed(scollectd::data_type::DERIVE, _misses)
),
scollectd::add_polled_metric(scollectd::type_instance_id("cache"
, scollectd::per_cpu_plugin_instance
, "total_operations", "uncached_wide_partitions")
, scollectd::make_typed(scollectd::data_type::DERIVE, _uncached_wide_partitions)
),
scollectd::add_polled_metric(scollectd::type_instance_id("cache"
, scollectd::per_cpu_plugin_instance
, "total_operations", "insertions")
@@ -142,7 +163,7 @@ void cache_tracker::clear() {
_lru.erase(_lru.iterator_to(to_remove));
current_deleter<cache_entry>()(&to_remove);
}
-    it->set_continuous(false);
+    clear_continuity(*it);
}
});
_removals += _partitions;
@@ -180,6 +201,14 @@ void cache_tracker::on_miss() {
++_misses;
}
void cache_tracker::on_uncached_wide_partition() {
++_uncached_wide_partitions;
}
void cache_tracker::on_continuity_flag_cleared() {
++_continuity_flags_cleared;
}
allocation_strategy& cache_tracker::allocator() {
return _region.allocator();
}
@@ -196,33 +225,60 @@ const logalloc::region& cache_tracker::region() const {
class single_partition_populating_reader final : public mutation_reader::impl {
schema_ptr _schema;
row_cache& _cache;
mutation_source& _underlying;
mutation_reader _delegate;
const io_priority_class _pc;
query::clustering_key_filtering_context _ck_filtering;
query::partition_range _large_partition_range;
mutation_reader _large_partition_reader;
public:
- single_partition_populating_reader(schema_ptr s, row_cache& cache, mutation_reader delegate, query::clustering_key_filtering_context ck_filtering)
+ single_partition_populating_reader(schema_ptr s, row_cache& cache, mutation_source& underlying,
+     mutation_reader delegate, const io_priority_class pc, query::clustering_key_filtering_context ck_filtering)
: _schema(std::move(s))
, _cache(cache)
, _underlying(underlying)
, _delegate(std::move(delegate))
, _pc(pc)
, _ck_filtering(ck_filtering)
{ }
virtual future<streamed_mutation_opt> operator()() override {
return _delegate().then([] (auto sm) {
return mutation_from_streamed_mutation(std::move(sm));
}).then([this, op = _cache._populate_phaser.start()] (mutation_opt&& mo) -> streamed_mutation_opt {
if (mo) {
_cache.populate(*mo);
mo->upgrade(_schema);
auto& ck_ranges = _ck_filtering.get_ranges(mo->key());
auto filtered_partition = mutation_partition(std::move(mo->partition()), *(mo->schema()), ck_ranges);
mo->partition() = std::move(filtered_partition);
return streamed_mutation_from_mutation(std::move(*mo));
auto op = _cache._populate_phaser.start();
return _delegate().then([this, op = std::move(op)] (auto sm) mutable {
if (!sm) {
return make_ready_future<streamed_mutation_opt>(streamed_mutation_opt());
}
return { };
dht::decorated_key dk = sm->decorated_key();
return try_to_read(_cache._max_cached_partition_size_in_bytes, std::move(sm)).then(
[this, op = std::move(op), dk = std::move(dk)]
(is_wide_partition wide_partition, mutation_opt&& mo) {
if (wide_partition == is_wide_partition::no) {
if (mo) {
_cache.populate(*mo);
mo->upgrade(_schema);
auto& ck_ranges = _ck_filtering.get_ranges(mo->key());
auto filtered_partition = mutation_partition(std::move(mo->partition()), *(mo->schema()), ck_ranges);
mo->partition() = std::move(filtered_partition);
return make_ready_future<streamed_mutation_opt>(streamed_mutation_from_mutation(std::move(*mo)));
}
return make_ready_future<streamed_mutation_opt>(streamed_mutation_opt());
} else {
_cache.on_uncached_wide_partition();
_cache.mark_partition_as_wide(dk);
_large_partition_range = query::partition_range::make_singular(std::move(dk));
_large_partition_reader = _underlying(_schema, _large_partition_range, _ck_filtering, _pc);
return _large_partition_reader();
}
});
});
}
};
void cache_tracker::clear_continuity(cache_entry& ce) {
ce.set_continuous(false);
on_continuity_flag_cleared();
}
void row_cache::on_hit() {
_stats.hits.mark();
_tracker.on_hit();
@@ -233,6 +289,10 @@ void row_cache::on_miss() {
_tracker.on_miss();
}
void row_cache::on_uncached_wide_partition() {
_tracker.on_uncached_wide_partition();
}
class just_cache_scanning_reader final {
schema_ptr _schema;
row_cache& _cache;
@@ -243,6 +303,7 @@ class just_cache_scanning_reader final {
uint64_t _last_reclaim_count;
size_t _last_modification_count;
query::clustering_key_filtering_context _ck_filtering;
const io_priority_class _pc;
private:
void update_iterators() {
auto cmp = cache_entry::compare(_cache._schema);
@@ -285,10 +346,12 @@ private:
public:
struct cache_data {
streamed_mutation_opt mut;
uint64_t continuity_flags_cleared;
bool continuous;
};
- just_cache_scanning_reader(schema_ptr s, row_cache& cache, const query::partition_range& range, query::clustering_key_filtering_context ck_filtering)
-     : _schema(std::move(s)), _cache(cache), _range(range), _ck_filtering(ck_filtering)
+ just_cache_scanning_reader(schema_ptr s, row_cache& cache, const query::partition_range& range,
+     query::clustering_key_filtering_context ck_filtering, const io_priority_class& pc)
+     : _schema(std::move(s)), _cache(cache), _range(range), _ck_filtering(ck_filtering), _pc(pc)
{ }
future<cache_data> operator()() {
return _cache._read_section(_cache._tracker.region(), [this] {
@@ -301,8 +364,19 @@ public:
++_it;
_last = ce.key();
_cache.upgrade_entry(ce);
cache_data data{std::move(ce.read(_cache, _schema, _ck_filtering)), ce.continuous()};
return make_ready_future<cache_data>(std::move(data));
cache_data cd { { }, _cache._tracker.continuity_flags_cleared(), ce.continuous() };
if (ce.wide_partition()) {
return ce.read_wide(_cache, _schema, _ck_filtering, _pc).then([this, cd = std::move(cd)] (auto smopt) mutable {
if (smopt) {
cd.mut = std::move(*smopt);
} else {
cd.mut = streamed_mutation_from_mutation(mutation(_last->as_decorated_key(), _schema));
}
return std::move(cd);
});
}
cd.mut = ce.read(_cache, _schema, _ck_filtering);
return make_ready_future<cache_data>(std::move(cd));
});
});
}
@@ -346,6 +420,8 @@ class range_populating_reader final : public mutation_reader::impl {
std::experimental::optional<dht::ring_position> _last_key;
utils::phased_barrier::phase_type _last_key_populate_phase;
mark_end_as_continuous _make_last_entry_continuous;
query::partition_range _large_partition_range;
mutation_reader _large_partition_reader;
void update_reader() {
if (_populate_phase != _cache._populate_phaser.phase()) {
@@ -372,6 +448,12 @@ class range_populating_reader final : public mutation_reader::impl {
});
}
}
void update_last_key(const dht::decorated_key& key) {
this->maybe_mark_last_entry_as_continuous(mark_end_as_continuous(mark_end_as_continuous::override(), true));
_last_key = dht::ring_position(key);
_last_key_populate_phase = _cache._populate_phaser.phase();
}
public:
range_populating_reader(
row_cache& cache,
@@ -397,22 +479,44 @@ public:
{}
virtual future<streamed_mutation_opt> operator()() override {
update_reader();
return _reader().then([] (auto sm) {
return mutation_from_streamed_mutation(std::move(sm));
}).then([this, op = _cache._populate_phaser.start()] (mutation_opt&& mo) -> streamed_mutation_opt {
if (mo) {
_cache.populate(*mo);
mo->upgrade(_schema);
maybe_mark_last_entry_as_continuous(mark_end_as_continuous(mark_end_as_continuous::override(), true));
_last_key = dht::ring_position(mo->decorated_key());
_last_key_populate_phase = _cache._populate_phaser.phase();
auto& ck_ranges = _ck_filtering.get_ranges(mo->key());
auto filtered_partition = mutation_partition(std::move(mo->partition()), *(mo->schema()), ck_ranges);
mo->partition() = std::move(filtered_partition);
return streamed_mutation_from_mutation(std::move(*mo));
}
maybe_mark_last_entry_as_continuous(_make_last_entry_continuous);
return {};
auto op = _cache._populate_phaser.start();
return _reader().then([this, op = std::move(op)] (auto sm) mutable {
stdx::optional<dht::decorated_key> dk = (sm) ? stdx::optional<dht::decorated_key>(sm->decorated_key())
: stdx::optional<dht::decorated_key>(stdx::nullopt);
return try_to_read(_cache._max_cached_partition_size_in_bytes, std::move(sm)).then(
[this, op = std::move(op), dk = std::move(dk)]
(is_wide_partition wide_partition, mutation_opt&& mo) mutable {
if (wide_partition == is_wide_partition::no) {
if (mo) {
_cache.populate(*mo);
mo->upgrade(_schema);
this->update_last_key(mo->decorated_key());
auto& ck_ranges = _ck_filtering.get_ranges(mo->key());
auto filtered_partition = mutation_partition(std::move(mo->partition()), *(mo->schema()), ck_ranges);
mo->partition() = std::move(filtered_partition);
return make_ready_future<streamed_mutation_opt>(streamed_mutation_from_mutation(std::move(*mo)));
}
this->maybe_mark_last_entry_as_continuous(_make_last_entry_continuous);
return make_ready_future<streamed_mutation_opt>(streamed_mutation_opt());
} else {
assert(bool(dk));
this->update_last_key(*dk);
_cache.on_uncached_wide_partition();
_cache.mark_partition_as_wide(*dk);
_large_partition_range = query::partition_range::make_singular(*dk);
_large_partition_reader = _underlying(_schema, _large_partition_range, _ck_filtering, _pc);
return _large_partition_reader().then([this, dk = std::move(*dk)] (auto smopt) mutable -> streamed_mutation_opt {
_large_partition_reader = {};
if (!smopt) {
// We cannot emit a disengaged optional since this is part of a range
// read and it would be incorrectly interpreted as end of stream.
// Produce empty mutation instead.
return streamed_mutation_from_mutation(mutation(std::move(dk), _schema));
}
return smopt;
});
}
});
});
}
};
@@ -454,6 +558,7 @@ class scanning_and_populating_reader final : public mutation_reader::impl{
just_cache_scanning_reader _primary;
last_key _last_key_from_primary;
utils::phased_barrier::phase_type _last_key_from_primary_populate_phase;
uint64_t _last_key_from_primary_continuity_flags_cleared;
query::clustering_key_filtering_context _ck_filtering;
boost::variant<end_state,
secondary_only_state,
@@ -466,6 +571,7 @@ class scanning_and_populating_reader final : public mutation_reader::impl{
if (!bound_opt) {
_last_key_from_primary = {_cache._partitions.begin()->key(), true};
_last_key_from_primary_populate_phase = _cache._populate_phaser.phase();
_last_key_from_primary_continuity_flags_cleared = _cache._tracker.continuity_flags_cleared();
return _cache._partitions.begin()->continuous();
}
const range_bound<dht::ring_position>& bound = bound_opt.value();
@@ -481,6 +587,7 @@ class scanning_and_populating_reader final : public mutation_reader::impl{
(!bound.is_inclusive() || bound.value().relation_to_keys() == -1)) {
_last_key_from_primary = {i->key(), true};
_last_key_from_primary_populate_phase = _cache._populate_phaser.phase();
_last_key_from_primary_continuity_flags_cleared = _cache._tracker.continuity_flags_cleared();
return i->continuous();
}
--i;
@@ -512,6 +619,9 @@ class scanning_and_populating_reader final : public mutation_reader::impl{
// We have to capture mutation from data before we change the state because data lives in state
// and changing state destroys previous state.
streamed_mutation_opt result = std::move(data.mut);
if (_cache._tracker.continuity_flags_cleared() != data.continuity_flags_cleared) {
data.continuous = _cache.has_continuous_entry(*_last_key_from_primary.value);
}
if (data.continuous) {
_state = after_continuous_entry_state{};
} else {
@@ -545,7 +655,7 @@ class scanning_and_populating_reader final : public mutation_reader::impl{
, _schema(std::move(s))
, _range(range)
, _pc(pc)
, _primary(_schema, _cache, _range, ck_filtering)
, _primary(_schema, _cache, _range, ck_filtering, pc)
, _ck_filtering(ck_filtering)
, _state(start_state{}) {}
future<streamed_mutation_opt> operator()(const end_state& state) {
@@ -580,6 +690,11 @@ class scanning_and_populating_reader final : public mutation_reader::impl{
});
}
future<streamed_mutation_opt> operator()(after_continuous_entry_state& state) {
if (_last_key_from_primary_continuity_flags_cleared != _cache._tracker.continuity_flags_cleared()
&& !_cache.has_continuous_entry(*_last_key_from_primary.value)) {
_state = after_not_continuous_entry_state{};
return operator()();
}
return _primary().then([this] (just_cache_scanning_reader::cache_data&& data) {
if (!data.mut) {
switch_to_end();
@@ -675,11 +790,15 @@ row_cache::make_reader(schema_ptr s,
_tracker.touch(e);
on_hit();
upgrade_entry(e);
if (e.wide_partition()) {
_tracker.on_uncached_wide_partition();
return _underlying(s, range, ck_filtering, pc);
}
return make_reader_returning(e.read(*this, s, ck_filtering));
} else {
on_miss();
return make_mutation_reader<single_partition_populating_reader>(s, *this,
_underlying(_schema, range, query::no_clustering_key_filtering, pc),
return make_mutation_reader<single_partition_populating_reader>(s, *this, _underlying,
_underlying(_schema, range, query::no_clustering_key_filtering, pc), pc,
ck_filtering);
}
});
@@ -708,7 +827,25 @@ void row_cache::clear_now() noexcept {
deleter(p);
});
}
_partitions.begin()->set_continuous(false);
_tracker.clear_continuity(*_partitions.begin());
});
}
void row_cache::mark_partition_as_wide(const dht::decorated_key& key) {
with_allocator(_tracker.allocator(), [this, &key] {
_populate_section(_tracker.region(), [&] {
with_linearized_managed_bytes([&] {
auto i = _partitions.lower_bound(key, cache_entry::compare(_schema));
if (i == _partitions.end() || !i->key().equal(*_schema, key)) {
cache_entry* entry = current_allocator().construct<cache_entry>(
_schema, key, cache_entry::wide_partition_tag{});
_tracker.insert(*entry);
_partitions.insert(i, *entry);
} else {
i->set_wide_partition();
}
});
});
});
}
@@ -783,11 +920,13 @@ future<> row_cache::update(memtable& m, partition_presence_checker presence_chec
// FIXME: keep a bitmap indicating which sstables we do cover, so we don't have to
// search it.
if (cache_i != _partitions.end() && cache_i->key().equal(*_schema, mem_e.key())) {
if (!cache_i->wide_partition()) {
cache_entry& entry = *cache_i;
upgrade_entry(entry);
entry.partition().apply(*_schema, std::move(mem_e.partition()), *mem_e.schema());
_tracker.touch(entry);
_tracker.on_merge();
}
} else if (presence_checker(mem_e.key().key()) ==
partition_presence_checker_result::definitely_doesnt_exist) {
cache_entry* entry = current_allocator().construct<cache_entry>(
@@ -796,7 +935,7 @@ future<> row_cache::update(memtable& m, partition_presence_checker presence_chec
_partitions.insert(cache_i, *entry);
} else {
--cache_i;
cache_i->set_continuous(false);
_tracker.clear_continuity(*cache_i);
}
i = m.partitions.erase(i);
current_allocator().destroy(&mem_e);
@@ -832,10 +971,10 @@ void row_cache::touch(const dht::decorated_key& dk) {
void row_cache::invalidate_locked(const dht::decorated_key& dk) {
auto pos = _partitions.lower_bound(dk, cache_entry::compare(_schema));
if (pos == _partitions.end()) {
_partitions.rbegin()->set_continuous(false);
_tracker.clear_continuity(*_partitions.rbegin());
} else if (!pos->key().equal(*_schema, dk)) {
--pos;
pos->set_continuous(false);
_tracker.clear_continuity(*pos);
} else {
auto end = pos;
++end;
@@ -846,7 +985,7 @@ void row_cache::invalidate_locked(const dht::decorated_key& dk) {
});
assert (it != _partitions.begin());
--it;
it->set_continuous(false);
_tracker.clear_continuity(*it);
}
}
@@ -907,17 +1046,32 @@ void row_cache::invalidate_unwrapped(const query::partition_range& range) {
});
assert(it != _partitions.begin());
--it;
it->set_continuous(false);
_tracker.clear_continuity(*it);
});
}
bool row_cache::has_continuous_entry(const dht::ring_position& key) const {
return with_linearized_managed_bytes([&] {
auto i = _partitions.lower_bound(key, cache_entry::compare(_schema));
if (i == _partitions.end()) {
return _partitions.rbegin()->continuous();
}
if (!i->key().equal(*_schema, key)) {
--i;
return i->continuous();
}
return i->continuous();
});
}
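The predecessor lookup in `has_continuous_entry` above can be sketched with a sorted key list and `bisect` standing in for the partition tree's `lower_bound` (a hypothetical simplification; the real cache always holds a sentinel entry at the minimum token, which is why stepping back one entry is safe):

```python
import bisect

def has_continuous_entry(keys, continuous, key):
    """keys: sorted partition keys; continuous[i]: flag for keys[i].

    Find the first entry >= key; if there is none, fall back to the
    last entry (rbegin); if the entry found is not an exact match,
    the predecessor's continuity flag covers the gap before it.
    """
    i = bisect.bisect_left(keys, key)
    if i == len(keys):
        return continuous[-1]      # past the end: last entry's flag
    if keys[i] != key:
        return continuous[i - 1]   # in a gap: predecessor's flag
    return continuous[i]           # exact match: this entry's flag
```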
row_cache::row_cache(schema_ptr s, mutation_source fallback_factory, key_source underlying_keys,
cache_tracker& tracker)
cache_tracker& tracker, uint64_t max_cached_partition_size_in_bytes)
: _tracker(tracker)
, _schema(std::move(s))
, _partitions(cache_entry::compare(_schema))
, _underlying(std::move(fallback_factory))
, _underlying_keys(std::move(underlying_keys))
, _max_cached_partition_size_in_bytes(max_cached_partition_size_in_bytes)
{
with_allocator(_tracker.allocator(), [this] {
cache_entry* entry = current_allocator().construct<cache_entry>(_schema);
@@ -930,6 +1084,7 @@ cache_entry::cache_entry(cache_entry&& o) noexcept
, _key(std::move(o._key))
, _pe(std::move(o._pe))
, _continuous(o._continuous)
, _wide_partition(o._wide_partition)
, _lru_link()
, _cache_link()
{
@@ -950,11 +1105,29 @@ void row_cache::set_schema(schema_ptr new_schema) noexcept {
_schema = std::move(new_schema);
}
future<streamed_mutation_opt> cache_entry::read_wide(row_cache& rc, schema_ptr s, query::clustering_key_filtering_context ck_filtering, const io_priority_class& pc) {
struct range_and_underlying_reader {
query::partition_range _range;
mutation_reader _reader;
range_and_underlying_reader(row_cache& rc, schema_ptr s, query::partition_range pr,
query::clustering_key_filtering_context ck_filtering, const io_priority_class& pc)
: _range(std::move(pr))
, _reader(rc._underlying(s, _range, ck_filtering, pc))
{ }
};
rc._tracker.on_uncached_wide_partition();
auto pr = query::partition_range::make_singular(_key);
return do_with(range_and_underlying_reader(rc, s, std::move(pr), std::move(ck_filtering), pc), [] (auto& r_a_ur) {
return r_a_ur._reader();
});
}
streamed_mutation cache_entry::read(row_cache& rc, const schema_ptr& s) {
return read(rc, s, query::no_clustering_key_filtering);
}
streamed_mutation cache_entry::read(row_cache& rc, const schema_ptr& s, query::clustering_key_filtering_context ck_filtering) {
assert(!wide_partition());
auto dk = _key.as_decorated_key();
if (_schema->version() != s->version()) {
const query::clustering_row_ranges& ck_ranges = ck_filtering.get_ranges(dk.key());
@@ -973,6 +1146,10 @@ const schema_ptr& row_cache::schema() const {
void row_cache::upgrade_entry(cache_entry& e) {
if (e._schema != _schema) {
if (e.wide_partition()) {
e._schema = _schema;
return;
}
auto& r = _tracker.region();
assert(!r.reclaiming_enabled());
with_allocator(r.allocator(), [this, &e] {

View File

@@ -62,7 +62,8 @@ class cache_entry {
dht::ring_position _key;
partition_entry _pe;
// True when we know that there is nothing between this entry and the next one in cache
bool _continuous;
bool _continuous : 1;
bool _wide_partition : 1;
lru_link_type _lru_link;
cache_link_type _cache_link;
friend class size_calculator;
@@ -73,8 +74,17 @@ public:
cache_entry(schema_ptr s)
: _schema(std::move(s))
, _key(dht::ring_position::starting_at(dht::minimum_token()))
, _pe(_schema)
, _continuous(false)
, _wide_partition(false)
{ }
struct wide_partition_tag{};
cache_entry(schema_ptr s, const dht::decorated_key& key, wide_partition_tag)
: _schema(std::move(s))
, _key(key)
, _continuous(false)
, _wide_partition(true)
{ }
cache_entry(schema_ptr s, const dht::decorated_key& key, const mutation_partition& p, bool continuous = false)
@@ -82,6 +92,7 @@ public:
, _key(key)
, _pe(p)
, _continuous(continuous)
, _wide_partition(false)
{ }
cache_entry(schema_ptr s, dht::decorated_key&& key, mutation_partition&& p, bool continuous = false) noexcept
@@ -89,6 +100,7 @@ public:
, _key(std::move(key))
, _pe(std::move(p))
, _continuous(continuous)
, _wide_partition(false)
{ }
cache_entry(schema_ptr s, dht::decorated_key&& key, partition_entry&& pe, bool continuous = false) noexcept
@@ -96,6 +108,7 @@ public:
, _key(std::move(key))
, _pe(std::move(pe))
, _continuous(continuous)
, _wide_partition(false)
{ }
cache_entry(cache_entry&&) noexcept;
@@ -106,10 +119,19 @@ public:
partition_entry& partition() { return _pe; }
const schema_ptr& schema() const { return _schema; }
schema_ptr& schema() { return _schema; }
// Requires: !wide_partition()
streamed_mutation read(row_cache&, const schema_ptr&);
// Requires: !wide_partition()
streamed_mutation read(row_cache&, const schema_ptr&, query::clustering_key_filtering_context);
// May return a disengaged optional if the partition is empty.
future<streamed_mutation_opt> read_wide(row_cache&, schema_ptr, query::clustering_key_filtering_context, const io_priority_class&);
bool continuous() const { return _continuous; }
void set_continuous(bool value) { _continuous = value; }
bool wide_partition() const { return _wide_partition; }
void set_wide_partition() {
_wide_partition = true;
_pe = {};
}
struct compare {
dht::ring_position_less_comparator _c;
@@ -149,12 +171,14 @@ public:
private:
uint64_t _hits = 0;
uint64_t _misses = 0;
uint64_t _uncached_wide_partitions = 0;
uint64_t _insertions = 0;
uint64_t _merges = 0;
uint64_t _evictions = 0;
uint64_t _removals = 0;
uint64_t _partitions = 0;
uint64_t _modification_count = 0;
uint64_t _continuity_flags_cleared = 0;
std::unique_ptr<scollectd::registrations> _collectd_registrations;
logalloc::region _region;
lru_type _lru;
@@ -166,15 +190,20 @@ public:
void clear();
void touch(cache_entry&);
void insert(cache_entry&);
void clear_continuity(cache_entry& ce);
void on_erase();
void on_merge();
void on_hit();
void on_miss();
void on_uncached_wide_partition();
void on_continuity_flag_cleared();
allocation_strategy& allocator();
logalloc::region& region();
const logalloc::region& region() const;
uint64_t modification_count() const { return _modification_count; }
uint64_t partitions() const { return _partitions; }
uint64_t uncached_wide_partitions() const { return _uncached_wide_partitions; }
uint64_t continuity_flags_cleared() const { return _continuity_flags_cleared; }
};
// Returns a reference to shard-wide cache_tracker.
@@ -211,6 +240,7 @@ private:
partitions_type _partitions; // Cached partitions are complete.
mutation_source _underlying;
key_source _underlying_keys;
uint64_t _max_cached_partition_size_in_bytes;
// Synchronizes populating reads with updates of underlying data source to ensure that cache
// remains consistent across flushes with the underlying data source.
@@ -231,6 +261,7 @@ private:
query::clustering_key_filtering_context ck_filtering);
void on_hit();
void on_miss();
void on_uncached_wide_partition();
void upgrade_entry(cache_entry&);
void invalidate_locked(const dht::decorated_key&);
void invalidate_unwrapped(const query::partition_range&);
@@ -238,7 +269,7 @@ private:
static thread_local seastar::thread_scheduling_group _update_thread_scheduling_group;
public:
~row_cache();
row_cache(schema_ptr, mutation_source underlying, key_source, cache_tracker&);
row_cache(schema_ptr, mutation_source underlying, key_source, cache_tracker&, uint64_t _max_cached_partition_size_in_bytes = 10 * 1024 * 1024);
row_cache(row_cache&&) = default;
row_cache(const row_cache&) = delete;
row_cache& operator=(row_cache&&) = default;
@@ -258,6 +289,9 @@ public:
// information there is for its partition in the underlying data sources.
void populate(const mutation& m);
// Caches the information that the partition with the given key is wide.
void mark_partition_as_wide(const dht::decorated_key& key);
// Clears the cache.
// Guarantees that cache will not be populated using readers created
// before this method was invoked.
@@ -289,6 +323,8 @@ public:
// The range must be kept alive until method resolves.
future<> invalidate(const query::partition_range&);
bool has_continuous_entry(const dht::ring_position& key) const;
auto num_entries() const {
return _partitions.size();
}

View File

@@ -56,6 +56,14 @@ sstring to_sstring(index_type t) {
throw std::invalid_argument("unknown index type");
}
bool is_regular(column_kind k) {
return k == column_kind::regular_column || k == column_kind::compact_column;
}
bool is_compatible(column_kind k1, column_kind k2) {
return k1 == k2 || (is_regular(k1) && is_regular(k2));
}
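The two helpers just added define the equivalence classes used when matching columns across schema versions: regular and compact columns are interchangeable, every other kind only matches itself. A short Python mirror of that rule (enum names are illustrative, not the C++ enumerators):

```python
from enum import Enum

class ColumnKind(Enum):
    PARTITION_KEY = 0
    CLUSTERING_KEY = 1
    STATIC = 2
    REGULAR = 3
    COMPACT = 4

def is_regular(k):
    # Both plain regular columns and COMPACT STORAGE value columns count.
    return k in (ColumnKind.REGULAR, ColumnKind.COMPACT)

def is_compatible(k1, k2):
    # Kinds match exactly, or both fall in the regular family.
    return k1 == k2 or (is_regular(k1) and is_regular(k2))
```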
column_mapping_entry::column_mapping_entry(bytes name, sstring type_name)
: _name(std::move(name))
, _type(db::marshal::type_parser::parse(type_name))
@@ -635,51 +643,60 @@ schema_builder& schema_builder::with_version(table_schema_version v) {
return *this;
}
schema_ptr schema_builder::build() {
if (_version) {
_raw._version = *_version;
} else {
_raw._version = utils::UUID_gen::get_time_UUID();
}
void schema_builder::prepare_dense_schema(schema::raw_schema& raw) {
if (raw._is_dense) {
auto regular_cols = boost::copy_range<std::vector<column_definition*>>(
raw._columns | boost::adaptors::filtered([](auto&& col) { return col.is_regular(); })
| boost::adaptors::transformed([](auto&& col) { return &col; }));
if (!_compact_storage) {
return make_lw_shared<schema>(schema(_raw));
}
schema s(_raw);
// Dense means that no part of the comparator stores a CQL column name. This means
// COMPACT STORAGE with at least one columnAliases (otherwise it's a thrift "static" CF).
s._raw._is_dense = (*_compact_storage == compact_storage::yes) && (s.clustering_key_size() > 0);
if (s.clustering_key_size() == 0) {
if (*_compact_storage == compact_storage::yes) {
s._raw._is_compound = false;
} else {
s._raw._is_compound = true;
}
} else {
if ((*_compact_storage == compact_storage::yes) && s.clustering_key_size() == 1) {
s._raw._is_compound = false;
} else {
s._raw._is_compound = true;
}
}
if (s._raw._is_dense) {
// In Origin, dense CFs always have at least one regular column
if (s.regular_columns_count() == 0) {
s._raw._columns.emplace_back(bytes(""), s.regular_column_name_type(), column_kind::regular_column, 0, index_info());
if (regular_cols.empty()) {
raw._columns.emplace_back(bytes(""), raw._regular_column_name_type, column_kind::compact_column, 0, index_info());
return;
}
if (s.regular_columns_count() != 1) {
throw exceptions::configuration_exception(sprint("Expecting exactly one regular column. Found %d", s.regular_columns_count()));
if (regular_cols.size() != 1) {
throw exceptions::configuration_exception(sprint("Expecting exactly one regular column. Found %d", regular_cols.size()));
}
s._raw._columns.at(s.column_offset(column_kind::regular_column)).kind = column_kind::compact_column;
regular_cols[0]->kind = column_kind::compact_column;
}
// We need to rebuild the schema in case we added some column. This is way simpler than trying to factor out the relevant code
// from the constructor
return make_lw_shared<schema>(schema(s._raw));
}
schema_ptr schema_builder::build() {
schema::raw_schema new_raw = _raw; // Copy so that build() remains idempotent.
if (_version) {
new_raw._version = *_version;
} else {
new_raw._version = utils::UUID_gen::get_time_UUID();
}
if (_compact_storage) {
// Dense means that no part of the comparator stores a CQL column name. This means
// COMPACT STORAGE with at least one columnAliases (otherwise it's a thrift "static" CF).
auto clustering_key_size = std::count_if(new_raw._columns.begin(), new_raw._columns.end(), [](auto&& col) {
return col.kind == column_kind::clustering_key;
});
new_raw._is_dense = (*_compact_storage == compact_storage::yes) && (clustering_key_size > 0);
if (clustering_key_size == 0) {
if (*_compact_storage == compact_storage::yes) {
new_raw._is_compound = false;
} else {
new_raw._is_compound = true;
}
} else {
if ((*_compact_storage == compact_storage::yes) && clustering_key_size == 1) {
new_raw._is_compound = false;
} else {
new_raw._is_compound = true;
}
}
}
prepare_dense_schema(new_raw);
return make_lw_shared<schema>(schema(new_raw));
}
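The copy of `_raw` above is what makes `build()` idempotent: each call mutates a fresh copy, so a second `build()` (or one with different options) starts from the same base state instead of seeing the first call's derived changes. A minimal Python sketch of the pattern, with hypothetical field names:

```python
import copy

class SchemaBuilder:
    """Builder whose build() never mutates the builder's own state."""

    def __init__(self):
        self._raw = {"columns": [], "version": None}

    def with_column(self, name):
        self._raw["columns"].append(name)
        return self

    def build(self, version=None):
        # Copy first, then mutate the copy: build() stays idempotent
        # and derived state never leaks back into the builder.
        new_raw = copy.deepcopy(self._raw)
        new_raw["version"] = version or "generated"
        return new_raw
```

Calling `build()` twice on the same builder yields two independent results, and the builder's own `_raw` is left untouched.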
schema_ptr schema_builder::build(compact_storage cp) {

View File

@@ -72,6 +72,8 @@ void read_collections(schema_builder& builder, sstring comparator);
enum class column_kind { partition_key, clustering_key, static_column, regular_column, compact_column };
sstring to_sstring(column_kind k);
bool is_regular(column_kind k);
bool is_compatible(column_kind k1, column_kind k2);
// CMH this is also manually defined in thrift gen file.
enum class index_type {
@@ -225,7 +227,7 @@ public:
index_info idx_info;
bool is_static() const { return kind == column_kind::static_column; }
bool is_regular() const { return kind == column_kind::regular_column || kind == column_kind::compact_column; }
bool is_regular() const { return ::is_regular(kind); }
bool is_partition_key() const { return kind == column_kind::partition_key; }
bool is_clustering_key() const { return kind == column_kind::clustering_key; }
bool is_primary_key() const { return kind == column_kind::partition_key || kind == column_kind::clustering_key; }

View File

@@ -220,4 +220,6 @@ public:
schema_ptr build(compact_storage cp);
schema_ptr build();
private:
void prepare_dense_schema(schema::raw_schema& raw);
};

scripts/scylla_current_repo Executable file
View File

@@ -0,0 +1,37 @@
#!/bin/bash
VERSION=$(./SCYLLA-VERSION-GEN)
SCYLLA_VERSION=$(cat build/SCYLLA-VERSION-FILE)
SCYLLA_RELEASE=$(cat build/SCYLLA-RELEASE-FILE)
. /etc/os-release
if [ "$SCYLLA_VERSION" = "666.development" ]; then
if [ "$ID" = "ubuntu" ]; then
CODENAME=`lsb_release -c|awk '{print $2}'`
if [ "$CODENAME" = "trusty" ]; then
CODENAME=ubuntu
fi
echo https://downloads.scylladb.com/deb/unstable/$CODENAME/master/latest/scylla.list
elif [ "$ID" = "centos" ]; then
echo https://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo
elif [ "$ID" = "fedora" ]; then
echo https://downloads.scylladb.com/rpm/unstable/fedora/master/latest/scylla.repo
else
echo "Unsupported distribution."
exit 1
fi
else
REPO_VERSION=$(echo $SCYLLA_VERSION |sed -e "s/^\([0-9]*\.[0-9]*\).*/\1/")
if [ "$ID" = "ubuntu" ]; then
CODENAME=`lsb_release -c|awk '{print $2}'`
echo http://downloads.scylladb.com/deb/ubuntu/scylla-$REPO_VERSION-$CODENAME.list
elif [ "$ID" = "centos" ]; then
echo http://downloads.scylladb.com/rpm/centos/scylla-$REPO_VERSION.repo
elif [ "$ID" = "fedora" ]; then
echo http://downloads.scylladb.com/rpm/fedora/scylla-$REPO_VERSION.repo
else
echo "Unsupported distribution."
exit 1
fi
fi
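The `sed` expression in the script reduces a full release string to the `major.minor` prefix used in the repo URL. The same extraction expressed in Python (assumed equivalent; note that a non-numeric version like `666.development` falls through unchanged, matching the script's separate handling of that case):

```python
import re

def repo_version(scylla_version):
    """Reduce e.g. '1.3.3' to its 'major.minor' prefix, mirroring
    sed -e "s/^\\([0-9]*\\.[0-9]*\\).*/\\1/" from the script."""
    m = re.match(r"^(\d+\.\d+)", scylla_version)
    return m.group(1) if m else scylla_version
```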

View File

@@ -10,7 +10,7 @@ fi
print_usage() {
echo "scylla_install_pkg --local-pkg /home/scylla/rpms --repo [URL]"
echo " --local-pkg install locally built .rpm/.deb on specified directory"
echo " --repo specify repository URL"
echo " --repo specify .repo/.list file URL"
exit 1
}
@@ -42,10 +42,8 @@ if [ "$ID" = "ubuntu" ]; then
chmod +x /usr/sbin/policy-rc.d
cp /etc/hosts /etc/hosts.orig
echo 127.0.0.1 `hostname` >> /etc/hosts
if [ "$REPO" = "" ]; then
echo "deb http://s3.amazonaws.com/downloads.scylladb.com/deb/ubuntu trusty/scylladb multiverse" > /etc/apt/sources.list.d/scylla.list
else
echo "deb $REPO trusty/scylladb multiverse" > /etc/apt/sources.list.d/scylla.list
if [ "$REPO" != "" ]; then
curl -o /etc/apt/sources.list.d/scylla.list $REPO
fi
apt-get update
if [ "$LOCAL_PKG" = "" ]; then
@@ -54,27 +52,25 @@ if [ "$ID" = "ubuntu" ]; then
if [ ! -f /usr/bin/gdebi ]; then
apt-get install -y --force-yes gdebi-core
fi
echo Y | gdebi $LOCAL_PKG/scylla-kernel-conf*.deb
echo Y | gdebi $LOCAL_PKG/scylla-conf*.deb
echo Y | gdebi $LOCAL_PKG/scylla-server*.deb
echo Y | gdebi $LOCAL_PKG/scylla-server_*.deb
echo Y | gdebi $LOCAL_PKG/scylla-server-dbg*.deb
echo Y | gdebi $LOCAL_PKG/scylla-jmx*.deb
echo Y | gdebi $LOCAL_PKG/scylla-tools*.deb
echo Y | gdebi $LOCAL_PKG/scylla_*.deb
fi
mv /etc/hosts.orig /etc/hosts
rm /usr/sbin/policy-rc.d
else
if [ "$ID" = "fedora" ]; then
if [ "$REPO" = "" ]; then
curl http://downloads.scylladb.com/rpm/fedora/scylla.repo > /etc/yum.repos.d/scylla.repo
else
curl $REPO > /etc/yum.repos.d/scylla.repo
fi
elif [ "$ID" = "centos" ] || [ "$ID" = "rhel" ]; then
if [ "$REPO" = "" ]; then
curl http://downloads.scylladb.com/rpm/centos/scylla.repo > /etc/yum.repos.d/scylla.repo
else
curl $REPO > /etc/yum.repos.d/scylla.repo
fi
yum install -y epel-release
if [ "$REPO" != "" ]; then
curl -o /etc/yum.repos.d/scylla.repo $REPO
fi
if [ "$ID" = "centos" ]; then
yum install -y epel-release
elif [ "$ID" = "rhel" ]; then
rpm -ivh http://download.fedoraproject.org/pub/epel/7/x86_64/e/epel-release-7-7.noarch.rpm
else
echo "Unsupported distribution"
exit 1
@@ -83,6 +79,6 @@ else
if [ "$LOCAL_PKG" = "" ]; then
yum install -y scylla
else
yum install -y $LOCAL_PKG/scylla-conf*.x86_64.rpm $LOCAL_PKG/scylla-server*.x86_64.rpm $LOCAL_PKG/scylla-jmx*.noarch.rpm $LOCAL_PKG/scylla-tools*.noarch.rpm
yum install -y $LOCAL_PKG/scylla-*.*.rpm
fi
fi

View File

@@ -1,4 +1,4 @@
#!/usr/bin/python3
#!/usr/bin/python
#
# Copyright (C) 2016 ScyllaDB
#
@@ -19,11 +19,18 @@
# You should have received a copy of the GNU General Public License
# along with Scylla. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import print_function
import argparse
import json
import urllib
import urllib2
import requests
import ConfigParser
import os
import sys
import subprocess
from pkg_resources import parse_version
VERSION = "1.0"
quiet = False
@@ -39,29 +46,42 @@ def traceln(*vals):
def help(args):
parser.print_help()
def sh_command(*args):
p = subprocess.Popen(args, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
out, err = p.communicate()
if err:
raise Exception(err)
return out
def get_json_from_url(path):
req = urllib.request.Request(path)
try:
response = urllib.request.urlopen(req)
data = response.read()
encoding = response.info().get_content_charset('utf-8')
return json.loads(data.decode(encoding))
except urllib.error.URLError as e:
pass
return ""
data = sh_command("curl", "-s", "-X", "GET", path)
return json.loads(data)
def get_api(path):
return get_json_from_url("http://localhost:10000" + path)
def version_compare(a, b):
return parse_version(a) < parse_version(b)
def check_version(ar):
if config and (not config.has_option("housekeeping", "check-version") or not config.getboolean("housekeeping", "check-version")):
return
current_version = get_api('/storage_service/scylla_release_version')
latest_version = get_json_from_url(version_url)["version"]
if current_version != latest_version:
if current_version == "":
# API is down, nothing to do
return
try:
latest_version = get_json_from_url(version_url + "?version=" + current_version)["version"]
except:
traceln("Unable to retrieve version information")
return
if version_compare(current_version, latest_version):
traceln("A new version was found, current version=", current_version, " latest version=", latest_version)
parser = argparse.ArgumentParser(description='ScyllaDB help report tool', conflict_handler="resolve")
parser.add_argument('-q', '--quiet', action='store_true', default=False, help='Quiet mode')
parser.add_argument('-c', '--config', default="", help='An optional config file. Specifying a missing file will terminate the script')
subparsers = parser.add_subparsers(help='Available commands')
parser_help = subparsers.add_parser('help', help='Display help information')
@@ -71,4 +91,11 @@ parser_system.set_defaults(func=check_version)
args = parser.parse_args()
quiet = args.quiet
config = None
if args.config != "":
if not os.path.isfile(args.config):
traceln("Config file ", args.config, " is missing, terminating")
sys.exit(0)
config = ConfigParser.SafeConfigParser()
config.read(args.config)
args.func(args)
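The switch to `parse_version` in `version_compare` above matters because plain string comparison mis-orders multi-digit components. The pitfall, shown with a hand-rolled numeric compare for illustration (this is not the script's implementation, which delegates to `pkg_resources`):

```python
def numeric_version_lt(a, b):
    """Compare dotted numeric versions component by component."""
    ta = tuple(int(x) for x in a.split("."))
    tb = tuple(int(x) for x in b.split("."))
    return ta < tb

# String comparison gets "1.3.10" vs "1.3.9" wrong:
# "1.3.10" < "1.3.9" lexicographically, but 10 > 9 numerically.
```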

Submodule seastar updated: 103543aca1...b62d7a54f3

View File

@@ -124,6 +124,43 @@ private:
logger.trace("Result ranges {}", ranges);
};
// Because of #1446 we don't have a comparator to use with
// range<clustering_key_prefix> which would produce correct results.
// This means we cannot reuse the same logic for dealing with
// partition and clustering keys.
auto modify_ck_ranges = [reversed] (const schema& s, auto& ranges, auto& lo) {
typedef typename std::remove_reference_t<decltype(ranges)>::value_type range_type;
typedef typename range_type::bound bound_type;
auto cmp = [reversed, bv_cmp = bound_view::compare(s)] (const auto& a, const auto& b) {
return reversed ? bv_cmp(b, a) : bv_cmp(a, b);
};
auto start_bound = [reversed] (const auto& range) -> const bound_view& {
return reversed ? range.second : range.first;
};
auto end_bound = [reversed] (const auto& range) -> const bound_view& {
return reversed ? range.first : range.second;
};
clustering_key_prefix::equality eq(s);
auto it = ranges.begin();
while (it != ranges.end()) {
auto range = bound_view::from_range(*it);
if (cmp(end_bound(range), lo) || eq(end_bound(range).prefix, lo)) {
logger.trace("Remove ck range {}", *it);
it = ranges.erase(it);
continue;
} else if (cmp(start_bound(range), lo)) {
assert(cmp(lo, end_bound(range)));
auto r = reversed ? range_type(it->start(), bound_type { lo, false })
: range_type(bound_type { lo, false }, it->end());
logger.trace("Modify ck range {} -> {}", *it, r);
*it = std::move(r);
}
++it;
}
};
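The loop above trims clustering ranges against the last-seen key `lo`: ranges ending at or before `lo` are dropped, and a range straddling `lo` is shrunk so its start becomes an exclusive bound just past `lo`. With plain numeric intervals the same logic looks like this (a hypothetical simplification: forward order only, and the exclusive start is modelled by storing `lo` itself as the new start):

```python
def trim_ranges(ranges, lo):
    """ranges: sorted list of (start, end) intervals; drop intervals
    fully consumed by previous pages (end <= lo) and clip an interval
    straddling lo to (lo, end], i.e. an exclusive start at lo."""
    out = []
    for start, end in ranges:
        if end <= lo:
            continue               # fully behind the paging position
        if start <= lo:
            out.append((lo, end))  # straddles lo: clip, start exclusive
        else:
            out.append((start, end))
    return out
```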
// The last ck can be empty depending on whether we deserialized
// state or not. An empty ck means the last page ended on something
// not bound by a clustering key (i.e. a static row alone).
@@ -136,15 +173,7 @@ private:
if (has_ck) {
query::clustering_row_ranges row_ranges = _cmd->slice.default_row_ranges();
clustering_key_prefix ckp = clustering_key_prefix::from_exploded(*_schema, _last_ckey->explode(*_schema));
clustering_key_prefix::less_compare cmp_rt(*_schema);
modify_ranges(row_ranges, ckp, false, [&cmp_rt](auto& c1, auto c2) {
if (cmp_rt(c1, c2)) {
return -1;
} else if (cmp_rt(c2, c1)) {
return 1;
}
return 0;
});
modify_ck_ranges(*_schema, row_ranges, ckp);
_cmd->slice.set_range(*_schema, *_last_pkey, row_ranges);
}
@@ -166,7 +195,8 @@ private:
);
auto ranges = _ranges;
return get_local_storage_proxy().query(_schema, _cmd, std::move(ranges),
auto command = ::make_lw_shared<query::read_command>(*_cmd);
return get_local_storage_proxy().query(_schema, std::move(command), std::move(ranges),
_options.get_consistency(), _state.get_trace_state()).then(
[this, &builder, page_size, now](foreign_ptr<lw_shared_ptr<query::result>> results) {
handle_result(builder, std::move(results), page_size, now);

View File

@@ -2128,10 +2128,13 @@ protected:
future<foreign_ptr<lw_shared_ptr<reconcilable_result>>> make_mutation_data_request(lw_shared_ptr<query::read_command> cmd, gms::inet_address ep, clock_type::time_point timeout) {
++_proxy->_stats.mutation_data_read_attempts.get_ep_stat(ep);
if (is_me(ep)) {
tracing::trace(_trace_state, "read_mutation_data: querying locally");
return _proxy->query_mutations_locally(_schema, cmd, _partition_range);
} else {
auto& ms = net::get_local_messaging_service();
return ms.send_read_mutation_data(net::messaging_service::msg_addr{ep, 0}, timeout, *cmd, _partition_range).then([this](reconcilable_result&& result) {
tracing::trace(_trace_state, "read_mutation_data: sending a message to /{}", ep);
return ms.send_read_mutation_data(net::messaging_service::msg_addr{ep, 0}, timeout, *cmd, _partition_range).then([this, ep](reconcilable_result&& result) {
tracing::trace(_trace_state, "read_mutation_data: got response from /{}", ep);
return make_foreign(::make_lw_shared<reconcilable_result>(std::move(result)));
});
}
@@ -2139,10 +2142,13 @@ protected:
future<foreign_ptr<lw_shared_ptr<query::result>>> make_data_request(gms::inet_address ep, clock_type::time_point timeout) {
++_proxy->_stats.data_read_attempts.get_ep_stat(ep);
if (is_me(ep)) {
tracing::trace(_trace_state, "read_data: querying locally");
return _proxy->query_singular_local(_schema, _cmd, _partition_range);
} else {
auto& ms = net::get_local_messaging_service();
return ms.send_read_data(net::messaging_service::msg_addr{ep, 0}, timeout, *_cmd, _partition_range).then([this](query::result&& result) {
tracing::trace(_trace_state, "read_data: sending a message to /{}", ep);
return ms.send_read_data(net::messaging_service::msg_addr{ep, 0}, timeout, *_cmd, _partition_range).then([this, ep](query::result&& result) {
tracing::trace(_trace_state, "read_data: got response from /{}", ep);
return make_foreign(::make_lw_shared<query::result>(std::move(result)));
});
}
@@ -2150,10 +2156,13 @@ protected:
future<query::result_digest, api::timestamp_type> make_digest_request(gms::inet_address ep, clock_type::time_point timeout) {
++_proxy->_stats.digest_read_attempts.get_ep_stat(ep);
if (is_me(ep)) {
tracing::trace(_trace_state, "read_digest: querying locally");
return _proxy->query_singular_local_digest(_schema, _cmd, _partition_range);
} else {
auto& ms = net::get_local_messaging_service();
return ms.send_read_digest(net::messaging_service::msg_addr{ep, 0}, timeout, *_cmd, _partition_range).then([] (query::result_digest d, rpc::optional<api::timestamp_type> t) {
tracing::trace(_trace_state, "read_digest: sending a message to /{}", ep);
return ms.send_read_digest(net::messaging_service::msg_addr{ep, 0}, timeout, *_cmd, _partition_range).then([this, ep] (query::result_digest d, rpc::optional<api::timestamp_type> t) {
tracing::trace(_trace_state, "read_digest: got response from /{}", ep);
return make_ready_future<query::result_digest, api::timestamp_type>(d, t ? t.value() : api::missing_timestamp);
});
}
@@ -2692,7 +2701,9 @@ storage_proxy::do_query(schema_ptr s,
return make_ready_future<foreign_ptr<lw_shared_ptr<query::result>>>(make_foreign(make_lw_shared<query::result>()));
};
if (partition_ranges.empty()) {
auto& slice = cmd->slice;
if (partition_ranges.empty() ||
(slice.default_row_ranges().empty() && !slice.get_specific_ranges())) {
return make_empty();
}
utils::latency_counter lc;
@@ -3271,10 +3282,11 @@ void storage_proxy::init_messaging_service() {
}
return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), [&cinfo, cmd = make_lw_shared<query::read_command>(std::move(cmd)), src_addr = std::move(src_addr)] (const query::partition_range& pr, shared_ptr<storage_proxy>& p, tracing::trace_state_ptr& trace_state_ptr) mutable {
auto src_ip = src_addr.addr;
return get_schema_for_read(cmd->schema_version, std::move(src_addr)).then([cmd, &pr, &p] (schema_ptr s) {
return p->query_singular_local(std::move(s), cmd, pr);
}).finally([&trace_state_ptr] () mutable {
tracing::trace(trace_state_ptr, "read_data handling is done");
}).finally([&trace_state_ptr, src_ip] () mutable {
tracing::trace(trace_state_ptr, "read_data handling is done, sending a response to /{}", src_ip);
});
});
});
@@ -3287,10 +3299,11 @@ void storage_proxy::init_messaging_service() {
tracing::trace(trace_state_ptr, "read_mutation_data: message received from /{}", src_addr.addr);
}
return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), [&cinfo, cmd = make_lw_shared<query::read_command>(std::move(cmd)), src_addr = std::move(src_addr)] (const query::partition_range& pr, shared_ptr<storage_proxy>& p, tracing::trace_state_ptr& trace_state_ptr) mutable {
auto src_ip = src_addr.addr;
return get_schema_for_read(cmd->schema_version, std::move(src_addr)).then([cmd, &pr, &p] (schema_ptr s) {
return p->query_mutations_locally(std::move(s), cmd, pr);
}).finally([&trace_state_ptr] () mutable {
tracing::trace(trace_state_ptr, "read_mutation_data handling is done");
}).finally([&trace_state_ptr, src_ip] () mutable {
tracing::trace(trace_state_ptr, "read_mutation_data handling is done, sending a response to /{}", src_ip);
});
});
});
@@ -3303,10 +3316,11 @@ void storage_proxy::init_messaging_service() {
tracing::trace(trace_state_ptr, "read_digest: message received from /{}", src_addr.addr);
}
return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), [&cinfo, cmd = make_lw_shared<query::read_command>(std::move(cmd)), src_addr = std::move(src_addr)] (const query::partition_range& pr, shared_ptr<storage_proxy>& p, tracing::trace_state_ptr& trace_state_ptr) mutable {
auto src_ip = src_addr.addr;
return get_schema_for_read(cmd->schema_version, std::move(src_addr)).then([cmd, &pr, &p] (schema_ptr s) {
return p->query_singular_local_digest(std::move(s), cmd, pr);
}).finally([&trace_state_ptr] () mutable {
tracing::trace(trace_state_ptr, "read_digest handling is done");
}).finally([&trace_state_ptr, src_ip] () mutable {
tracing::trace(trace_state_ptr, "read_digest handling is done, sending a response to /{}", src_ip);
});
});
});
@@ -3394,7 +3408,7 @@ public:
boost::range::make_heap(_runs, cmp);
return repeat_until_value([this, cmp = std::move(cmp), partitions = std::vector<partition>(), row_count = 0u] () mutable {
return repeat_until_value([this, cmp = std::move(cmp), partitions = std::vector<partition>(), row_count = 0u, partition_count = 0u] () mutable {
std::experimental::optional<reconcilable_result> ret;
boost::range::pop_heap(_runs, cmp);
@@ -3414,6 +3428,7 @@ public:
partitions.push_back(p);
row_count += p._row_count;
}
partition_count += p._row_count > 0;
if (row_count < _cmd->row_limit) {
next.advance();
if (next.has_more()) {
@@ -3422,7 +3437,7 @@ public:
_runs.pop_back();
}
}
if (_runs.empty() || row_count >= _cmd->row_limit) {
if (_runs.empty() || row_count >= _cmd->row_limit || partition_count >= _cmd->partition_limit) {
ret = reconcilable_result(row_count, std::move(partitions));
}
return make_ready_future<std::experimental::optional<reconcilable_result>>(std::move(ret));

@@ -297,6 +297,9 @@ public:
*
* Partitions for each range will be ordered according to decorated_key ordering. Results for
* each range from "partition_ranges" may appear in any order.
*
 * IMPORTANT: Not all fibers started by this method are guaranteed to have completed by the time
 * it returns, so no parameter may be modified after it has been passed to this method.
*/
future<foreign_ptr<lw_shared_ptr<query::result>>> query(schema_ptr,
lw_shared_ptr<query::read_command> cmd,

@@ -559,9 +559,9 @@ public:
auto orig_map = get_range_to_address_map(keyspace, get_tokens_in_local_dc());
std::unordered_map<range<token>, std::vector<inet_address>> filtered_map;
for (auto entry : orig_map) {
filtered_map[entry.first].reserve(entry.second.size());
std::remove_copy_if(entry.second.begin(), entry.second.end(),
filtered_map[entry.first].begin(), filter);
auto& addresses = filtered_map[entry.first];
addresses.reserve(entry.second.size());
std::copy_if(entry.second.begin(), entry.second.end(), std::back_inserter(addresses), filter);
}
return filtered_map;
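The hunk above replaces a `std::remove_copy_if` that wrote through `begin()` of an empty vector (`reserve()` raises capacity, not size, so that write is undefined behavior) with `std::copy_if` into a `std::back_inserter`, which grows the vector as it copies. A minimal standalone sketch of the corrected pattern (the `keep_even` function and its predicate are illustrative, not from the patch):

```cpp
#include <algorithm>
#include <iterator>
#include <vector>

// reserve() only allocates capacity; the vector is still empty, so
// copying through begin() would write past the end. back_inserter
// grows the vector as elements satisfying the predicate are copied.
std::vector<int> keep_even(const std::vector<int>& in) {
    std::vector<int> out;
    out.reserve(in.size());
    std::copy_if(in.begin(), in.end(), std::back_inserter(out),
                 [] (int v) { return v % 2 == 0; });
    return out;
}
```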

@@ -40,67 +40,37 @@
#pragma once
#include "core/sstring.hh"
#include "schema.hh"
#include "compound_compat.hh"
#include <cmath>
#include <algorithm>
#include <vector>
class column_name_helper {
private:
static void may_grow(std::vector<bytes>& v, size_t target_size) {
static inline void may_grow(std::vector<bytes_opt>& v, size_t target_size) {
if (target_size > v.size()) {
v.resize(target_size);
}
}
public:
static void min_max_components(std::vector<bytes>& min_seen, std::vector<bytes>& max_seen, const std::vector<bytes_view>& column_names) {
may_grow(min_seen, column_names.size());
may_grow(max_seen, column_names.size());
template <typename T>
static void min_max_components(const schema& schema, std::vector<bytes_opt>& min_seen, std::vector<bytes_opt>& max_seen, T components) {
may_grow(min_seen, schema.clustering_key_size());
may_grow(max_seen, schema.clustering_key_size());
for (auto i = 0U; i < column_names.size(); i++) {
auto& name = column_names[i];
if (max_seen[i].size() == 0 || name > bytes_view(max_seen[i])) {
max_seen[i] = bytes(name.data(), name.size());
auto& types = schema.clustering_key_type()->types();
auto i = 0U;
for (auto& value : components) {
auto& type = types[i];
if (!max_seen[i] || type->compare(value, max_seen[i].value()) > 0) {
max_seen[i] = bytes(value.data(), value.size());
}
if (min_seen[i].size() == 0 || name < bytes_view(min_seen[i])) {
min_seen[i] = bytes(name.data(), name.size());
}
}
}
static void merge_max_components(std::vector<bytes>& to, std::vector<bytes>&& from) {
if (to.empty()) {
to = std::move(from);
return;
}
if (from.empty()) {
return;
}
may_grow(to, from.size());
for (auto i = 0U; i < from.size(); i++) {
if (to[i].size() == 0 || bytes_view(from[i]) > bytes_view(to[i])) {
to[i] = std::move(from[i]);
}
}
}
static void merge_min_components(std::vector<bytes>& to, std::vector<bytes>&& from) {
if (to.empty()) {
to = std::move(from);
}
if (from.empty()) {
return;
}
may_grow(to, from.size());
for (auto i = 0U; i < from.size(); i++) {
if (to[i].size() == 0 || bytes_view(from[i]) < bytes_view(to[i])) {
to[i] = std::move(from[i]);
if (!min_seen[i] || type->compare(value, min_seen[i].value()) < 0) {
min_seen[i] = bytes(value.data(), value.size());
}
i++;
}
}
};
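The rewritten `min_max_components` tracks per-component minima and maxima, growing the vectors on demand, and uses optionals to distinguish "nothing seen yet" from an empty value. A standalone sketch of the same pattern, with `std::optional<std::string>` standing in for `bytes_opt` and lexicographic comparison standing in for the per-type comparator (names are illustrative):

```cpp
#include <optional>
#include <string>
#include <vector>

// Track per-component minima and maxima across successive component
// lists. An empty optional means "no value seen yet", which is
// distinct from having seen an empty value.
void track_min_max(std::vector<std::optional<std::string>>& min_seen,
                   std::vector<std::optional<std::string>>& max_seen,
                   const std::vector<std::string>& components) {
    if (components.size() > min_seen.size()) {
        min_seen.resize(components.size());
        max_seen.resize(components.size());
    }
    for (size_t i = 0; i < components.size(); ++i) {
        if (!min_seen[i] || components[i] < *min_seen[i]) {
            min_seen[i] = components[i];
        }
        if (!max_seen[i] || components[i] > *max_seen[i]) {
            max_seen[i] = components[i];
        }
    }
}
```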

@@ -262,10 +262,10 @@ public:
// We received more data than we actually care about, so process
// the beginning of the buffer, and return the rest to the stream
auto segment = data.share(0, _remain);
process(segment);
auto ret = process(segment);
data.trim_front(_remain - segment.size());
_remain -= (_remain - segment.size());
if (_remain == 0) {
if (_remain == 0 && ret == proceed::yes) {
verify_end_state();
}
return make_ready_future<unconsumed_remainder>(std::move(data));

@@ -48,24 +48,93 @@
#include <iterator>
#include "sstables.hh"
#include "compaction.hh"
#include "timestamp.hh"
#include "cql3/statements/property_definitions.hh"
static constexpr double DEFAULT_MAX_SSTABLE_AGE_DAYS = 365;
static constexpr int64_t DEFAULT_BASE_TIME_SECONDS = 60;
struct duration_conversor {
// Convert the given duration to TargetDuration and return its count as a timestamp.
template <typename TargetDuration, typename SourceDuration>
static api::timestamp_type convert(SourceDuration d) {
return std::chrono::duration_cast<TargetDuration>(d).count();
}
// Convert the given duration to the unit named by the string
// target_duration, and return its count as a timestamp.
template <typename SourceDuration>
static api::timestamp_type convert(const sstring& target_duration, SourceDuration d) {
if (target_duration == "HOURS") {
return convert<std::chrono::hours>(d);
} else if (target_duration == "MICROSECONDS") {
return convert<std::chrono::microseconds>(d);
} else if (target_duration == "MILLISECONDS") {
return convert<std::chrono::milliseconds>(d);
} else if (target_duration == "MINUTES") {
return convert<std::chrono::minutes>(d);
} else if (target_duration == "NANOSECONDS") {
return convert<std::chrono::nanoseconds>(d);
} else if (target_duration == "SECONDS") {
return convert<std::chrono::seconds>(d);
} else {
throw std::runtime_error(sprint("target duration %s is not available", target_duration));
}
}
};
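The string dispatch above ultimately reduces to a `std::chrono::duration_cast` followed by `.count()`. A minimal sketch of that core conversion (the `as_count` name is illustrative):

```cpp
#include <chrono>
#include <cstdint>

// Convert a duration to a count in the target resolution, as the
// date-tiered options code does when normalizing max_sstable_age
// and base_time to the configured timestamp resolution.
template <typename TargetDuration, typename SourceDuration>
int64_t as_count(SourceDuration d) {
    return std::chrono::duration_cast<TargetDuration>(d).count();
}
```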
class date_tiered_compaction_strategy_options {
const sstring DEFAULT_TIMESTAMP_RESOLUTION = "MICROSECONDS";
const sstring TIMESTAMP_RESOLUTION_KEY = "timestamp_resolution";
const sstring MAX_SSTABLE_AGE_KEY = "max_sstable_age_days";
const sstring BASE_TIME_KEY = "base_time_seconds";
api::timestamp_type max_sstable_age;
api::timestamp_type base_time;
public:
date_tiered_compaction_strategy_options(const std::map<sstring, sstring>& options) {
using namespace cql3::statements;
auto tmp_value = get_value(options, TIMESTAMP_RESOLUTION_KEY);
auto target_unit = tmp_value ? tmp_value.value() : DEFAULT_TIMESTAMP_RESOLUTION;
tmp_value = get_value(options, MAX_SSTABLE_AGE_KEY);
auto fractional_days = property_definitions::to_double(MAX_SSTABLE_AGE_KEY, tmp_value, DEFAULT_MAX_SSTABLE_AGE_DAYS);
int64_t max_sstable_age_in_hours = std::lround(fractional_days * 24);
max_sstable_age = duration_conversor::convert(target_unit, std::chrono::hours(max_sstable_age_in_hours));
tmp_value = get_value(options, BASE_TIME_KEY);
auto base_time_seconds = property_definitions::to_long(BASE_TIME_KEY, tmp_value, DEFAULT_BASE_TIME_SECONDS);
base_time = duration_conversor::convert(target_unit, std::chrono::seconds(base_time_seconds));
}
date_tiered_compaction_strategy_options() {
auto max_sstable_age_in_hours = int64_t(DEFAULT_MAX_SSTABLE_AGE_DAYS * 24);
max_sstable_age = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::hours(max_sstable_age_in_hours)).count();
base_time = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::seconds(DEFAULT_BASE_TIME_SECONDS)).count();
}
private:
static std::experimental::optional<sstring> get_value(const std::map<sstring, sstring>& options, const sstring& name) {
auto it = options.find(name);
if (it == options.end()) {
return std::experimental::nullopt;
}
return it->second;
}
friend class date_tiered_manifest;
};
class date_tiered_manifest {
static logging::logger logger;
// TODO: implement date_tiered_compaction_strategy_options.
db_clock::duration _max_sstable_age;
db_clock::duration _base_time;
date_tiered_compaction_strategy_options _options;
public:
date_tiered_manifest() = delete;
date_tiered_manifest(const std::map<sstring, sstring>& options) {
auto max_sstable_age_in_hours = int64_t(DEFAULT_MAX_SSTABLE_AGE_DAYS * 24);
_max_sstable_age = std::chrono::duration_cast<db_clock::duration>(std::chrono::hours(max_sstable_age_in_hours));
_base_time = std::chrono::duration_cast<db_clock::duration>(std::chrono::seconds(DEFAULT_BASE_TIME_SECONDS));
date_tiered_manifest(const std::map<sstring, sstring>& options)
: _options(options)
{
// FIXME: implement option to disable tombstone compaction.
#if 0
if (!options.containsKey(AbstractCompactionStrategy.TOMBSTONE_COMPACTION_INTERVAL_OPTION) && !options.containsKey(AbstractCompactionStrategy.TOMBSTONE_THRESHOLD_OPTION))
@@ -119,8 +188,8 @@ public:
for (auto& entry : *cf.get_sstables()) {
sstables.push_back(entry);
}
auto candidates = filter_old_sstables(sstables, _max_sstable_age, now);
auto buckets = get_buckets(create_sst_and_min_timestamp_pairs(candidates), _base_time, base, now);
auto candidates = filter_old_sstables(sstables, _options.max_sstable_age, now);
auto buckets = get_buckets(create_sst_and_min_timestamp_pairs(candidates), _options.base_time, base, now);
for (auto& bucket : buckets) {
if (bucket.size() >= size_t(cf.schema()->min_compaction_threshold())) {
@@ -161,11 +230,11 @@ private:
get_compaction_candidates(column_family& cf, std::vector<sstables::shared_sstable> candidate_sstables, int64_t now, int base) {
int min_threshold = cf.schema()->min_compaction_threshold();
int max_threshold = cf.schema()->max_compaction_threshold();
auto candidates = filter_old_sstables(candidate_sstables, _max_sstable_age, now);
auto candidates = filter_old_sstables(candidate_sstables, _options.max_sstable_age, now);
auto buckets = get_buckets(create_sst_and_min_timestamp_pairs(candidates), _base_time, base, now);
auto buckets = get_buckets(create_sst_and_min_timestamp_pairs(candidates), _options.base_time, base, now);
return newest_bucket(buckets, min_threshold, max_threshold, now, _base_time);
return newest_bucket(buckets, min_threshold, max_threshold, now, _options.base_time);
}
/**
@@ -186,12 +255,11 @@ private:
* @return a list of sstables with the oldest sstables excluded
*/
static std::vector<sstables::shared_sstable>
filter_old_sstables(std::vector<sstables::shared_sstable> sstables, db_clock::duration max_sstable_age, int64_t now) {
int64_t max_sstable_age_count = std::chrono::duration_cast<std::chrono::microseconds>(max_sstable_age).count();
if (max_sstable_age_count == 0) {
filter_old_sstables(std::vector<sstables::shared_sstable> sstables, api::timestamp_type max_sstable_age, int64_t now) {
if (max_sstable_age == 0) {
return sstables;
}
int64_t cutoff = now - max_sstable_age_count;
int64_t cutoff = now - max_sstable_age;
sstables.erase(std::remove_if(sstables.begin(), sstables.end(), [cutoff] (auto& sst) {
return sst->get_stats_metadata().max_timestamp < cutoff;
@@ -275,14 +343,14 @@ private:
* Each bucket is also a list of files ordered from newest to oldest.
*/
std::vector<std::vector<sstables::shared_sstable>>
get_buckets(std::vector<std::pair<sstables::shared_sstable,int64_t>>&& files, db_clock::duration time_unit, int base, int64_t now) const {
get_buckets(std::vector<std::pair<sstables::shared_sstable,int64_t>>&& files, api::timestamp_type time_unit, int base, int64_t now) const {
// Sort files by age. Newest first.
std::sort(files.begin(), files.end(), [] (auto& i, auto& j) {
return i.second > j.second;
});
std::vector<std::vector<sstables::shared_sstable>> buckets;
auto target = get_initial_target(now, std::chrono::duration_cast<std::chrono::microseconds>(time_unit).count());
auto target = get_initial_target(now, time_unit);
auto it = files.begin();
while (it != files.end()) {
@@ -329,12 +397,12 @@ private:
*/
std::vector<sstables::shared_sstable>
newest_bucket(std::vector<std::vector<sstables::shared_sstable>>& buckets, int min_threshold, int max_threshold,
int64_t now, db_clock::duration base_time) {
int64_t now, api::timestamp_type base_time) {
// If the "incoming window" has at least minThreshold SSTables, choose that one.
// For any other bucket, at least 2 SSTables is enough.
// In any case, limit to maxThreshold SSTables.
target incoming_window = get_initial_target(now, std::chrono::duration_cast<std::chrono::microseconds>(base_time).count());
target incoming_window = get_initial_target(now, base_time);
for (auto& bucket : buckets) {
auto min_timestamp = bucket.front()->get_stats_metadata().min_timestamp;
if (bucket.size() >= size_t(min_threshold) ||

@@ -747,9 +747,16 @@ public:
int64_t get_estimated_tasks() {
int64_t tasks = 0;
for (auto i = _generations.size() - 1; i >= 0; i--) {
for (int i = static_cast<int>(_generations.size()) - 1; i >= 0; i--) {
const auto& sstables = get_level(i);
tasks += std::max(0UL, get_total_bytes(sstables) - max_bytes_for_level(i)) / _max_sstable_size_in_bytes;
uint64_t total_bytes_for_this_level = get_total_bytes(sstables);
uint64_t max_bytes_for_this_level = max_bytes_for_level(i);
if (total_bytes_for_this_level < max_bytes_for_this_level) {
continue;
}
// Add to tasks an estimate of the number of sstables that take this level beyond its limit.
tasks += (total_bytes_for_this_level - max_bytes_for_this_level) / _max_sstable_size_in_bytes;
}
return tasks;
}
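The fix above addresses two unsigned-arithmetic pitfalls: `auto i = _generations.size() - 1` makes `i` a `size_t`, so `i >= 0` can never become false, and `std::max(0UL, a - b)` clamps a value that has already wrapped around when `a < b`. A sketch of the guarded subtraction (the `overflow_estimate` name is illustrative):

```cpp
#include <cstdint>

// With unsigned arithmetic, total - limit wraps around when
// total < limit, so guard before subtracting instead of clamping
// the already-wrapped result with std::max.
uint64_t overflow_estimate(uint64_t total, uint64_t limit, uint64_t sstable_size) {
    if (total < limit) {
        return 0;
    }
    return (total - limit) / sstable_size;
}
```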

@@ -155,10 +155,6 @@ struct column_stats {
/** histogram of tombstone drop time */
streaming_histogram tombstone_histogram;
/** max and min column names according to comparator */
std::vector<bytes> min_column_names;
std::vector<bytes> max_column_names;
bool has_legacy_counter_shards;
column_stats() :
@@ -211,8 +207,8 @@ private:
std::set<int> _ancestors;
streaming_histogram _estimated_tombstone_drop_time{TOMBSTONE_HISTOGRAM_BIN_SIZE};
int _sstable_level = 0;
std::vector<bytes> _min_column_names;
std::vector<bytes> _max_column_names;
std::vector<bytes_opt> _min_column_names;
std::vector<bytes_opt> _max_column_names;
bool _has_legacy_counter_shards = false;
/**
@@ -226,10 +222,14 @@ private:
/*
* Convert a vector of bytes into a disk array of disk_string<uint16_t>.
*/
static void convert(disk_array<uint32_t, disk_string<uint16_t>>&to, std::vector<bytes>&& from) {
to.elements.resize(from.size());
static void convert(disk_array<uint32_t, disk_string<uint16_t>>&to, std::vector<bytes_opt>&& from) {
for (auto i = 0U; i < from.size(); i++) {
to.elements[i].value = std::move(from[i]);
if (!from[i]) {
break;
}
disk_string<uint16_t> s;
s.value = std::move(from[i].value());
to.elements.push_back(std::move(s));
}
}
public:
@@ -286,31 +286,25 @@ public:
_sstable_level = sstable_level;
}
void update_min_column_names(std::vector<bytes>&& min_column_names) {
if (min_column_names.size() > 0) {
column_name_helper::merge_min_components(_min_column_names, std::move(min_column_names));
}
std::vector<bytes_opt>& min_column_names() {
return _min_column_names;
}
void update_max_column_names(std::vector<bytes>&& max_column_names) {
if (max_column_names.size() > 0) {
column_name_helper::merge_max_components(_max_column_names, std::move(max_column_names));
}
std::vector<bytes_opt>& max_column_names() {
return _max_column_names;
}
void update_has_legacy_counter_shards(bool has_legacy_counter_shards) {
_has_legacy_counter_shards = _has_legacy_counter_shards || has_legacy_counter_shards;
}
void update(column_stats&& stats) {
void update(const schema& s, column_stats&& stats) {
update_min_timestamp(stats.min_timestamp.get());
update_max_timestamp(stats.max_timestamp.get());
update_max_local_deletion_time(stats.max_local_deletion_time.get());
add_row_size(stats.row_size);
add_column_count(stats.column_count);
merge_tombstone_histogram(stats.tombstone_histogram);
update_min_column_names(std::move(stats.min_column_names));
update_max_column_names(std::move(stats.max_column_names));
update_has_legacy_counter_shards(stats.has_legacy_counter_shards);
}

@@ -28,6 +28,7 @@
#include "unimplemented.hh"
#include "utils/move.hh"
#include "dht/i_partitioner.hh"
#include <seastar/core/byteorder.hh>
namespace sstables {
@@ -107,10 +108,12 @@ private:
key_view _key;
const io_priority_class* _pc = nullptr;
query::clustering_key_filtering_context _ck_filtering;
query::clustering_key_filter _filter;
bool _in_current_ck_range = false;
query::clustering_row_ranges::const_iterator _current_ck_range;
query::clustering_row_ranges::const_iterator _ck_range_end;
bool _skip_partition;
bool _skip_clustering_row;
bool _skip_partition = false;
bool _skip_clustering_row = false;
// We don't have "end of clustering row" markers. So we know that the current
// row has ended once we get something (e.g. a live cell) that belongs to another
@@ -123,8 +126,9 @@ private:
mutation_fragment_opt _ready;
stdx::optional<new_mutation> _mutation;
bool _is_mutation_end;
bool _is_mutation_end = false;
public:
struct column {
bool is_static;
bytes_view col_name;
@@ -134,6 +138,7 @@ private:
bytes collection_extra_data;
bytes cell;
const column_definition *cdef;
bool is_present;
static constexpr size_t static_size = 2;
@@ -156,36 +161,32 @@ private:
throw malformed_sstable_exception(sprint("Found %d clustering elements in column name. Was not expecting that!", clustering.size()));
}
bool is_present(api::timestamp_type timestamp) {
return cdef && timestamp > cdef->dropped_at();
static bool check_static(const schema& schema, bytes_view col) {
return composite_view(col, schema.is_compound()).is_static();
}
static bool check_static(bytes_view col) {
static bytes static_row(static_size, 0xff);
return col.compare(0, static_size, static_row) == 0;
static bytes_view fix_static_name(const schema& schema, bytes_view col) {
return fix_static_name(col, check_static(schema, col));
}
static bytes_view fix_static_name(bytes_view col) {
if (check_static(col)) {
static bytes_view fix_static_name(bytes_view col, bool is_static) {
if(is_static) {
col.remove_prefix(static_size);
}
return col;
}
std::vector<bytes> extract_clustering_key(const schema& schema) {
if (!schema.is_compound()) {
return { to_bytes(col_name) };
} else {
return composite_view(col_name).explode();
}
return composite_view(col_name, schema.is_compound()).explode();
}
column(const schema& schema, bytes_view col)
: is_static(check_static(col))
, col_name(fix_static_name(col))
column(const schema& schema, bytes_view col, api::timestamp_type timestamp)
: is_static(check_static(schema, col))
, col_name(fix_static_name(col, is_static))
, clustering(extract_clustering_key(schema))
, collection_extra_data(is_collection(schema) ? pop_back(clustering) : bytes()) // collections are not supported with COMPACT STORAGE, so this is fine
, cell(!schema.is_dense() ? pop_back(clustering) : (*(schema.regular_begin())).name()) // dense: cell name is not provided. It is the only regular column
, cdef(schema.get_column_definition(cell))
, is_present(cdef && timestamp > cdef->dropped_at())
{
if (is_static) {
@@ -195,9 +196,15 @@ private:
}
}
}
if (is_present && is_static != cdef->is_static()) {
throw malformed_sstable_exception(seastar::format("Mismatch between {} cell and {} column definition",
is_static ? "static" : "non-static", cdef->is_static() ? "static" : "non-static"));
}
}
};
private:
// Notes for collection mutation:
//
// While we could in theory generate the mutation for the elements as they
@@ -266,6 +273,45 @@ private:
}
}
// We rely on the fact that the first 'S' in SSTables stands for 'sorted'
// and that the clustering row keys are always in ascending order.
bool is_in_range(const clustering_key_prefix& ck) {
// This is a wrong comparator to use here, but at the moment the correct
// one has a very serious disadvantage of not existing (see #1446).
clustering_key_prefix::prefix_equality_less_compare cmp(*_schema);
while (_current_ck_range != _ck_range_end) {
if (!_in_current_ck_range && _current_ck_range->start()) {
auto& start = *_current_ck_range->start();
if ((start.is_inclusive() && cmp(ck, start.value())) || (!start.is_inclusive() && !cmp(start.value(), ck))) {
return false;
}
}
// All subsequent clustering keys are larger than the start of this
// range so there is no need to check that again.
_in_current_ck_range = true;
if (!_current_ck_range->end()) {
return true;
}
auto& end = *_current_ck_range->end();
if ((!end.is_inclusive() && cmp(ck, end.value())) || (end.is_inclusive() && !cmp(end.value(), ck))) {
return true;
}
++_current_ck_range;
_in_current_ck_range = false;
}
return false;
}
void set_up_ck_ranges(const partition_key& pk) {
auto& range = _ck_filtering.get_ranges(pk);
_current_ck_range = range.begin();
_ck_range_end = range.end();
_in_current_ck_range = false;
}
public:
mutation_opt mut;
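The key idea in `is_in_range` above is that both the incoming clustering keys and the configured ranges are sorted, so a range whose end precedes the current key can be discarded permanently. A simplified sketch of that one-pass membership test over integer ranges (the `sorted_range_filter` type is illustrative):

```cpp
#include <cstddef>
#include <utility>
#include <vector>

// Membership test for ascending input against sorted, disjoint
// [start, end] ranges. Because queried keys arrive in ascending
// order, a range that ends before the current key can be skipped
// for good, mirroring how is_in_range() advances _current_ck_range.
class sorted_range_filter {
    std::vector<std::pair<int, int>> _ranges; // inclusive bounds, sorted
    size_t _cur = 0;
public:
    explicit sorted_range_filter(std::vector<std::pair<int, int>> r)
        : _ranges(std::move(r)) { }
    bool contains(int key) {
        while (_cur < _ranges.size()) {
            if (key < _ranges[_cur].first) {
                return false;      // before the current range starts
            }
            if (key <= _ranges[_cur].second) {
                return true;       // inside the current range
            }
            ++_cur;                // past this range; never look back
        }
        return false;
    }
};
```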
@@ -277,8 +323,9 @@ public:
, _key(key_view(key))
, _pc(&pc)
, _ck_filtering(ck_filtering)
, _filter(_ck_filtering.get_filter_for_sorted(partition_key::from_exploded(*_schema, key.explode(*_schema))))
{ }
{
set_up_ck_ranges(partition_key::from_exploded(*_schema, key.explode(*_schema)));
}
mp_row_consumer(const key& key,
const schema_ptr schema,
@@ -305,7 +352,7 @@ public:
_is_mutation_end = false;
_skip_partition = false;
_skip_clustering_row = false;
_filter = _ck_filtering.get_filter_for_sorted(_mutation->key);
set_up_ck_ranges(_mutation->key);
return proceed::no;
} else {
throw malformed_sstable_exception(sprint("Key mismatch. Got %s while processing %s", to_hex(bytes_view(key)).c_str(), to_hex(bytes_view(_key)).c_str()));
@@ -344,7 +391,7 @@ public:
flush();
}
if (!_in_progress) {
_skip_clustering_row = !is_static && !_filter(pos.key());
_skip_clustering_row = !is_static && !is_in_range(pos.key());
if (is_static) {
_in_progress = mutation_fragment(static_row());
} else {
@@ -384,7 +431,7 @@ public:
return proceed::yes;
}
struct column col(*_schema, col_name);
struct column col(*_schema, col_name, timestamp);
auto clustering_prefix = exploded_clustering_prefix(std::move(col.clustering));
auto ret = flush_if_needed(col.is_static, clustering_prefix);
@@ -398,7 +445,7 @@ public:
return ret;
}
if (!col.is_present(timestamp)) {
if (!col.is_present) {
return ret;
}
@@ -426,10 +473,11 @@ public:
return proceed::yes;
}
struct column col(*_schema, col_name);
auto timestamp = deltime.marked_for_delete_at;
struct column col(*_schema, col_name, timestamp);
gc_clock::duration secs(deltime.local_deletion_time);
return consume_deleted_cell(col, deltime.marked_for_delete_at, gc_clock::time_point(secs));
return consume_deleted_cell(col, timestamp, gc_clock::time_point(secs));
}
proceed consume_deleted_cell(column &col, int64_t timestamp, gc_clock::time_point ttl) {
@@ -444,7 +492,7 @@ public:
_in_progress->as_clustering_row().apply(rm);
return ret;
}
if (!col.is_present(timestamp)) {
if (!col.is_present) {
return ret;
}
@@ -510,7 +558,7 @@ public:
return proceed::yes;
}
auto start = composite_view(column::fix_static_name(start_col)).explode();
auto start = composite_view(column::fix_static_name(*_schema, start_col)).explode();
// Note how this is slightly different from the check in is_collection. Collection tombstones
// do not have extra data.
@@ -520,7 +568,7 @@ public:
if (start.size() <= _schema->clustering_key_size()) {
auto start_ck = clustering_key_prefix::from_exploded(std::move(start));
auto start_kind = start_marker_to_bound_kind(start_col);
auto end = clustering_key_prefix::from_exploded(composite_view(column::fix_static_name(end_col)).explode());
auto end = clustering_key_prefix::from_exploded(composite_view(column::fix_static_name(*_schema, end_col)).explode());
auto end_kind = end_marker_to_bound_kind(end_col);
if (range_tombstone::is_single_clustering_row_tombstone(*_schema, start_ck, start_kind, end, end_kind)) {
auto ret = flush_if_needed(std::move(start_ck));
@@ -555,8 +603,8 @@ public:
return *_pc;
}
bool is_mutation_end() const {
return _is_mutation_end;
bool get_and_reset_is_mutation_end() {
return std::exchange(_is_mutation_end, false);
}
stdx::optional<new_mutation> get_mutation() {
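`get_and_reset_is_mutation_end` above uses `std::exchange` to return the flag's old value while clearing it in a single expression. A minimal sketch of the idiom (the `end_flag` type is illustrative):

```cpp
#include <utility>

// std::exchange(obj, new_value) stores new_value in obj and returns
// the previous value, combining the read and the reset in one step.
struct end_flag {
    bool _set = false;
    void mark() { _set = true; }
    bool get_and_reset() { return std::exchange(_set, false); }
};
```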
@@ -576,47 +624,96 @@ public:
}
};
struct sstable_data_source {
shared_sstable _sst;
mp_row_consumer _consumer;
data_consume_context _context;
sstable_data_source(shared_sstable sst, mp_row_consumer&& consumer)
: _sst(std::move(sst))
, _consumer(std::move(consumer))
, _context(_sst->data_consume_rows(_consumer))
{ }
sstable_data_source(shared_sstable sst, mp_row_consumer&& consumer, sstable::disk_read_range toread)
: _sst(std::move(sst))
, _consumer(std::move(consumer))
, _context(_sst->data_consume_rows(_consumer, std::move(toread)))
{ }
sstable_data_source(schema_ptr s, shared_sstable sst, const sstables::key& k, const io_priority_class& pc,
query::clustering_key_filtering_context ck_filtering, sstable::disk_read_range toread)
: _sst(std::move(sst))
, _consumer(k, s, ck_filtering, pc)
, _context(_sst->data_consume_rows(_consumer, std::move(toread)))
{ }
};
class sstable_streamed_mutation : public streamed_mutation::impl {
data_consume_context& _context;
mp_row_consumer& _consumer;
lw_shared_ptr<sstable_data_source> _ds;
tombstone _t;
bool _finished = false;
range_tombstone_stream _range_tombstones;
mutation_fragment_opt _current_candidate;
mutation_fragment_opt _next_candidate;
stdx::optional<position_in_partition> _last_position;
position_in_partition::less_compare _cmp;
position_in_partition::equal_compare _eq;
private:
future<mutation_fragment_opt> read_next() {
future<stdx::optional<mutation_fragment_opt>> read_next() {
// Because of #1203 we may encounter sstables with range tombstones
// placed earlier than expected.
if (_next_candidate) {
auto mf = _range_tombstones.get_next(*_next_candidate);
if (_next_candidate || (_current_candidate && _finished)) {
assert(_current_candidate);
auto mf = _range_tombstones.get_next(*_current_candidate);
if (!mf) {
mf = move_and_disengage(_next_candidate);
mf = move_and_disengage(_current_candidate);
_current_candidate = move_and_disengage(_next_candidate);
}
return make_ready_future<mutation_fragment_opt>(std::move(mf));
return make_ready_future<stdx::optional<mutation_fragment_opt>>(std::move(mf));
}
if (_finished) {
return make_ready_future<mutation_fragment_opt>(_range_tombstones.get_next());
// No need to update _last_position here. We've already read everything from the sstable.
return make_ready_future<stdx::optional<mutation_fragment_opt>>(_range_tombstones.get_next());
}
return _context.read().then([this] {
if (_consumer.is_mutation_end()) {
_finished = true;
return _ds->_context.read().then([this] {
_finished = _ds->_consumer.get_and_reset_is_mutation_end();
auto mf = _ds->_consumer.get_mutation_fragment();
if (mf) {
if (mf->is_range_tombstone()) {
// If the sstable uses a promoted index, it will repeat the relevant range tombstones in
// each block. Do not emit these duplicates, as they would break the guarantee
// that mutation fragments are produced in ascending order.
if (!_last_position || !_cmp(*mf, *_last_position)) {
_last_position = mf->position();
_range_tombstones.apply(std::move(mf->as_range_tombstone()));
}
} else {
// mp_row_consumer may produce mutation_fragments in parts if they are
// interrupted by a range tombstone duplicate. Make sure they are merged
// before emitting them.
_last_position = mf->position();
if (!_current_candidate) {
_current_candidate = std::move(mf);
} else if (_current_candidate && _eq(*_current_candidate, *mf)) {
_current_candidate->apply(*_schema, std::move(*mf));
} else {
_next_candidate = std::move(mf);
}
}
}
auto mf = _consumer.get_mutation_fragment();
if (mf && mf->is_range_tombstone()) {
_range_tombstones.apply(std::move(mf->as_range_tombstone()));
} else {
_next_candidate = std::move(mf);
}
return read_next();
return stdx::optional<mutation_fragment_opt>();
});
}
public:
sstable_streamed_mutation(schema_ptr s, dht::decorated_key dk, data_consume_context& context, mp_row_consumer& consumer, tombstone t)
: streamed_mutation::impl(s, std::move(dk), t), _context(context), _consumer(consumer), _t(t), _range_tombstones(*s) { }
sstable_streamed_mutation(schema_ptr s, dht::decorated_key dk, tombstone t, lw_shared_ptr<sstable_data_source> ds)
: streamed_mutation::impl(s, std::move(dk), t), _ds(std::move(ds)), _t(t), _range_tombstones(*s), _cmp(*s), _eq(*s) { }
virtual future<> fill_buffer() final override {
return do_until([this] { return is_end_of_stream() || is_buffer_full(); }, [this] {
return read_next().then([this] (mutation_fragment_opt&& mfopt) {
return repeat_until_value([this] {
return read_next();
}).then([this] (mutation_fragment_opt&& mfopt) {
if (!mfopt) {
_end_of_stream = true;
} else {
@@ -625,38 +722,17 @@ public:
});
});
}
};
class sstable_single_streamed_mutation final : public sstable_streamed_mutation {
struct data_source {
mp_row_consumer _consumer;
data_consume_context _context;
data_source(schema_ptr s, sstable& sst, const sstables::key& k, const io_priority_class& pc,
query::clustering_key_filtering_context ck_filtering, uint64_t start, uint64_t end)
: _consumer(k, s, ck_filtering, pc)
, _context(sst.data_consume_rows(_consumer, start, end))
{
}
};
lw_shared_ptr<data_source> _data_source;
public:
sstable_single_streamed_mutation(schema_ptr s, dht::decorated_key dk, tombstone t, lw_shared_ptr<data_source> ds)
: sstable_streamed_mutation(std::move(s), std::move(dk), ds->_context, ds->_consumer, t)
, _data_source(ds)
{ }
static future<streamed_mutation> create(schema_ptr s, sstable& sst, const sstables::key& k,
static future<streamed_mutation> create(schema_ptr s, shared_sstable sst, const sstables::key& k,
query::clustering_key_filtering_context ck_filtering,
const io_priority_class& pc, uint64_t start, uint64_t end)
const io_priority_class& pc, sstable::disk_read_range toread)
{
auto ds = make_lw_shared<data_source>(s, sst, k, pc, ck_filtering, start, end);
auto ds = make_lw_shared<sstable_data_source>(s, sst, k, pc, ck_filtering, std::move(toread));
return ds->_context.read().then([s, ds] {
auto mut = ds->_consumer.get_mutation();
assert(mut);
auto dk = dht::global_partitioner().decorate_key(*s, std::move(mut->key));
return make_streamed_mutation<sstable_single_streamed_mutation>(s, std::move(dk), mut->tomb, ds);
return make_streamed_mutation<sstable_streamed_mutation>(s, std::move(dk), mut->tomb, ds);
});
}
};
@@ -704,37 +780,198 @@ sstables::sstable::read_row(schema_ptr schema,
if (!filter_has_key(key)) {
return make_ready_future<streamed_mutation_opt>();
}
return find_disk_ranges(schema, key, ck_filtering, pc).then([this, &key, ck_filtering, &pc, schema] (disk_read_range toread) {
if (!toread.found_row()) {
_filter_tracker.add_false_positive();
}
if (!toread) {
return make_ready_future<streamed_mutation_opt>();
}
_filter_tracker.add_true_positive();
return sstable_streamed_mutation::create(schema, this->shared_from_this(), key, ck_filtering, pc, std::move(toread)).then([] (auto sm) {
return streamed_mutation_opt(std::move(sm));
});
});
}
template <typename T>
static inline T read_be(const signed char* p) {
return ::read_be<T>(reinterpret_cast<const char*>(p));
}
template<typename T>
static inline T consume_be(bytes_view& p) {
T i = read_be<T>(p.data());
p.remove_prefix(sizeof(T));
return i;
}
static inline bytes_view consume_bytes(bytes_view& p, size_t len) {
auto ret = bytes_view(p.data(), len);
p.remove_prefix(len);
return ret;
}
static inline clustering_key_prefix get_clustering_key(
const schema& schema, bytes_view col_name) {
mp_row_consumer::column col(schema, std::move(col_name), api::max_timestamp);
return std::move(col.clustering);
}
static bool has_static_columns(const schema& schema, index_entry &ie) {
// We can easily check if there are any static columns in this partition,
// because the static columns always come first, so the first promoted
// index block will start with one, if there are any. The name of a static
// column is a composite beginning with a special marker (0xffff).
// But we can only assume the column name is composite if the schema is
// compound - if it isn't, we cannot have any static columns anyway.
//
// The first 18 bytes are deletion times (4+8), num blocks (4), and
// length of start column (2). Then come the actual column name bytes.
// See also composite::is_static().
auto data = ie.get_promoted_index_bytes();
return schema.is_compound() && data.size() >= 20 && data[18] == -1 && data[19] == -1;
}
future<sstable::disk_read_range>
sstables::sstable::find_disk_ranges(
schema_ptr schema, const sstables::key& key,
query::clustering_key_filtering_context ck_filtering,
const io_priority_class& pc) {
auto& partitioner = dht::global_partitioner();
auto token = partitioner.get_token(key_view(key));
auto& summary = _summary;
if (token < partitioner.get_token(key_view(summary.first_key.value))
|| token > partitioner.get_token(key_view(summary.last_key.value))) {
_filter_tracker.add_false_positive();
return make_ready_future<streamed_mutation_opt>();
if (token < partitioner.get_token(key_view(_summary.first_key.value))
|| token > partitioner.get_token(key_view(_summary.last_key.value))) {
return make_ready_future<disk_read_range>();
}
auto summary_idx = adjust_binary_search_index(binary_search(summary.entries, key, token));
auto summary_idx = adjust_binary_search_index(binary_search(_summary.entries, key, token));
if (summary_idx < 0) {
_filter_tracker.add_false_positive();
return make_ready_future<streamed_mutation_opt>();
return make_ready_future<disk_read_range>();
}
return read_indexes(summary_idx, pc).then([this, schema, ck_filtering, &key, token, summary_idx, &pc] (auto index_list) {
auto index_idx = this->binary_search(index_list, key, token);
if (index_idx < 0) {
_filter_tracker.add_false_positive();
return make_ready_future<streamed_mutation_opt>();
return make_ready_future<disk_read_range>();
}
_filter_tracker.add_true_positive();
index_entry& ie = index_list[index_idx];
if (ie.get_promoted_index_bytes().size() >= 16) {
auto&& pkey = partition_key::from_exploded(*schema, key.explode(*schema));
auto& ck_ranges = ck_filtering.get_ranges(pkey);
if (ck_ranges.size() == 1 && ck_ranges[0].is_full()) {
// When no clustering filter is given to sstable::read_row(),
// we get here with one range unbounded on both sides. This is
// fine (the code below will work with an unbounded range), but
// let's drop this range to revert to the classic behavior of
// reading the entire sstable row without using the promoted index.
} else if (ck_filtering.want_static_columns(pkey) && has_static_columns(*schema, ie)) {
// FIXME: If we need to read the static columns and also a
// non-full clustering key range, we need to return two byte
// ranges in the returned disk_read_range. We don't support
// this yet so for now let's fall back to reading the entire
// partition which is wasteful but at least correct.
// This case should be replaced by correctly adding the static
// column's blocks to the return.
} else if (ck_ranges.size() == 1) {
auto data = ie.get_promoted_index_bytes();
// note we already verified above that data.size >= 16
sstables::deletion_time deltime;
deltime.local_deletion_time = consume_be<uint32_t>(data);
deltime.marked_for_delete_at = consume_be<uint64_t>(data);
uint32_t num_blocks = consume_be<uint32_t>(data);
// We do a linear search on the promoted index. If we were to
// look in the same promoted index several times it might have
// made sense to build an array of key starts so we can do a
// binary search. We could do this once we have a key cache.
auto& range_start = ck_ranges[0].start();
bool found_range_start = false;
uint64_t range_start_pos;
auto& range_end = ck_ranges[0].end();
auto position = index_list[index_idx].position();
return this->data_end_position(summary_idx, index_idx, index_list, pc).then([&key, schema, ck_filtering, this, position, &pc] (uint64_t end) {
return sstable_single_streamed_mutation::create(schema, *this, key, ck_filtering, pc, position, end).then([] (auto sm) {
return streamed_mutation_opt(std::move(sm));
});
auto cmp = clustering_key_prefix::tri_compare(*schema);
while (num_blocks--) {
if (data.size() < 2) {
// When we break out of this loop, we give up on
// using the promoted index, and fall back to
// reading the entire partition.
// FIXME: this and all other "break" cases below,
// are errors. Log them (with rate limit) and count.
break;
}
uint16_t len = consume_be<uint16_t>(data);
if (data.size() < len) {
break;
}
// The promoted index contains ranges of full column
// names, which may include a clustering key and column.
// But we only need to match the clustering key, because
// we got a clustering key range to search for.
auto start_ck = get_clustering_key(*schema,
consume_bytes(data, len));
if (data.size() < 2) {
break;
}
len = consume_be<uint16_t>(data);
if (data.size() < len) {
break;
}
auto end_ck = get_clustering_key(*schema,
consume_bytes(data, len));
if (data.size() < 16) {
break;
}
uint64_t offset = consume_be<uint64_t>(data);
uint64_t width = consume_be<uint64_t>(data);
if (!found_range_start) {
if (!range_start || cmp(range_start->value(), end_ck) <= 0) {
range_start_pos = ie.position() + offset;
found_range_start = true;
}
}
bool found_range_end = false;
uint64_t range_end_pos;
if (range_end) {
if (cmp(range_end->value(), start_ck) < 0) {
// this block is already past the range_end
found_range_end = true;
range_end_pos = ie.position() + offset;
} else if (cmp(range_end->value(), end_ck) < 0 || num_blocks == 0) {
// range_end is in the middle of this block.
// Note the strict inequality above is important:
// if range_end==end_ck the next block may contain
// still more items matching range_end.
found_range_end = true;
range_end_pos = ie.position() + offset + width;
}
} else if (num_blocks == 0) {
// When !range_end, read until the last block.
// In this case we could have also found the end of
// the partition using the index.
found_range_end = true;
range_end_pos = ie.position() + offset + width;
}
if (found_range_end) {
if (!found_range_start) {
// return empty range
range_start_pos = range_end_pos = 0;
}
return make_ready_future<disk_read_range>(
disk_read_range(range_start_pos, range_end_pos,
key, deltime));
}
}
}
// Else, if more than one clustering-key range needs to be read,
// fall back to reading the entire partition.
// FIXME: support multiple ranges, and do not fall back to reading
// the entire partition.
}
// If we're still here there is no promoted index, or we had problems
// using it, so just just find the entire partition's range.
auto start = ie.position();
return this->data_end_position(summary_idx, index_idx, index_list, pc).then([start] (uint64_t end) {
return disk_read_range(start, end);
});
});
}
@@ -742,25 +979,31 @@ sstables::sstable::read_row(schema_ptr schema,
class mutation_reader::impl {
private:
schema_ptr _schema;
lw_shared_ptr<sstable_data_source> _ds;
// For some reason std::function requires functors to be copyable and that's
// why we cannot store mp_row_consumer in _get_data_source captured values.
// Instead we have this _consumer field here which is moved away by
// _get_data_source().
mp_row_consumer _consumer;
std::experimental::optional<data_consume_context> _context;
std::function<future<data_consume_context> ()> _get_context;
std::function<future<lw_shared_ptr<sstable_data_source>> ()> _get_data_source;
public:
impl(sstable& sst, schema_ptr schema, uint64_t start, uint64_t end,
impl(shared_sstable sst, schema_ptr schema, sstable::disk_read_range toread,
const io_priority_class &pc)
: _schema(schema)
, _consumer(schema, query::no_clustering_key_filtering, pc)
, _get_context([&sst, this, start, end] {
return make_ready_future<data_consume_context>(sst.data_consume_rows(_consumer, start, end));
, _get_data_source([this, sst = std::move(sst), toread] {
auto ds = make_lw_shared<sstable_data_source>(std::move(sst), std::move(_consumer), std::move(toread));
return make_ready_future<lw_shared_ptr<sstable_data_source>>(std::move(ds));
}) { }
impl(sstable& sst, schema_ptr schema,
impl(shared_sstable sst, schema_ptr schema,
const io_priority_class &pc)
: _schema(schema)
, _consumer(schema, query::no_clustering_key_filtering, pc)
, _get_context([this, &sst] {
return make_ready_future<data_consume_context>(sst.data_consume_rows(_consumer));
, _get_data_source([this, sst = std::move(sst)] {
auto ds = make_lw_shared<sstable_data_source>(std::move(sst), std::move(_consumer));
return make_ready_future<lw_shared_ptr<sstable_data_source>>(std::move(ds));
}) { }
impl(sstable& sst,
impl(shared_sstable sst,
schema_ptr schema,
std::function<future<uint64_t>()> start,
std::function<future<uint64_t>()> end,
@@ -768,48 +1011,49 @@ public:
const io_priority_class& pc)
: _schema(schema)
, _consumer(schema, ck_filtering, pc)
, _get_context([this, &sst, start = std::move(start), end = std::move(end)] () {
return start().then([this, &sst, end = std::move(end)] (uint64_t start) {
return end().then([this, &sst, start] (uint64_t end) {
return make_ready_future<data_consume_context>(sst.data_consume_rows(_consumer, start, end));
, _get_data_source([this, sst = std::move(sst), start = std::move(start), end = std::move(end)] () mutable {
return start().then([this, sst = std::move(sst), end = std::move(end)] (uint64_t start) mutable {
return end().then([this, sst = std::move(sst), start] (uint64_t end) mutable {
return make_lw_shared<sstable_data_source>(std::move(sst), std::move(_consumer), sstable::disk_read_range{start, end});
});
});
}) { }
impl() : _consumer(), _get_context() { }
impl() : _get_data_source() { }
// Reference to _consumer is passed to data_consume_rows() in the constructor so we must not allow move/copy
impl(impl&&) = delete;
impl(const impl&) = delete;
future<streamed_mutation_opt> read() {
if (!_get_context) {
if (!_get_data_source) {
// empty mutation reader returns EOF immediately
return make_ready_future<streamed_mutation_opt>();
}
if (_context) {
if (_ds) {
return do_read();
}
return (_get_context)().then([this] (data_consume_context context) {
_context = std::move(context);
return (_get_data_source)().then([this] (lw_shared_ptr<sstable_data_source> ds) {
_ds = std::move(ds);
return do_read();
});
}
private:
future<streamed_mutation_opt> do_read() {
return _context->read().then([this] {
auto mut = _consumer.get_mutation();
return _ds->_context.read().then([this] {
auto& consumer = _ds->_consumer;
auto mut = consumer.get_mutation();
if (!mut) {
if (_consumer.get_mutation_fragment()) {
if (consumer.get_mutation_fragment() || consumer.get_and_reset_is_mutation_end()) {
// We are still in the middle of the previous mutation.
_consumer.skip_partition();
consumer.skip_partition();
return do_read();
} else {
return make_ready_future<streamed_mutation_opt>();
}
}
auto dk = dht::global_partitioner().decorate_key(*_schema, std::move(mut->key));
auto sm = make_streamed_mutation<sstable_streamed_mutation>(_schema, std::move(dk), *_context, _consumer, mut->tomb);
auto sm = make_streamed_mutation<sstable_streamed_mutation>(_schema, std::move(dk), mut->tomb, _ds);
return make_ready_future<streamed_mutation_opt>(std::move(sm));
});
}
@@ -825,7 +1069,7 @@ future<streamed_mutation_opt> mutation_reader::read() {
}
mutation_reader sstable::read_rows(schema_ptr schema, const io_priority_class& pc) {
return std::make_unique<mutation_reader::impl>(*this, schema, pc);
return std::make_unique<mutation_reader::impl>(shared_from_this(), schema, pc);
}
// Less-comparator for lookups in the partition index.
@@ -938,7 +1182,7 @@ sstable::read_range_rows(schema_ptr schema,
};
return std::make_unique<mutation_reader::impl>(
*this, std::move(schema), std::move(start), std::move(end), ck_filtering, pc);
shared_from_this(), std::move(schema), std::move(start), std::move(end), ck_filtering, pc);
}


@@ -51,6 +51,7 @@ private:
RANGE_TOMBSTONE_3,
RANGE_TOMBSTONE_4,
RANGE_TOMBSTONE_5,
STOP_THEN_ATOM_START,
} _state = state::ROW_START;
row_consumer& _consumer;
@@ -62,6 +63,7 @@ private:
bool _deleted;
uint32_t _ttl, _expiration;
bool _read_partial_row = false;
public:
bool non_consuming() const {
@@ -69,6 +71,7 @@ public:
|| (_state == state::CELL_VALUE_BYTES_2)
|| (_state == state::ATOM_START_2)
|| (_state == state::ATOM_MASK_2)
|| (_state == state::STOP_THEN_ATOM_START)
|| (_state == state::EXPIRING_CELL_3)) && (_prestate == prestate::NONE));
}
@@ -319,6 +322,9 @@ public:
}
break;
}
case state::STOP_THEN_ATOM_START:
_state = state::ATOM_START;
return row_consumer::proceed::no;
default:
throw malformed_sstable_exception("unknown state");
}
@@ -327,12 +333,42 @@ public:
}
data_consume_rows_context(row_consumer& consumer,
input_stream<char> && input, uint64_t maxlen) :
continuous_data_consumer(std::move(input), maxlen)
, _consumer(consumer) {
input_stream<char> && input, uint64_t maxlen,
std::experimental::optional<sstable::disk_read_range::row_info> ri = {})
: continuous_data_consumer(std::move(input), maxlen)
, _consumer(consumer) {
// If the "ri" option is given, we are reading a partition from the
// middle (in the beginning of an atom), as would happen when we use
// the "promoted index" to skip closer to where a particular column
// starts. When we start in the middle of the partition, we will not
// read the key nor the tombstone from the disk, so the caller needs
// to provide them (the tombstone is provided in the promoted index
// exactly for that reason).
if (ri) {
_read_partial_row = true;
auto ret = _consumer.consume_row_start(ri->k, ri->deltime);
if (ret == row_consumer::proceed::yes) {
_state = state::ATOM_START;
} else {
// If we were asked to stop parsing after consuming the row
// start, we can't go to ATOM_START, need to use a new state
// which stops parsing, and continues at ATOM_START later.
_state = state::STOP_THEN_ATOM_START;
}
}
}
void verify_end_state() {
if (_read_partial_row) {
// If reading a partial row (i.e., when we have a clustering row
// filter and using a promoted index), we may be in ATOM_START
// state instead of ROW_START. In that case we did not read the
// end-of-row marker and consume_row_end() was never called.
if (_state == state::ATOM_START) {
_consumer.consume_row_end();
return;
}
}
if (_state != state::ROW_START || _prestate != prestate::NONE) {
throw malformed_sstable_exception("end of input, but not end of row");
}
@@ -346,15 +382,18 @@ public:
// memory in the same time (they are delivered to the consumer one by one).
class data_consume_context::impl {
private:
shared_sstable _sst;
std::unique_ptr<data_consume_rows_context> _ctx;
public:
impl(row_consumer& consumer,
input_stream<char>&& input, uint64_t maxlen) :
_ctx(new data_consume_rows_context(consumer, std::move(input), maxlen)) { }
impl(shared_sstable sst, row_consumer& consumer, input_stream<char>&& input, uint64_t maxlen,
std::experimental::optional<sstable::disk_read_range::row_info> ri)
: _sst(std::move(sst))
, _ctx(new data_consume_rows_context(consumer, std::move(input), maxlen, ri))
{ }
~impl() {
if (_ctx) {
auto f = _ctx->close();
f.handle_exception([ctx = std::move(_ctx)] (auto) { });
f.handle_exception([ctx = std::move(_ctx), sst = std::move(_sst)] (auto) { });
}
}
future<> read() {
@@ -376,18 +415,19 @@ future<> data_consume_context::read() {
}
data_consume_context sstable::data_consume_rows(
row_consumer& consumer, uint64_t start, uint64_t end) {
row_consumer& consumer, sstable::disk_read_range toread) {
// TODO: The second "end - start" below is redundant: The first one tells
// data_stream() to stop at the "end" byte, which allows optimal read-
// ahead and avoiding over-read at the end. The second one tells the
// consumer to stop at exactly the same place, and forces the consumer
// to maintain its own byte count.
return std::make_unique<data_consume_context::impl>(
consumer, data_stream(start, end - start, consumer.io_priority()), end - start);
return std::make_unique<data_consume_context::impl>(shared_from_this(),
consumer, data_stream(toread.start, toread.end - toread.start,
consumer.io_priority()), toread.end - toread.start, toread.ri);
}
data_consume_context sstable::data_consume_rows(row_consumer& consumer) {
return data_consume_rows(consumer, 0, data_size());
return data_consume_rows(consumer, {0, data_size()});
}
future<> sstable::data_consume_rows_at_once(row_consumer& consumer,


@@ -31,6 +31,7 @@
#include "core/do_with.hh"
#include "core/thread.hh"
#include <seastar/core/shared_future.hh>
#include <seastar/core/byteorder.hh>
#include <iterator>
#include "types.hh"
@@ -56,6 +57,7 @@
#include "checked-file-impl.hh"
#include "disk-error-handler.hh"
#include "service/storage_service.hh"
thread_local disk_error_signal_type sstable_read_error;
thread_local disk_error_signal_type sstable_write_error;
@@ -296,6 +298,12 @@ inline void write(file_writer& out, bytes_view s) {
out.write(reinterpret_cast<const char*>(s.data()), s.size()).get();
}
inline void write(file_writer& out, bytes_ostream s) {
for (bytes_view fragment : s) {
write(out, fragment);
}
}
// All composite parsers must come after this
template<typename First, typename... Rest>
future<> parse(random_access_reader& in, First& first, Rest&&... rest) {
@@ -1066,14 +1074,111 @@ future<> sstable::load() {
});
}
static void output_promoted_index_entry(bytes_ostream& promoted_index,
const bytes& first_col,
const bytes& last_col,
uint64_t offset, uint64_t width) {
char s[2];
write_be(s, uint16_t(first_col.size()));
promoted_index.write(s, 2);
promoted_index.write(first_col);
write_be(s, uint16_t(last_col.size()));
promoted_index.write(s, 2);
promoted_index.write(last_col);
char q[8];
write_be(q, uint64_t(offset));
promoted_index.write(q, 8);
write_be(q, uint64_t(width));
promoted_index.write(q, 8);
}
// FIXME: use this in write_column_name() instead of repeating the code
static bytes serialize_colname(const composite& clustering_key,
const std::vector<bytes_view>& column_names, composite::eoc marker) {
auto c = composite::from_exploded(column_names, marker);
auto ck_bview = bytes_view(clustering_key);
// The marker is not a component, so if the last component is empty (IOW,
// only serializes to the marker), then we just replace the key's last byte
// with the marker. If the component, however, is not empty, then the
// marker should be at the end of it, and we just join them together as we
// do for any normal component.
if (c.size() == 1) {
ck_bview.remove_suffix(1);
}
size_t sz = ck_bview.size() + c.size();
if (sz > std::numeric_limits<uint16_t>::max()) {
throw std::runtime_error(sprint("Column name too large (%d > %d)", sz, std::numeric_limits<uint16_t>::max()));
}
bytes colname(bytes::initialized_later(), sz);
std::copy(ck_bview.begin(), ck_bview.end(), colname.begin());
std::copy(c.get_bytes().begin(), c.get_bytes().end(), colname.begin() + ck_bview.size());
return colname;
}
// Call maybe_flush_pi_block() before writing the given sstable atom to the
// output. This may start a new promoted-index block depending on how much
// data we've already written since the start of the current block. Starting
// a new block involves both outputting the range of the old block to the
// index file, and outputting again the currently-open range tombstones to
// the data file.
// TODO: currently, maybe_flush_pi_block serializes the column name on every
// call, saving it in _pi_write.block_last_colname which we need for closing
// each block, as well as for closing the last block. We could instead save
// just the unprocessed arguments, and serialize them only when needed at the
// end of the block. For this we would need this function to take rvalue
// references (so data is moved in), and need not to use vector of byte_view
// (which might be gone later).
void sstable::maybe_flush_pi_block(file_writer& out,
const composite& clustering_key,
const std::vector<bytes_view>& column_names) {
bytes colname = serialize_colname(clustering_key, column_names, composite::eoc::none);
if (_pi_write.block_first_colname.empty()) {
// This is the first column in the partition, or first column since we
// closed a promoted-index block. Remember its name and position -
// we'll need to write it to the promoted index.
_pi_write.block_start_offset = out.offset();
_pi_write.block_next_start_offset = out.offset() + _pi_write.desired_block_size;
_pi_write.block_first_colname = colname;
_pi_write.block_last_colname = std::move(colname);
} else if (out.offset() >= _pi_write.block_next_start_offset) {
// If we wrote enough bytes to the partition since we output a sample
// to the promoted index, output one now and start a new one.
output_promoted_index_entry(_pi_write.data,
_pi_write.block_first_colname,
_pi_write.block_last_colname,
_pi_write.block_start_offset - _c_stats.start_offset,
out.offset() - _pi_write.block_start_offset);
_pi_write.numblocks++;
_pi_write.block_start_offset = out.offset();
// Because the new block can be read without the previous blocks, we
// need to repeat the range tombstones which are still open.
// Note that block_start_offset is before outputting those (so the new
// block includes them), but we set block_next_start_offset after - so
// even if we wrote a lot of open tombstones, we still get a full
// block size of new data.
if (!clustering_key.empty()) {
auto& rts = _pi_write.tombstone_accumulator->range_tombstones_for_row(
clustering_key_prefix(clustering_key.values()));
for (const auto& rt : rts) {
auto start = composite::from_clustering_element(*_pi_write.schemap, rt.start);
auto end = composite::from_clustering_element(*_pi_write.schemap, rt.end);
write_range_tombstone(out,
start, rt.start_kind, end, rt.end_kind, {}, rt.tomb);
}
}
_pi_write.block_next_start_offset = out.offset() + _pi_write.desired_block_size;
_pi_write.block_first_colname = colname;
_pi_write.block_last_colname = std::move(colname);
} else {
// Keep track of the last column in the partition - we'll need it to close
// the last block in the promoted index, unfortunately.
_pi_write.block_last_colname = std::move(colname);
}
}
// @clustering_key: it's expected that clustering key is already in its composite form.
// NOTE: empty clustering key means that there is no clustering key.
void sstable::write_column_name(file_writer& out, const composite& clustering_key, const std::vector<bytes_view>& column_names, composite::eoc marker) {
// FIXME: min_components and max_components also keep track of clustering
// prefix, so we must merge clustering_key and column_names somehow and
// pass the result to the functions below.
column_name_helper::min_max_components(_c_stats.min_column_names, _c_stats.max_column_names, column_names);
// was defined in the schema, for example.
auto c = composite::from_exploded(column_names, marker);
auto ck_bview = bytes_view(clustering_key);
@@ -1095,8 +1200,6 @@ void sstable::write_column_name(file_writer& out, const composite& clustering_ke
}
void sstable::write_column_name(file_writer& out, bytes_view column_names) {
column_name_helper::min_max_components(_c_stats.min_column_names, _c_stats.max_column_names, { column_names });
size_t sz = column_names.size();
if (sz > std::numeric_limits<uint16_t>::max()) {
throw std::runtime_error(sprint("Column name too large (%d > %d)", sz, std::numeric_limits<uint16_t>::max()));
@@ -1223,6 +1326,7 @@ void sstable::write_collection(file_writer& out, const composite& clustering_key
const bytes& column_name = cdef.name();
write_range_tombstone(out, clustering_key, clustering_key, { bytes_view(column_name) }, mview.tomb);
for (auto& cp: mview.cells) {
maybe_flush_pi_block(out, clustering_key, { column_name, cp.first });
write_column_name(out, clustering_key, { column_name, cp.first });
write_cell(out, cp.second);
}
@@ -1234,11 +1338,27 @@ void sstable::write_clustered_row(file_writer& out, const schema& schema, const
auto clustering_key = composite::from_clustering_element(schema, clustered_row.key());
if (schema.is_compound() && !schema.is_dense()) {
maybe_flush_pi_block(out, clustering_key, { bytes_view() });
write_row_marker(out, clustered_row.marker(), clustering_key);
}
// Before writing cells, range tombstone must be written if the row has any (deletable_row::t).
if (clustered_row.tomb()) {
maybe_flush_pi_block(out, clustering_key, {});
write_range_tombstone(out, clustering_key, clustering_key, {}, clustered_row.tomb());
// Because we currently may break a partition into promoted-index blocks
// in the middle of a clustered row, we also need to track the current
// row's tombstone - not just range tombstones - which may affect the
// beginning of a new block.
// TODO: consider starting a new block only between rows, so the
// following code can be dropped:
_pi_write.tombstone_accumulator->apply(range_tombstone(
clustered_row.key(), bound_kind::incl_start,
clustered_row.key(), bound_kind::incl_end, clustered_row.tomb()));
}
if (schema.clustering_key_size()) {
column_name_helper::min_max_components(schema, _collector.min_column_names(), _collector.max_column_names(),
clustered_row.key().components());
}
// Write all cells of a partition's row.
@@ -1256,14 +1376,18 @@ void sstable::write_clustered_row(file_writer& out, const schema& schema, const
if (schema.is_compound()) {
if (schema.is_dense()) {
maybe_flush_pi_block(out, composite(), { bytes_view(clustering_key) });
write_column_name(out, bytes_view(clustering_key));
} else {
maybe_flush_pi_block(out, clustering_key, { bytes_view(column_name) });
write_column_name(out, clustering_key, { bytes_view(column_name) });
}
} else {
if (schema.is_dense()) {
maybe_flush_pi_block(out, composite(), { bytes_view(clustered_row.key().get_component(schema, 0)) });
write_column_name(out, bytes_view(clustered_row.key().get_component(schema, 0)));
} else {
maybe_flush_pi_block(out, composite(), { bytes_view(column_name) });
write_column_name(out, bytes_view(column_name));
}
}
@@ -1282,16 +1406,25 @@ void sstable::write_static_row(file_writer& out, const schema& schema, const row
assert(column_definition.is_static());
atomic_cell_view cell = c.as_atomic_cell();
auto sp = composite::static_prefix(schema);
maybe_flush_pi_block(out, sp, { bytes_view(column_definition.name()) });
write_column_name(out, sp, { bytes_view(column_definition.name()) });
write_cell(out, cell);
});
}
static void write_index_entry(file_writer& out, disk_string_view<uint16_t>& key, uint64_t pos) {
// FIXME: support promoted indexes.
uint32_t promoted_index_size = 0;
static void write_index_header(file_writer& out, disk_string_view<uint16_t>& key, uint64_t pos) {
write(out, key, pos);
}
write(out, key, pos, promoted_index_size);
static void write_index_promoted(file_writer& out, bytes_ostream& promoted_index,
deletion_time deltime, uint32_t numblocks) {
uint32_t promoted_index_size = promoted_index.size();
if (promoted_index_size) {
promoted_index_size += 16 /* deltime + numblocks */;
write(out, promoted_index_size, deltime, numblocks, promoted_index);
} else {
write(out, promoted_index_size);
}
}
static void prepare_summary(summary& s, uint64_t expected_partition_count, uint32_t min_index_interval) {
@@ -1405,6 +1538,18 @@ file_writer components_writer::index_file_writer(sstable& sst, const io_priority
return file_writer(sst._index_file, std::move(options));
}
// Get the currently loaded configuration, or the default configuration in
// case none has been loaded (this happens, for example, in unit tests).
static const db::config& get_config() {
if (service::get_storage_service().local_is_initialized() &&
service::get_local_storage_service().db().local_is_initialized()) {
return service::get_local_storage_service().db().local().get_config();
} else {
static db::config default_config;
return default_config;
}
}
components_writer::components_writer(sstable& sst, const schema& s, file_writer& out,
uint64_t estimated_partitions, uint64_t max_sstable_size,
const io_priority_class& pc)
@@ -1413,8 +1558,10 @@ components_writer::components_writer(sstable& sst, const schema& s, file_writer&
, _out(out)
, _index(index_file_writer(sst, pc))
, _max_sstable_size(max_sstable_size)
, _tombstone_written(false)
{
_sst._filter = utils::i_filter::get_filter(estimated_partitions, _schema.bloom_filter_fp_chance());
_sst._pi_write.desired_block_size = get_config().column_index_size_in_kb() * 1024;
prepare_summary(_sst._summary, estimated_partitions, _schema.min_index_interval());
@@ -1435,7 +1582,17 @@ void components_writer::consume_new_partition(const dht::decorated_key& dk) {
p_key.value = bytes_view(*_partition_key);
// Write index file entry from partition key into index file.
write_index_entry(_index, p_key, _out.offset());
// Write an index entry minus the "promoted index" (sample of columns)
// part. We can only write that after processing the entire partition
// and collecting the sample of columns.
write_index_header(_index, p_key, _out.offset());
_sst._pi_write.data = {};
_sst._pi_write.numblocks = 0;
_sst._pi_write.deltime.local_deletion_time = std::numeric_limits<int32_t>::max();
_sst._pi_write.deltime.marked_for_delete_at = std::numeric_limits<int64_t>::min();
_sst._pi_write.block_start_offset = _out.offset();
_sst._pi_write.tombstone_accumulator = range_tombstone_accumulator(_schema, false);
_sst._pi_write.schemap = &_schema; // sadly we need this
// Write partition key into data file.
write(_out, p_key);
@@ -1461,6 +1618,8 @@ void components_writer::consume(tombstone t) {
}
write(_out, d);
_tombstone_written = true;
// TODO: need to verify we don't do this twice?
_sst._pi_write.deltime = d;
}
stop_iteration components_writer::consume(static_row&& sr) {
@@ -1477,13 +1636,35 @@ stop_iteration components_writer::consume(clustering_row&& cr) {
stop_iteration components_writer::consume(range_tombstone&& rt) {
ensure_tombstone_is_written();
// Remember the range tombstone so when we need to open a new promoted
// index block, we can figure out which ranges are still open and need
// to be repeated in the data file. Note that apply() also drops ranges
// already closed by rt.start, so the accumulator doesn't grow boundless.
_sst._pi_write.tombstone_accumulator->apply(rt);
auto start = composite::from_clustering_element(_schema, std::move(rt.start));
auto end = composite::from_clustering_element(_schema, std::move(rt.end));
_sst.maybe_flush_pi_block(_out, start, {});
_sst.write_range_tombstone(_out, std::move(start), rt.start_kind, std::move(end), rt.end_kind, {}, rt.tomb);
return stop_iteration::no;
}
stop_iteration components_writer::consume_end_of_partition() {
// If there is an incomplete block in the promoted index, write it too.
// However, if the _promoted_index is still empty, don't add a single
// chunk - better not output a promoted index at all in this case.
if (!_sst._pi_write.data.empty() && !_sst._pi_write.block_first_colname.empty()) {
output_promoted_index_entry(_sst._pi_write.data,
_sst._pi_write.block_first_colname,
_sst._pi_write.block_last_colname,
_sst._pi_write.block_start_offset - _sst._c_stats.start_offset,
_out.offset() - _sst._pi_write.block_start_offset);
_sst._pi_write.numblocks++;
}
write_index_promoted(_index, _sst._pi_write.data, _sst._pi_write.deltime,
_sst._pi_write.numblocks);
_sst._pi_write.data = {};
_sst._pi_write.block_first_colname = {};
ensure_tombstone_is_written();
int16_t end_of_row = 0;
write(_out, end_of_row);
@@ -1491,7 +1672,7 @@ stop_iteration components_writer::consume_end_of_partition() {
// compute size of the current row.
_sst._c_stats.row_size = _out.offset() - _sst._c_stats.start_offset;
// update is about merging column_stats with the data being stored by collector.
_sst._collector.update(std::move(_sst._c_stats));
_sst._collector.update(_schema, std::move(_sst._c_stats));
_sst._c_stats.reset();
if (!_first_key) {


@@ -113,7 +113,7 @@ class sstable_writer;
using index_list = std::vector<index_entry>;
class sstable {
class sstable : public enable_lw_shared_from_this<sstable> {
public:
enum class component_type {
Index,
@@ -155,6 +155,41 @@ public:
// object lives until then (e.g., using the do_with() idiom).
future<> data_consume_rows_at_once(row_consumer& consumer, uint64_t pos, uint64_t end);
// disk_read_range describes a byte ranges covering part of an sstable
// row that we need to read from disk. Usually this is the whole byte
// range covering a single sstable row, but in very large rows we might
// want to only read a subset of the atoms which we know contains the
// columns we are looking for. When the range to be read does NOT include
// the entire row, the caller needs to supply the optional "row_info"
// containing information about the entire row (key and deletion time)
// which is normally read from the beginning of the row.
struct disk_read_range {
// TODO: this should become a vector of ranges
uint64_t start;
uint64_t end;
// When the range above does not cover the beginning of the sstable
// row, we need to supply information which is only available at the
// beginning of the row - the row's key and its tombstone if any.
struct row_info {
key k;
deletion_time deltime;
};
std::experimental::optional<row_info> ri;
disk_read_range() : start(0), end(0) {}
disk_read_range(uint64_t start, uint64_t end) :
start(start), end(end) { }
disk_read_range(uint64_t start, uint64_t end, const key& key, const deletion_time& deltime) :
start(start), end(end), ri(row_info{key, deltime}) { }
explicit operator bool() const {
return start != end;
}
// found_row() is true if the row was found. This is not the same as
// operator bool(): found_row() can be true while the byte range is
// empty, because the promoted index ruled out anything to read (in
// this case "ri" was set).
bool found_row() const {
return start != end || ri;
}
};
// data_consume_rows() iterates over rows in the data file from
// a particular range, feeding them into the consumer. The iteration is
@@ -172,7 +207,7 @@ public:
// The caller must ensure (e.g., using do_with()) that the context object,
// as well as the sstable, remains alive as long as a read() is in
// progress (i.e., returned a future which hasn't completed yet).
data_consume_context data_consume_rows(row_consumer& consumer, uint64_t start, uint64_t end);
data_consume_context data_consume_rows(row_consumer& consumer, disk_read_range toread);
// Like data_consume_rows() with bounds, but iterates over whole range
data_consume_context data_consume_rows(row_consumer& consumer);
@@ -196,6 +231,11 @@ public:
return _generation;
}
// read_row() reads the entire sstable row (partition) at a given
// partition key k, or a subset of this row. The subset is defined by
// a filter on the clustering keys which we want to read, which
// additionally determines also if all the static columns will also be
// returned in the result.
future<streamed_mutation_opt> read_row(
schema_ptr schema,
const key& k,
@@ -372,6 +412,27 @@ private:
uint64_t _filter_file_size = 0;
uint64_t _bytes_on_disk = 0;
// _pi_write is used temporarily for building the promoted
// index (column sample) of one partition when writing a new sstable.
struct {
// Unfortunately we cannot output the promoted index directly to the
// index file because it needs to be prepended by its size.
bytes_ostream data;
uint32_t numblocks;
deletion_time deltime;
uint64_t block_start_offset;
uint64_t block_next_start_offset;
bytes block_first_colname;
bytes block_last_colname;
std::experimental::optional<range_tombstone_accumulator> tombstone_accumulator;
const schema* schemap;
size_t desired_block_size;
} _pi_write;
void maybe_flush_pi_block(file_writer& out,
const composite& clustering_key,
const std::vector<bytes_view>& column_names);
sstring _ks;
sstring _cf;
sstring _dir;
@@ -471,6 +532,19 @@ private:
// The ring_position doesn't have to survive deferring.
future<uint64_t> upper_bound(schema_ptr, const dht::ring_position&, const io_priority_class& pc);
// find_disk_ranges finds the ranges of bytes we need to read from the
// sstable to read the desired columns out of the given key. This range
// may be the entire byte range of the given partition - as found using
// the summary and index files - but if the index contains a "promoted
// index" (a sample of column positions for each key) it may be a smaller
// range. The returned range may contain columns beyond those requested
// in ck_filtering, so it is the reader's duty to use ck_filtering again
// when parsing the data read from the returned range.
future<disk_read_range> find_disk_ranges(schema_ptr schema,
const sstables::key& key,
query::clustering_key_filtering_context ck_filtering,
const io_priority_class& pc);
future<summary_entry&> read_summary_entry(size_t i);
// FIXME: pending on Bloom filter implementation


@@ -82,6 +82,10 @@ public:
return _position;
}
bytes_view get_promoted_index_bytes() const {
return to_bytes_view(_promoted_index);
}
index_entry(temporary_buffer<char>&& key, uint64_t position, temporary_buffer<char>&& promoted_index)
: _key(std::move(key)), _position(position), _promoted_index(std::move(promoted_index)) {}


@@ -21,6 +21,7 @@
#include <stack>
#include <boost/range/algorithm/heap_algorithm.hpp>
#include <seastar/util/defer.hh>
#include "mutation.hh"
#include "streamed_mutation.hh"
@@ -116,6 +117,16 @@ std::ostream& operator<<(std::ostream& os, const streamed_mutation& sm) {
return os;
}
std::ostream& operator<<(std::ostream& os, mutation_fragment::kind k)
{
switch (k) {
case mutation_fragment::kind::static_row: return os << "static row";
case mutation_fragment::kind::clustering_row: return os << "clustering row";
case mutation_fragment::kind::range_tombstone: return os << "range tombstone";
}
abort();
}
streamed_mutation streamed_mutation_from_mutation(mutation m)
{
class reader final : public streamed_mutation::impl {
@@ -129,16 +140,16 @@ streamed_mutation streamed_mutation_from_mutation(mutation m)
auto& crs = _mutation.partition().clustered_rows();
auto re = crs.unlink_leftmost_without_rebalance();
if (re) {
auto re_deleter = defer([re] { current_deleter<rows_entry>()(re); });
_cr = mutation_fragment(std::move(*re));
current_deleter<rows_entry>()(re);
}
}
void prepare_next_range_tombstone() {
auto& rts = _mutation.partition().row_tombstones().tombstones();
auto rt = rts.unlink_leftmost_without_rebalance();
if (rt) {
auto rt_deleter = defer([rt] { current_deleter<range_tombstone>()(rt); });
_rt = mutation_fragment(std::move(*rt));
current_deleter<range_tombstone>()(rt);
}
}
mutation_fragment_opt read_next() {
@@ -182,6 +193,27 @@ streamed_mutation streamed_mutation_from_mutation(mutation m)
do_fill_buffer();
}
~reader() {
// After unlink_leftmost_without_rebalance() was called on a bi::set
// we need to complete destroying the tree using that function.
// clear_and_dispose() used by mutation_partition destructor won't
// work properly.
auto& crs = _mutation.partition().clustered_rows();
auto re = crs.unlink_leftmost_without_rebalance();
while (re) {
current_deleter<rows_entry>()(re);
re = crs.unlink_leftmost_without_rebalance();
}
auto& rts = _mutation.partition().row_tombstones().tombstones();
auto rt = rts.unlink_leftmost_without_rebalance();
while (rt) {
current_deleter<range_tombstone>()(rt);
rt = rts.unlink_leftmost_without_rebalance();
}
}
virtual future<> fill_buffer() override {
do_fill_buffer();
return make_ready_future<>();
@@ -401,4 +433,5 @@ streamed_mutation reverse_streamed_mutation(streamed_mutation sm) {
};
return make_streamed_mutation<reversing_steamed_mutation>(std::move(sm));
};
};


@@ -249,6 +249,8 @@ public:
}
};
std::ostream& operator<<(std::ostream&, mutation_fragment::kind);
class position_in_partition {
int _bound_weight = 0;
stdx::optional<clustering_key_prefix> _ck;


@@ -43,7 +43,13 @@ SEASTAR_TEST_CASE(test_querying_with_limits) {
auto& db = e.local_db();
auto s = db.find_schema("ks", "cf");
std::vector<query::partition_range> pranges;
for (uint32_t i = 1; i <= 5; ++i) {
for (uint32_t i = 1; i <= 3; ++i) {
auto pkey = partition_key::from_single_value(*s, to_bytes(sprint("key%d", i)));
mutation m(pkey, s);
m.partition().apply(tombstone(api::timestamp_type(1), gc_clock::now()));
db.apply(s, freeze(m)).get();
}
for (uint32_t i = 3; i <= 8; ++i) {
auto pkey = partition_key::from_single_value(*s, to_bytes(sprint("key%d", i)));
mutation m(pkey, s);
m.set_clustered_cell(clustering_key_prefix::make_empty(), "v", data_value(bytes("v1")), 1);
@@ -51,9 +57,22 @@ SEASTAR_TEST_CASE(test_querying_with_limits) {
pranges.emplace_back(query::partition_range::make_singular(dht::global_partitioner().decorate_key(*s, std::move(pkey))));
}
auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), 3);
{
auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), 3);
auto result = db.query(s, cmd, query::result_request::only_result, pranges).get0();
assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(3);
}
{
auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(),
query::max_rows, gc_clock::now(), std::experimental::nullopt, 5);
auto result = db.query(s, cmd, query::result_request::only_result, pranges).get0();
assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(5);
}
{
auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(),
query::max_rows, gc_clock::now(), std::experimental::nullopt, 3);
auto result = db.query(s, cmd, query::result_request::only_result, pranges).get0();
assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(3);
}

Some files were not shown because too many files have changed in this diff.